field | value | date
---|---|---
author | Rémi Verschelde <rverschelde@gmail.com> | 2021-10-15 12:05:32 +0200
committer | Rémi Verschelde <rverschelde@gmail.com> | 2021-10-15 12:09:11 +0200
commit | ae74e78909ae0bc476112fb43b9580e969879dcd (patch) |
tree | 49144c84e18719a7ca54a243effc319ea128ab70 /thirdparty/libvpx/vpx_dsp |
parent | e2bfb27efb858c4a1314d314386531cbcdfcf335 (diff) |
Remove WebM support (and deps libvpx and opus)
We've had many issues with WebM support, and specifically with the libvpx library,
over the years, mostly due to its poor integration in Godot's buildsystem, and
nobody has really been interested in improving this state of affairs.
With the new GDExtension system in Godot 4.0, we intend to move video decoding to
first-party extensions, likely using something like libvlc to expose more codecs.
Removing the `webm` module means we can remove libsimplewebm, libvpx and
opus, which were only used for that purpose. Both libvpx and opus were
fairly complex pieces of the buildsystem, so this is a nice cleanup.
This also removes the compile-time dependency on `yasm`.
This fixes many reported issues with WebM compilation or playback, which will be
linked in the PR.
Diffstat (limited to 'thirdparty/libvpx/vpx_dsp')
62 files changed, 0 insertions, 31028 deletions
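For context on the deleted code below: the removed `intrapred_neon_asm` files are hand-written NEON versions of libvpx's intra predictors. As a rough orientation only (not part of this diff), here is a minimal plain-C sketch of the vertical, horizontal and TrueMotion predictors those routines accelerate; the real reference implementations live in libvpx's `vpx_dsp/intrapred.c`, and the block-size parameter `bs` here is a simplification of the per-size functions in libvpx.

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Vertical prediction: every row of the block is a copy of the "above" row.
 * This is what vpx_v_predictor_{4x4,8x8,16x16,32x32}_neon do with vld1/vst1. */
static void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                        const uint8_t *above, const uint8_t *left) {
  (void)left; /* unused for vertical prediction */
  for (int r = 0; r < bs; ++r) {
    memcpy(dst, above, bs);
    dst += stride;
  }
}

/* Horizontal prediction: every row is filled with that row's left neighbour
 * (the NEON code broadcasts it with vdup.8). */
static void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                        const uint8_t *above, const uint8_t *left) {
  (void)above; /* unused for horizontal prediction */
  for (int r = 0; r < bs; ++r) {
    memset(dst, left[r], bs);
    dst += stride;
  }
}

static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

/* TrueMotion prediction: dst[r][c] = clamp(left[r] + above[c] - above[-1]),
 * matching the ytop_left handling in vpx_tm_predictor_*_neon. */
static void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                         const uint8_t *above, const uint8_t *left) {
  const int ytop_left = above[-1];
  for (int r = 0; r < bs; ++r) {
    for (int c = 0; c < bs; ++c)
      dst[c] = clip_u8(left[r] + above[c] - ytop_left);
    dst += stride;
  }
}
```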
diff --git a/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/intrapred_neon_asm.asm b/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/intrapred_neon_asm.asm deleted file mode 100644 index b2846c410b..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/intrapred_neon_asm.asm +++ /dev/null @@ -1,643 +0,0 @@ -; This file was created from a .asm file -; using the ads2armasm_ms.pl script. -; -; Copyright (c) 2014 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vpx_v_predictor_4x4_neon| - EXPORT |vpx_v_predictor_8x8_neon| - EXPORT |vpx_v_predictor_16x16_neon| - EXPORT |vpx_v_predictor_32x32_neon| - EXPORT |vpx_h_predictor_4x4_neon| - EXPORT |vpx_h_predictor_8x8_neon| - EXPORT |vpx_h_predictor_16x16_neon| - EXPORT |vpx_h_predictor_32x32_neon| - EXPORT |vpx_tm_predictor_4x4_neon| - EXPORT |vpx_tm_predictor_8x8_neon| - EXPORT |vpx_tm_predictor_16x16_neon| - EXPORT |vpx_tm_predictor_32x32_neon| - - - - AREA |.text|, CODE, READONLY, ALIGN=2 - -;void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|vpx_v_predictor_4x4_neon| PROC - vld1.32 {d0[0]}, [r2] - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d0[0]}, [r0], r1 - bx lr - ENDP ; |vpx_v_predictor_4x4_neon| - ALIGN 4 - -;void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|vpx_v_predictor_8x8_neon| PROC - vld1.8 {d0}, [r2] - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - bx lr - ENDP ; |vpx_v_predictor_8x8_neon| - ALIGN 4 - -;void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|vpx_v_predictor_16x16_neon| PROC - vld1.8 {q0}, [r2] - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - bx lr - ENDP ; |vpx_v_predictor_16x16_neon| - ALIGN 4 - -;void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|vpx_v_predictor_32x32_neon| PROC - vld1.8 {q0, q1}, [r2] - mov r2, #2 -loop_v - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - 
vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - subs r2, r2, #1 - bgt loop_v - bx lr - ENDP ; |vpx_v_predictor_32x32_neon| - ALIGN 4 - -;void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|vpx_h_predictor_4x4_neon| PROC - vld1.32 {d1[0]}, [r3] - vdup.8 d0, d1[0] - vst1.32 {d0[0]}, [r0], r1 - vdup.8 d0, d1[1] - vst1.32 {d0[0]}, [r0], r1 - vdup.8 d0, d1[2] - vst1.32 {d0[0]}, [r0], r1 - vdup.8 d0, d1[3] - vst1.32 {d0[0]}, [r0], r1 - bx lr - ENDP ; |vpx_h_predictor_4x4_neon| - ALIGN 4 - -;void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|vpx_h_predictor_8x8_neon| PROC - vld1.64 {d1}, [r3] - vdup.8 d0, d1[0] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[1] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[2] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[3] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[4] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[5] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[6] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[7] - vst1.64 {d0}, [r0], r1 - bx lr - ENDP ; |vpx_h_predictor_8x8_neon| - ALIGN 4 - -;void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|vpx_h_predictor_16x16_neon| PROC - vld1.8 {q1}, [r3] - vdup.8 q0, d2[0] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[1] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[2] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[3] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[4] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[5] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[6] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[7] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[0] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[1] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[2] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[3] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[4] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[5] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[6] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[7] - vst1.8 {q0}, [r0], r1 - bx lr - ENDP ; |vpx_h_predictor_16x16_neon| - ALIGN 4 - -;void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|vpx_h_predictor_32x32_neon| PROC - sub r1, r1, #16 - mov r2, #2 -loop_h - vld1.8 {q1}, [r3]! - vdup.8 q0, d2[0] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[1] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[2] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[3] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[4] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[5] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[6] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[7] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[0] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[1] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[2] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[3] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[4] - vst1.8 {q0}, [r0]! 
- vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[5] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[6] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[7] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - subs r2, r2, #1 - bgt loop_h - bx lr - ENDP ; |vpx_h_predictor_32x32_neon| - ALIGN 4 - -;void vpx_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|vpx_tm_predictor_4x4_neon| PROC - ; Load ytop_left = above[-1]; - sub r12, r2, #1 - vld1.u8 {d0[]}, [r12] - - ; Load above 4 pixels - vld1.32 {d2[0]}, [r2] - - ; Compute above - ytop_left - vsubl.u8 q3, d2, d0 - - ; Load left row by row and compute left + (above - ytop_left) - ; 1st row and 2nd row - vld1.u8 {d2[]}, [r3]! - vld1.u8 {d4[]}, [r3]! - vmovl.u8 q1, d2 - vmovl.u8 q2, d4 - vadd.s16 q1, q1, q3 - vadd.s16 q2, q2, q3 - vqmovun.s16 d0, q1 - vqmovun.s16 d1, q2 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d1[0]}, [r0], r1 - - ; 3rd row and 4th row - vld1.u8 {d2[]}, [r3]! - vld1.u8 {d4[]}, [r3] - vmovl.u8 q1, d2 - vmovl.u8 q2, d4 - vadd.s16 q1, q1, q3 - vadd.s16 q2, q2, q3 - vqmovun.s16 d0, q1 - vqmovun.s16 d1, q2 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d1[0]}, [r0], r1 - bx lr - ENDP ; |vpx_tm_predictor_4x4_neon| - ALIGN 4 - -;void vpx_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|vpx_tm_predictor_8x8_neon| PROC - ; Load ytop_left = above[-1]; - sub r12, r2, #1 - vld1.8 {d0[]}, [r12] - - ; preload 8 left - vld1.8 {d30}, [r3] - - ; Load above 8 pixels - vld1.64 {d2}, [r2] - - vmovl.u8 q10, d30 - - ; Compute above - ytop_left - vsubl.u8 q3, d2, d0 - - ; Load left row by row and compute left + (above - ytop_left) - ; 1st row and 2nd row - vdup.16 q0, d20[0] - vdup.16 q1, d20[1] - vadd.s16 q0, q3, q0 - vadd.s16 q1, q3, q1 - - ; 3rd row and 4th row - vdup.16 q8, d20[2] - vdup.16 q9, d20[3] - vadd.s16 q8, q3, q8 - vadd.s16 q9, q3, q9 - - vqmovun.s16 d0, q0 - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q8 - vqmovun.s16 d3, q9 - - vst1.64 {d0}, [r0], r1 - vst1.64 {d1}, [r0], r1 - vst1.64 {d2}, [r0], r1 - vst1.64 {d3}, [r0], r1 - - ; 5th row and 6th row - vdup.16 q0, d21[0] - vdup.16 q1, d21[1] - vadd.s16 q0, q3, q0 - vadd.s16 q1, q3, q1 - - ; 7th row and 8th row - vdup.16 q8, d21[2] - vdup.16 q9, d21[3] - vadd.s16 q8, q3, q8 - vadd.s16 q9, q3, q9 - - vqmovun.s16 d0, q0 - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q8 - vqmovun.s16 d3, q9 - - vst1.64 {d0}, [r0], r1 - vst1.64 {d1}, [r0], r1 - vst1.64 {d2}, [r0], r1 - vst1.64 {d3}, [r0], r1 - - bx lr - ENDP ; |vpx_tm_predictor_8x8_neon| - ALIGN 4 - -;void vpx_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|vpx_tm_predictor_16x16_neon| PROC - ; Load ytop_left = above[-1]; - sub r12, r2, #1 - vld1.8 {d0[]}, [r12] - - ; Load above 8 pixels - vld1.8 {q1}, [r2] - - ; preload 8 left into r12 - vld1.8 {d18}, [r3]! - - ; Compute above - ytop_left - vsubl.u8 q2, d2, d0 - vsubl.u8 q3, d3, d0 - - vmovl.u8 q10, d18 - - ; Load left row by row and compute left + (above - ytop_left) - ; Process 8 rows in each single loop and loop 2 times to process 16 rows. - mov r2, #2 - -loop_16x16_neon - ; Process two rows. 
- vdup.16 q0, d20[0] - vdup.16 q8, d20[1] - vadd.s16 q1, q0, q2 - vadd.s16 q0, q0, q3 - vadd.s16 q11, q8, q2 - vadd.s16 q8, q8, q3 - vqmovun.s16 d2, q1 - vqmovun.s16 d3, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q8 - vdup.16 q0, d20[2] ; proload next 2 rows data - vdup.16 q8, d20[3] - vst1.64 {d2,d3}, [r0], r1 - vst1.64 {d22,d23}, [r0], r1 - - ; Process two rows. - vadd.s16 q1, q0, q2 - vadd.s16 q0, q0, q3 - vadd.s16 q11, q8, q2 - vadd.s16 q8, q8, q3 - vqmovun.s16 d2, q1 - vqmovun.s16 d3, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q8 - vdup.16 q0, d21[0] ; proload next 2 rows data - vdup.16 q8, d21[1] - vst1.64 {d2,d3}, [r0], r1 - vst1.64 {d22,d23}, [r0], r1 - - vadd.s16 q1, q0, q2 - vadd.s16 q0, q0, q3 - vadd.s16 q11, q8, q2 - vadd.s16 q8, q8, q3 - vqmovun.s16 d2, q1 - vqmovun.s16 d3, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q8 - vdup.16 q0, d21[2] ; proload next 2 rows data - vdup.16 q8, d21[3] - vst1.64 {d2,d3}, [r0], r1 - vst1.64 {d22,d23}, [r0], r1 - - - vadd.s16 q1, q0, q2 - vadd.s16 q0, q0, q3 - vadd.s16 q11, q8, q2 - vadd.s16 q8, q8, q3 - vqmovun.s16 d2, q1 - vqmovun.s16 d3, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q8 - vld1.8 {d18}, [r3]! ; preload 8 left into r12 - vmovl.u8 q10, d18 - vst1.64 {d2,d3}, [r0], r1 - vst1.64 {d22,d23}, [r0], r1 - - subs r2, r2, #1 - bgt loop_16x16_neon - - bx lr - ENDP ; |vpx_tm_predictor_16x16_neon| - ALIGN 4 - -;void vpx_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|vpx_tm_predictor_32x32_neon| PROC - ; Load ytop_left = above[-1]; - sub r12, r2, #1 - vld1.8 {d0[]}, [r12] - - ; Load above 32 pixels - vld1.8 {q1}, [r2]! - vld1.8 {q2}, [r2] - - ; preload 8 left pixels - vld1.8 {d26}, [r3]! - - ; Compute above - ytop_left - vsubl.u8 q8, d2, d0 - vsubl.u8 q9, d3, d0 - vsubl.u8 q10, d4, d0 - vsubl.u8 q11, d5, d0 - - vmovl.u8 q3, d26 - - ; Load left row by row and compute left + (above - ytop_left) - ; Process 8 rows in each single loop and loop 4 times to process 32 rows. - mov r2, #4 - -loop_32x32_neon - ; Process two rows. - vdup.16 q0, d6[0] - vdup.16 q2, d6[1] - vadd.s16 q12, q0, q8 - vadd.s16 q13, q0, q9 - vadd.s16 q14, q0, q10 - vadd.s16 q15, q0, q11 - vqmovun.s16 d0, q12 - vqmovun.s16 d1, q13 - vadd.s16 q12, q2, q8 - vadd.s16 q13, q2, q9 - vqmovun.s16 d2, q14 - vqmovun.s16 d3, q15 - vadd.s16 q14, q2, q10 - vadd.s16 q15, q2, q11 - vst1.64 {d0-d3}, [r0], r1 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vqmovun.s16 d26, q14 - vqmovun.s16 d27, q15 - vdup.16 q1, d6[2] - vdup.16 q2, d6[3] - vst1.64 {d24-d27}, [r0], r1 - - ; Process two rows. - vadd.s16 q12, q1, q8 - vadd.s16 q13, q1, q9 - vadd.s16 q14, q1, q10 - vadd.s16 q15, q1, q11 - vqmovun.s16 d0, q12 - vqmovun.s16 d1, q13 - vadd.s16 q12, q2, q8 - vadd.s16 q13, q2, q9 - vqmovun.s16 d2, q14 - vqmovun.s16 d3, q15 - vadd.s16 q14, q2, q10 - vadd.s16 q15, q2, q11 - vst1.64 {d0-d3}, [r0], r1 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vqmovun.s16 d26, q14 - vqmovun.s16 d27, q15 - vdup.16 q0, d7[0] - vdup.16 q2, d7[1] - vst1.64 {d24-d27}, [r0], r1 - - ; Process two rows. 
- vadd.s16 q12, q0, q8 - vadd.s16 q13, q0, q9 - vadd.s16 q14, q0, q10 - vadd.s16 q15, q0, q11 - vqmovun.s16 d0, q12 - vqmovun.s16 d1, q13 - vadd.s16 q12, q2, q8 - vadd.s16 q13, q2, q9 - vqmovun.s16 d2, q14 - vqmovun.s16 d3, q15 - vadd.s16 q14, q2, q10 - vadd.s16 q15, q2, q11 - vst1.64 {d0-d3}, [r0], r1 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vqmovun.s16 d26, q14 - vqmovun.s16 d27, q15 - vdup.16 q0, d7[2] - vdup.16 q2, d7[3] - vst1.64 {d24-d27}, [r0], r1 - - ; Process two rows. - vadd.s16 q12, q0, q8 - vadd.s16 q13, q0, q9 - vadd.s16 q14, q0, q10 - vadd.s16 q15, q0, q11 - vqmovun.s16 d0, q12 - vqmovun.s16 d1, q13 - vadd.s16 q12, q2, q8 - vadd.s16 q13, q2, q9 - vqmovun.s16 d2, q14 - vqmovun.s16 d3, q15 - vadd.s16 q14, q2, q10 - vadd.s16 q15, q2, q11 - vst1.64 {d0-d3}, [r0], r1 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vld1.8 {d0}, [r3]! ; preload 8 left pixels - vqmovun.s16 d26, q14 - vqmovun.s16 d27, q15 - vmovl.u8 q3, d0 - vst1.64 {d24-d27}, [r0], r1 - - subs r2, r2, #1 - bgt loop_32x32_neon - - bx lr - ENDP ; |vpx_tm_predictor_32x32_neon| - ALIGN 4 - - END diff --git a/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/loopfilter_mb_neon.asm b/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/loopfilter_mb_neon.asm deleted file mode 100644 index 9c3736faf8..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/loopfilter_mb_neon.asm +++ /dev/null @@ -1,641 +0,0 @@ -; This file was created from a .asm file -; using the ads2armasm_ms.pl script. -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vpx_lpf_horizontal_edge_8_neon| - EXPORT |vpx_lpf_horizontal_edge_16_neon| - EXPORT |vpx_lpf_vertical_16_neon| - - AREA |.text|, CODE, READONLY, ALIGN=2 - -; void mb_lpf_horizontal_edge(uint8_t *s, int p, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh, -; int count) -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -; r12 int count -|mb_lpf_horizontal_edge| PROC - push {r4-r8, lr} - vpush {d8-d15} - ldr r4, [sp, #88] ; load thresh - -h_count - vld1.8 {d16[]}, [r2] ; load *blimit - vld1.8 {d17[]}, [r3] ; load *limit - vld1.8 {d18[]}, [r4] ; load *thresh - - sub r8, r0, r1, lsl #3 ; move src pointer down by 8 lines - - vld1.u8 {d0}, [r8@64], r1 ; p7 - vld1.u8 {d1}, [r8@64], r1 ; p6 - vld1.u8 {d2}, [r8@64], r1 ; p5 - vld1.u8 {d3}, [r8@64], r1 ; p4 - vld1.u8 {d4}, [r8@64], r1 ; p3 - vld1.u8 {d5}, [r8@64], r1 ; p2 - vld1.u8 {d6}, [r8@64], r1 ; p1 - vld1.u8 {d7}, [r8@64], r1 ; p0 - vld1.u8 {d8}, [r8@64], r1 ; q0 - vld1.u8 {d9}, [r8@64], r1 ; q1 - vld1.u8 {d10}, [r8@64], r1 ; q2 - vld1.u8 {d11}, [r8@64], r1 ; q3 - vld1.u8 {d12}, [r8@64], r1 ; q4 - vld1.u8 {d13}, [r8@64], r1 ; q5 - vld1.u8 {d14}, [r8@64], r1 ; q6 - vld1.u8 {d15}, [r8@64], r1 ; q7 - - bl vpx_wide_mbfilter_neon - - tst r7, #1 - beq h_mbfilter - - ; flat && mask were not set for any of the channels. Just store the values - ; from filter. 
- sub r8, r0, r1, lsl #1 - - vst1.u8 {d25}, [r8@64], r1 ; store op1 - vst1.u8 {d24}, [r8@64], r1 ; store op0 - vst1.u8 {d23}, [r8@64], r1 ; store oq0 - vst1.u8 {d26}, [r8@64], r1 ; store oq1 - - b h_next - -h_mbfilter - tst r7, #2 - beq h_wide_mbfilter - - ; flat2 was not set for any of the channels. Just store the values from - ; mbfilter. - sub r8, r0, r1, lsl #1 - sub r8, r8, r1 - - vst1.u8 {d18}, [r8@64], r1 ; store op2 - vst1.u8 {d19}, [r8@64], r1 ; store op1 - vst1.u8 {d20}, [r8@64], r1 ; store op0 - vst1.u8 {d21}, [r8@64], r1 ; store oq0 - vst1.u8 {d22}, [r8@64], r1 ; store oq1 - vst1.u8 {d23}, [r8@64], r1 ; store oq2 - - b h_next - -h_wide_mbfilter - sub r8, r0, r1, lsl #3 - add r8, r8, r1 - - vst1.u8 {d16}, [r8@64], r1 ; store op6 - vst1.u8 {d24}, [r8@64], r1 ; store op5 - vst1.u8 {d25}, [r8@64], r1 ; store op4 - vst1.u8 {d26}, [r8@64], r1 ; store op3 - vst1.u8 {d27}, [r8@64], r1 ; store op2 - vst1.u8 {d18}, [r8@64], r1 ; store op1 - vst1.u8 {d19}, [r8@64], r1 ; store op0 - vst1.u8 {d20}, [r8@64], r1 ; store oq0 - vst1.u8 {d21}, [r8@64], r1 ; store oq1 - vst1.u8 {d22}, [r8@64], r1 ; store oq2 - vst1.u8 {d23}, [r8@64], r1 ; store oq3 - vst1.u8 {d1}, [r8@64], r1 ; store oq4 - vst1.u8 {d2}, [r8@64], r1 ; store oq5 - vst1.u8 {d3}, [r8@64], r1 ; store oq6 - -h_next - add r0, r0, #8 - subs r12, r12, #1 - bne h_count - - vpop {d8-d15} - pop {r4-r8, pc} - - ENDP ; |mb_lpf_horizontal_edge| - ALIGN 4 - -; void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; r0 uint8_t *s, -; r1 int pitch, -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh -|vpx_lpf_horizontal_edge_8_neon| PROC - mov r12, #1 - b mb_lpf_horizontal_edge - ENDP ; |vpx_lpf_horizontal_edge_8_neon| - ALIGN 4 - -; void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; r0 uint8_t *s, -; r1 int pitch, -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh -|vpx_lpf_horizontal_edge_16_neon| PROC - mov r12, #2 - b mb_lpf_horizontal_edge - ENDP ; |vpx_lpf_horizontal_edge_16_neon| - ALIGN 4 - -; void vpx_lpf_vertical_16_neon(uint8_t *s, int p, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -|vpx_lpf_vertical_16_neon| PROC - push {r4-r8, lr} - vpush {d8-d15} - ldr r4, [sp, #88] ; load thresh - - vld1.8 {d16[]}, [r2] ; load *blimit - vld1.8 {d17[]}, [r3] ; load *limit - vld1.8 {d18[]}, [r4] ; load *thresh - - sub r8, r0, #8 - - vld1.8 {d0}, [r8@64], r1 - vld1.8 {d8}, [r0@64], r1 - vld1.8 {d1}, [r8@64], r1 - vld1.8 {d9}, [r0@64], r1 - vld1.8 {d2}, [r8@64], r1 - vld1.8 {d10}, [r0@64], r1 - vld1.8 {d3}, [r8@64], r1 - vld1.8 {d11}, [r0@64], r1 - vld1.8 {d4}, [r8@64], r1 - vld1.8 {d12}, [r0@64], r1 - vld1.8 {d5}, [r8@64], r1 - vld1.8 {d13}, [r0@64], r1 - vld1.8 {d6}, [r8@64], r1 - vld1.8 {d14}, [r0@64], r1 - vld1.8 {d7}, [r8@64], r1 - vld1.8 {d15}, [r0@64], r1 - - sub r0, r0, r1, lsl #3 - - vtrn.32 q0, q2 - vtrn.32 q1, q3 - vtrn.32 q4, q6 - vtrn.32 q5, q7 - - vtrn.16 q0, q1 - vtrn.16 q2, q3 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - - vtrn.8 d0, d1 - vtrn.8 d2, d3 - vtrn.8 d4, d5 - vtrn.8 d6, d7 - - vtrn.8 d8, d9 - vtrn.8 d10, d11 - vtrn.8 d12, d13 - vtrn.8 d14, d15 - - bl vpx_wide_mbfilter_neon - - tst r7, #1 - beq v_mbfilter - - ; flat && mask were not set for 
any of the channels. Just store the values - ; from filter. - sub r8, r0, #2 - - vswp d23, d25 - - vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r8], r1 - vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r8], r1 - vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r8], r1 - vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r8], r1 - vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r8], r1 - vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r8], r1 - vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r8], r1 - vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r8], r1 - - b v_end - -v_mbfilter - tst r7, #2 - beq v_wide_mbfilter - - ; flat2 was not set for any of the channels. Just store the values from - ; mbfilter. - sub r8, r0, #3 - - vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1 - vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1 - vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1 - vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1 - vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1 - vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1 - vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1 - vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1 - vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1 - vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1 - vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1 - vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1 - vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1 - vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1 - vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1 - vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1 - - b v_end - -v_wide_mbfilter - sub r8, r0, #8 - - vtrn.32 d0, d26 - vtrn.32 d16, d27 - vtrn.32 d24, d18 - vtrn.32 d25, d19 - - vtrn.16 d0, d24 - vtrn.16 d16, d25 - vtrn.16 d26, d18 - vtrn.16 d27, d19 - - vtrn.8 d0, d16 - vtrn.8 d24, d25 - vtrn.8 d26, d27 - vtrn.8 d18, d19 - - vtrn.32 d20, d1 - vtrn.32 d21, d2 - vtrn.32 d22, d3 - vtrn.32 d23, d15 - - vtrn.16 d20, d22 - vtrn.16 d21, d23 - vtrn.16 d1, d3 - vtrn.16 d2, d15 - - vtrn.8 d20, d21 - vtrn.8 d22, d23 - vtrn.8 d1, d2 - vtrn.8 d3, d15 - - vst1.8 {d0}, [r8@64], r1 - vst1.8 {d20}, [r0@64], r1 - vst1.8 {d16}, [r8@64], r1 - vst1.8 {d21}, [r0@64], r1 - vst1.8 {d24}, [r8@64], r1 - vst1.8 {d22}, [r0@64], r1 - vst1.8 {d25}, [r8@64], r1 - vst1.8 {d23}, [r0@64], r1 - vst1.8 {d26}, [r8@64], r1 - vst1.8 {d1}, [r0@64], r1 - vst1.8 {d27}, [r8@64], r1 - vst1.8 {d2}, [r0@64], r1 - vst1.8 {d18}, [r8@64], r1 - vst1.8 {d3}, [r0@64], r1 - vst1.8 {d19}, [r8@64], r1 - vst1.8 {d15}, [r0@64], r1 - -v_end - vpop {d8-d15} - pop {r4-r8, pc} - - ENDP ; |vpx_lpf_vertical_16_neon| - ALIGN 4 - -; void vpx_wide_mbfilter_neon(); -; This is a helper function for the loopfilters. The invidual functions do the -; necessary load, transpose (if necessary) and store. 
-; -; r0-r3 PRESERVE -; d16 blimit -; d17 limit -; d18 thresh -; d0 p7 -; d1 p6 -; d2 p5 -; d3 p4 -; d4 p3 -; d5 p2 -; d6 p1 -; d7 p0 -; d8 q0 -; d9 q1 -; d10 q2 -; d11 q3 -; d12 q4 -; d13 q5 -; d14 q6 -; d15 q7 -|vpx_wide_mbfilter_neon| PROC - mov r7, #0 - - ; filter_mask - vabd.u8 d19, d4, d5 ; abs(p3 - p2) - vabd.u8 d20, d5, d6 ; abs(p2 - p1) - vabd.u8 d21, d6, d7 ; abs(p1 - p0) - vabd.u8 d22, d9, d8 ; abs(q1 - q0) - vabd.u8 d23, d10, d9 ; abs(q2 - q1) - vabd.u8 d24, d11, d10 ; abs(q3 - q2) - - ; only compare the largest value to limit - vmax.u8 d19, d19, d20 ; max(abs(p3 - p2), abs(p2 - p1)) - vmax.u8 d20, d21, d22 ; max(abs(p1 - p0), abs(q1 - q0)) - vmax.u8 d23, d23, d24 ; max(abs(q2 - q1), abs(q3 - q2)) - vmax.u8 d19, d19, d20 - - vabd.u8 d24, d7, d8 ; abs(p0 - q0) - - vmax.u8 d19, d19, d23 - - vabd.u8 d23, d6, d9 ; a = abs(p1 - q1) - vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2 - - ; abs () > limit - vcge.u8 d19, d17, d19 - - ; flatmask4 - vabd.u8 d25, d7, d5 ; abs(p0 - p2) - vabd.u8 d26, d8, d10 ; abs(q0 - q2) - vabd.u8 d27, d4, d7 ; abs(p3 - p0) - vabd.u8 d28, d11, d8 ; abs(q3 - q0) - - ; only compare the largest value to thresh - vmax.u8 d25, d25, d26 ; max(abs(p0 - p2), abs(q0 - q2)) - vmax.u8 d26, d27, d28 ; max(abs(p3 - p0), abs(q3 - q0)) - vmax.u8 d25, d25, d26 - vmax.u8 d20, d20, d25 - - vshr.u8 d23, d23, #1 ; a = a / 2 - vqadd.u8 d24, d24, d23 ; a = b + a - - vmov.u8 d30, #1 - vcge.u8 d24, d16, d24 ; (a > blimit * 2 + limit) * -1 - - vcge.u8 d20, d30, d20 ; flat - - vand d19, d19, d24 ; mask - - ; hevmask - vcgt.u8 d21, d21, d18 ; (abs(p1 - p0) > thresh)*-1 - vcgt.u8 d22, d22, d18 ; (abs(q1 - q0) > thresh)*-1 - vorr d21, d21, d22 ; hev - - vand d16, d20, d19 ; flat && mask - vmov r5, r6, d16 - - ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7) - vabd.u8 d22, d3, d7 ; abs(p4 - p0) - vabd.u8 d23, d12, d8 ; abs(q4 - q0) - vabd.u8 d24, d7, d2 ; abs(p0 - p5) - vabd.u8 d25, d8, d13 ; abs(q0 - q5) - vabd.u8 d26, d1, d7 ; abs(p6 - p0) - vabd.u8 d27, d14, d8 ; abs(q6 - q0) - vabd.u8 d28, d0, d7 ; abs(p7 - p0) - vabd.u8 d29, d15, d8 ; abs(q7 - q0) - - ; only compare the largest value to thresh - vmax.u8 d22, d22, d23 ; max(abs(p4 - p0), abs(q4 - q0)) - vmax.u8 d23, d24, d25 ; max(abs(p0 - p5), abs(q0 - q5)) - vmax.u8 d24, d26, d27 ; max(abs(p6 - p0), abs(q6 - q0)) - vmax.u8 d25, d28, d29 ; max(abs(p7 - p0), abs(q7 - q0)) - - vmax.u8 d26, d22, d23 - vmax.u8 d27, d24, d25 - vmax.u8 d23, d26, d27 - - vcge.u8 d18, d30, d23 ; flat2 - - vmov.u8 d22, #0x80 - - orrs r5, r5, r6 ; Check for 0 - orreq r7, r7, #1 ; Only do filter branch - - vand d17, d18, d16 ; flat2 && flat && mask - vmov r5, r6, d17 - - ; mbfilter() function - - ; filter() function - ; convert to signed - veor d23, d8, d22 ; qs0 - veor d24, d7, d22 ; ps0 - veor d25, d6, d22 ; ps1 - veor d26, d9, d22 ; qs1 - - vmov.u8 d27, #3 - - vsub.s8 d28, d23, d24 ; ( qs0 - ps0) - vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) - vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) - vand d29, d29, d21 ; filter &= hev - vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) - vmov.u8 d29, #4 - - ; filter = clamp(filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d28, q15 - - vand d28, d28, d19 ; filter &= mask - - vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3) - vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4) - vshr.s8 d30, d30, #3 ; filter2 >>= 3 - vshr.s8 d29, d29, #3 ; filter1 >>= 3 - - - vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2) - vqsub.s8 d23, d23, d29 ; oq0 = clamp(qs0 - filter1) - - ; outer tap adjustments: ++filter1 >> 1 - vrshr.s8 
d29, d29, #1 - vbic d29, d29, d21 ; filter &= ~hev - - vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter) - vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter) - - veor d24, d24, d22 ; *f_op0 = u^0x80 - veor d23, d23, d22 ; *f_oq0 = u^0x80 - veor d25, d25, d22 ; *f_op1 = u^0x80 - veor d26, d26, d22 ; *f_oq1 = u^0x80 - - tst r7, #1 - bxne lr - - orrs r5, r5, r6 ; Check for 0 - orreq r7, r7, #2 ; Only do mbfilter branch - - ; mbfilter flat && mask branch - ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's - ; and using vibt on the q's? - vmov.u8 d29, #2 - vaddl.u8 q15, d7, d8 ; op2 = p0 + q0 - vmlal.u8 q15, d4, d27 ; op2 = p0 + q0 + p3 * 3 - vmlal.u8 q15, d5, d29 ; op2 = p0 + q0 + p3 * 3 + p2 * 2 - vaddl.u8 q10, d4, d5 - vaddw.u8 q15, d6 ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2 - vaddl.u8 q14, d6, d9 - vqrshrn.u16 d18, q15, #3 ; r_op2 - - vsub.i16 q15, q10 - vaddl.u8 q10, d4, d6 - vadd.i16 q15, q14 - vaddl.u8 q14, d7, d10 - vqrshrn.u16 d19, q15, #3 ; r_op1 - - vsub.i16 q15, q10 - vadd.i16 q15, q14 - vaddl.u8 q14, d8, d11 - vqrshrn.u16 d20, q15, #3 ; r_op0 - - vsubw.u8 q15, d4 ; oq0 = op0 - p3 - vsubw.u8 q15, d7 ; oq0 -= p0 - vadd.i16 q15, q14 - vaddl.u8 q14, d9, d11 - vqrshrn.u16 d21, q15, #3 ; r_oq0 - - vsubw.u8 q15, d5 ; oq1 = oq0 - p2 - vsubw.u8 q15, d8 ; oq1 -= q0 - vadd.i16 q15, q14 - vaddl.u8 q14, d10, d11 - vqrshrn.u16 d22, q15, #3 ; r_oq1 - - vsubw.u8 q15, d6 ; oq2 = oq0 - p1 - vsubw.u8 q15, d9 ; oq2 -= q1 - vadd.i16 q15, q14 - vqrshrn.u16 d27, q15, #3 ; r_oq2 - - ; Filter does not set op2 or oq2, so use p2 and q2. - vbif d18, d5, d16 ; t_op2 |= p2 & ~(flat & mask) - vbif d19, d25, d16 ; t_op1 |= f_op1 & ~(flat & mask) - vbif d20, d24, d16 ; t_op0 |= f_op0 & ~(flat & mask) - vbif d21, d23, d16 ; t_oq0 |= f_oq0 & ~(flat & mask) - vbif d22, d26, d16 ; t_oq1 |= f_oq1 & ~(flat & mask) - - vbit d23, d27, d16 ; t_oq2 |= r_oq2 & (flat & mask) - vbif d23, d10, d16 ; t_oq2 |= q2 & ~(flat & mask) - - tst r7, #2 - bxne lr - - ; wide_mbfilter flat2 && flat && mask branch - vmov.u8 d16, #7 - vaddl.u8 q15, d7, d8 ; op6 = p0 + q0 - vaddl.u8 q12, d2, d3 - vaddl.u8 q13, d4, d5 - vaddl.u8 q14, d1, d6 - vmlal.u8 q15, d0, d16 ; op6 += p7 * 3 - vadd.i16 q12, q13 - vadd.i16 q15, q14 - vaddl.u8 q14, d2, d9 - vadd.i16 q15, q12 - vaddl.u8 q12, d0, d1 - vaddw.u8 q15, d1 - vaddl.u8 q13, d0, d2 - vadd.i16 q14, q15, q14 - vqrshrn.u16 d16, q15, #4 ; w_op6 - - vsub.i16 q15, q14, q12 - vaddl.u8 q14, d3, d10 - vqrshrn.u16 d24, q15, #4 ; w_op5 - - vsub.i16 q15, q13 - vaddl.u8 q13, d0, d3 - vadd.i16 q15, q14 - vaddl.u8 q14, d4, d11 - vqrshrn.u16 d25, q15, #4 ; w_op4 - - vadd.i16 q15, q14 - vaddl.u8 q14, d0, d4 - vsub.i16 q15, q13 - vsub.i16 q14, q15, q14 - vqrshrn.u16 d26, q15, #4 ; w_op3 - - vaddw.u8 q15, q14, d5 ; op2 += p2 - vaddl.u8 q14, d0, d5 - vaddw.u8 q15, d12 ; op2 += q4 - vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m) - vqrshrn.u16 d27, q15, #4 ; w_op2 - - vsub.i16 q15, q14 - vaddl.u8 q14, d0, d6 - vaddw.u8 q15, d6 ; op1 += p1 - vaddw.u8 q15, d13 ; op1 += q5 - vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m) - vqrshrn.u16 d18, q15, #4 ; w_op1 - - vsub.i16 q15, q14 - vaddl.u8 q14, d0, d7 - vaddw.u8 q15, d7 ; op0 += p0 - vaddw.u8 q15, d14 ; op0 += q6 - vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m) - vqrshrn.u16 d19, q15, #4 ; w_op0 - - vsub.i16 q15, q14 - vaddl.u8 q14, d1, d8 - vaddw.u8 q15, d8 ; oq0 += q0 - vaddw.u8 q15, d15 ; oq0 += q7 - vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m) - vqrshrn.u16 d20, q15, #4 ; w_oq0 - - vsub.i16 q15, q14 - vaddl.u8 q14, d2, d9 - vaddw.u8 q15, d9 ; 
oq1 += q1 - vaddl.u8 q4, d10, d15 - vaddw.u8 q15, d15 ; oq1 += q7 - vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m) - vqrshrn.u16 d21, q15, #4 ; w_oq1 - - vsub.i16 q15, q14 - vaddl.u8 q14, d3, d10 - vadd.i16 q15, q4 - vaddl.u8 q4, d11, d15 - vbif d21, d22, d17 ; oq1 |= t_oq1 & ~(f2 & f & m) - vqrshrn.u16 d22, q15, #4 ; w_oq2 - - vsub.i16 q15, q14 - vaddl.u8 q14, d4, d11 - vadd.i16 q15, q4 - vaddl.u8 q4, d12, d15 - vbif d22, d23, d17 ; oq2 |= t_oq2 & ~(f2 & f & m) - vqrshrn.u16 d23, q15, #4 ; w_oq3 - - vsub.i16 q15, q14 - vaddl.u8 q14, d5, d12 - vadd.i16 q15, q4 - vaddl.u8 q4, d13, d15 - vbif d16, d1, d17 ; op6 |= p6 & ~(f2 & f & m) - vqrshrn.u16 d1, q15, #4 ; w_oq4 - - vsub.i16 q15, q14 - vaddl.u8 q14, d6, d13 - vadd.i16 q15, q4 - vaddl.u8 q4, d14, d15 - vbif d24, d2, d17 ; op5 |= p5 & ~(f2 & f & m) - vqrshrn.u16 d2, q15, #4 ; w_oq5 - - vsub.i16 q15, q14 - vbif d25, d3, d17 ; op4 |= p4 & ~(f2 & f & m) - vadd.i16 q15, q4 - vbif d23, d11, d17 ; oq3 |= q3 & ~(f2 & f & m) - vqrshrn.u16 d3, q15, #4 ; w_oq6 - vbif d1, d12, d17 ; oq4 |= q4 & ~(f2 & f & m) - vbif d2, d13, d17 ; oq5 |= q5 & ~(f2 & f & m) - vbif d3, d14, d17 ; oq6 |= q6 & ~(f2 & f & m) - - bx lr - ENDP ; |vpx_wide_mbfilter_neon| - ALIGN 4 - - END diff --git a/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/save_reg_neon.asm b/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/save_reg_neon.asm deleted file mode 100644 index 4cf9988e65..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/save_reg_neon.asm +++ /dev/null @@ -1,39 +0,0 @@ -; This file was created from a .asm file -; using the ads2armasm_ms.pl script. -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vpx_push_neon| - EXPORT |vpx_pop_neon| - - - - - AREA |.text|, CODE, READONLY, ALIGN=2 - -|vpx_push_neon| PROC - vst1.i64 {d8, d9, d10, d11}, [r0]! - vst1.i64 {d12, d13, d14, d15}, [r0]! - bx lr - - ENDP - ALIGN 4 - -|vpx_pop_neon| PROC - vld1.i64 {d8, d9, d10, d11}, [r0]! - vld1.i64 {d12, d13, d14, d15}, [r0]! - bx lr - - ENDP - ALIGN 4 - - END - diff --git a/thirdparty/libvpx/vpx_dsp/arm/gas/intrapred_neon_asm.s b/thirdparty/libvpx/vpx_dsp/arm/gas/intrapred_neon_asm.s deleted file mode 100644 index 3932227fc5..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/gas/intrapred_neon_asm.s +++ /dev/null @@ -1,658 +0,0 @@ -@ This file was created from a .asm file -@ using the ads2gas.pl script. - .equ DO1STROUNDING, 0 -@ -@ Copyright (c) 2014 The WebM project authors. All Rights Reserved. -@ -@ Use of this source code is governed by a BSD-style license -@ that can be found in the LICENSE file in the root of the source -@ tree. An additional intellectual property rights grant can be found -@ in the file PATENTS. All contributing project authors may -@ be found in the AUTHORS file in the root of the source tree. 
-@ - - .global vpx_v_predictor_4x4_neon - .type vpx_v_predictor_4x4_neon, function - .global vpx_v_predictor_8x8_neon - .type vpx_v_predictor_8x8_neon, function - .global vpx_v_predictor_16x16_neon - .type vpx_v_predictor_16x16_neon, function - .global vpx_v_predictor_32x32_neon - .type vpx_v_predictor_32x32_neon, function - .global vpx_h_predictor_4x4_neon - .type vpx_h_predictor_4x4_neon, function - .global vpx_h_predictor_8x8_neon - .type vpx_h_predictor_8x8_neon, function - .global vpx_h_predictor_16x16_neon - .type vpx_h_predictor_16x16_neon, function - .global vpx_h_predictor_32x32_neon - .type vpx_h_predictor_32x32_neon, function - .global vpx_tm_predictor_4x4_neon - .type vpx_tm_predictor_4x4_neon, function - .global vpx_tm_predictor_8x8_neon - .type vpx_tm_predictor_8x8_neon, function - .global vpx_tm_predictor_16x16_neon - .type vpx_tm_predictor_16x16_neon, function - .global vpx_tm_predictor_32x32_neon - .type vpx_tm_predictor_32x32_neon, function - .arm - .eabi_attribute 24, 1 @Tag_ABI_align_needed - .eabi_attribute 25, 1 @Tag_ABI_align_preserved - -.text -.p2align 2 - -@void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, -@ const uint8_t *above, -@ const uint8_t *left) -@ r0 uint8_t *dst -@ r1 ptrdiff_t y_stride -@ r2 const uint8_t *above -@ r3 const uint8_t *left - -_vpx_v_predictor_4x4_neon: - vpx_v_predictor_4x4_neon: @ PROC - vld1.32 {d0[0]}, [r2] - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d0[0]}, [r0], r1 - bx lr - .size vpx_v_predictor_4x4_neon, .-vpx_v_predictor_4x4_neon @ ENDP @ |vpx_v_predictor_4x4_neon| - -@void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, -@ const uint8_t *above, -@ const uint8_t *left) -@ r0 uint8_t *dst -@ r1 ptrdiff_t y_stride -@ r2 const uint8_t *above -@ r3 const uint8_t *left - -_vpx_v_predictor_8x8_neon: - vpx_v_predictor_8x8_neon: @ PROC - vld1.8 {d0}, [r2] - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - bx lr - .size vpx_v_predictor_8x8_neon, .-vpx_v_predictor_8x8_neon @ ENDP @ |vpx_v_predictor_8x8_neon| - -@void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, -@ const uint8_t *above, -@ const uint8_t *left) -@ r0 uint8_t *dst -@ r1 ptrdiff_t y_stride -@ r2 const uint8_t *above -@ r3 const uint8_t *left - -_vpx_v_predictor_16x16_neon: - vpx_v_predictor_16x16_neon: @ PROC - vld1.8 {q0}, [r2] - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - bx lr - .size vpx_v_predictor_16x16_neon, .-vpx_v_predictor_16x16_neon @ ENDP @ |vpx_v_predictor_16x16_neon| - -@void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, -@ const uint8_t *above, -@ const uint8_t *left) -@ r0 uint8_t *dst -@ r1 ptrdiff_t y_stride -@ r2 const uint8_t *above -@ r3 const uint8_t *left - -_vpx_v_predictor_32x32_neon: - vpx_v_predictor_32x32_neon: @ PROC - vld1.8 {q0, q1}, [r2] - mov r2, #2 -loop_v: - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, 
q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - subs r2, r2, #1 - bgt loop_v - bx lr - .size vpx_v_predictor_32x32_neon, .-vpx_v_predictor_32x32_neon @ ENDP @ |vpx_v_predictor_32x32_neon| - -@void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, -@ const uint8_t *above, -@ const uint8_t *left) -@ r0 uint8_t *dst -@ r1 ptrdiff_t y_stride -@ r2 const uint8_t *above -@ r3 const uint8_t *left - -_vpx_h_predictor_4x4_neon: - vpx_h_predictor_4x4_neon: @ PROC - vld1.32 {d1[0]}, [r3] - vdup.8 d0, d1[0] - vst1.32 {d0[0]}, [r0], r1 - vdup.8 d0, d1[1] - vst1.32 {d0[0]}, [r0], r1 - vdup.8 d0, d1[2] - vst1.32 {d0[0]}, [r0], r1 - vdup.8 d0, d1[3] - vst1.32 {d0[0]}, [r0], r1 - bx lr - .size vpx_h_predictor_4x4_neon, .-vpx_h_predictor_4x4_neon @ ENDP @ |vpx_h_predictor_4x4_neon| - -@void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, -@ const uint8_t *above, -@ const uint8_t *left) -@ r0 uint8_t *dst -@ r1 ptrdiff_t y_stride -@ r2 const uint8_t *above -@ r3 const uint8_t *left - -_vpx_h_predictor_8x8_neon: - vpx_h_predictor_8x8_neon: @ PROC - vld1.64 {d1}, [r3] - vdup.8 d0, d1[0] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[1] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[2] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[3] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[4] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[5] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[6] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[7] - vst1.64 {d0}, [r0], r1 - bx lr - .size vpx_h_predictor_8x8_neon, .-vpx_h_predictor_8x8_neon @ ENDP @ |vpx_h_predictor_8x8_neon| - -@void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, -@ const uint8_t *above, -@ const uint8_t *left) -@ r0 uint8_t *dst -@ r1 ptrdiff_t y_stride -@ r2 const uint8_t *above -@ r3 const uint8_t *left - -_vpx_h_predictor_16x16_neon: - vpx_h_predictor_16x16_neon: @ PROC - vld1.8 {q1}, [r3] - vdup.8 q0, d2[0] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[1] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[2] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[3] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[4] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[5] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[6] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[7] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[0] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[1] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[2] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[3] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[4] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[5] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[6] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[7] - vst1.8 {q0}, [r0], r1 - bx lr - .size vpx_h_predictor_16x16_neon, .-vpx_h_predictor_16x16_neon @ ENDP @ |vpx_h_predictor_16x16_neon| - -@void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, -@ const uint8_t *above, -@ const uint8_t *left) -@ r0 uint8_t *dst -@ r1 ptrdiff_t y_stride -@ r2 const uint8_t *above -@ r3 const uint8_t *left - -_vpx_h_predictor_32x32_neon: - vpx_h_predictor_32x32_neon: @ PROC - sub r1, r1, #16 - mov r2, #2 -loop_h: - vld1.8 {q1}, [r3]! - vdup.8 q0, d2[0] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[1] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[2] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[3] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[4] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[5] - vst1.8 {q0}, [r0]! 
- vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[6] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[7] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[0] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[1] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[2] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[3] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[4] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[5] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[6] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[7] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - subs r2, r2, #1 - bgt loop_h - bx lr - .size vpx_h_predictor_32x32_neon, .-vpx_h_predictor_32x32_neon @ ENDP @ |vpx_h_predictor_32x32_neon| - -@void vpx_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride, -@ const uint8_t *above, -@ const uint8_t *left) -@ r0 uint8_t *dst -@ r1 ptrdiff_t y_stride -@ r2 const uint8_t *above -@ r3 const uint8_t *left - -_vpx_tm_predictor_4x4_neon: - vpx_tm_predictor_4x4_neon: @ PROC - @ Load ytop_left = above[-1]; - sub r12, r2, #1 - vld1.u8 {d0[]}, [r12] - - @ Load above 4 pixels - vld1.32 {d2[0]}, [r2] - - @ Compute above - ytop_left - vsubl.u8 q3, d2, d0 - - @ Load left row by row and compute left + (above - ytop_left) - @ 1st row and 2nd row - vld1.u8 {d2[]}, [r3]! - vld1.u8 {d4[]}, [r3]! - vmovl.u8 q1, d2 - vmovl.u8 q2, d4 - vadd.s16 q1, q1, q3 - vadd.s16 q2, q2, q3 - vqmovun.s16 d0, q1 - vqmovun.s16 d1, q2 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d1[0]}, [r0], r1 - - @ 3rd row and 4th row - vld1.u8 {d2[]}, [r3]! - vld1.u8 {d4[]}, [r3] - vmovl.u8 q1, d2 - vmovl.u8 q2, d4 - vadd.s16 q1, q1, q3 - vadd.s16 q2, q2, q3 - vqmovun.s16 d0, q1 - vqmovun.s16 d1, q2 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d1[0]}, [r0], r1 - bx lr - .size vpx_tm_predictor_4x4_neon, .-vpx_tm_predictor_4x4_neon @ ENDP @ |vpx_tm_predictor_4x4_neon| - -@void vpx_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride, -@ const uint8_t *above, -@ const uint8_t *left) -@ r0 uint8_t *dst -@ r1 ptrdiff_t y_stride -@ r2 const uint8_t *above -@ r3 const uint8_t *left - -_vpx_tm_predictor_8x8_neon: - vpx_tm_predictor_8x8_neon: @ PROC - @ Load ytop_left = above[-1]; - sub r12, r2, #1 - vld1.8 {d0[]}, [r12] - - @ preload 8 left - vld1.8 {d30}, [r3] - - @ Load above 8 pixels - vld1.64 {d2}, [r2] - - vmovl.u8 q10, d30 - - @ Compute above - ytop_left - vsubl.u8 q3, d2, d0 - - @ Load left row by row and compute left + (above - ytop_left) - @ 1st row and 2nd row - vdup.16 q0, d20[0] - vdup.16 q1, d20[1] - vadd.s16 q0, q3, q0 - vadd.s16 q1, q3, q1 - - @ 3rd row and 4th row - vdup.16 q8, d20[2] - vdup.16 q9, d20[3] - vadd.s16 q8, q3, q8 - vadd.s16 q9, q3, q9 - - vqmovun.s16 d0, q0 - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q8 - vqmovun.s16 d3, q9 - - vst1.64 {d0}, [r0], r1 - vst1.64 {d1}, [r0], r1 - vst1.64 {d2}, [r0], r1 - vst1.64 {d3}, [r0], r1 - - @ 5th row and 6th row - vdup.16 q0, d21[0] - vdup.16 q1, d21[1] - vadd.s16 q0, q3, q0 - vadd.s16 q1, q3, q1 - - @ 7th row and 8th row - vdup.16 q8, d21[2] - vdup.16 q9, d21[3] - vadd.s16 q8, q3, q8 - vadd.s16 q9, q3, q9 - - vqmovun.s16 d0, q0 - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q8 - vqmovun.s16 d3, q9 - - vst1.64 {d0}, [r0], r1 - vst1.64 {d1}, [r0], r1 - vst1.64 {d2}, [r0], r1 - vst1.64 {d3}, [r0], r1 - - bx lr - .size vpx_tm_predictor_8x8_neon, .-vpx_tm_predictor_8x8_neon @ ENDP @ |vpx_tm_predictor_8x8_neon| - -@void vpx_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride, -@ const uint8_t 
*above, -@ const uint8_t *left) -@ r0 uint8_t *dst -@ r1 ptrdiff_t y_stride -@ r2 const uint8_t *above -@ r3 const uint8_t *left - -_vpx_tm_predictor_16x16_neon: - vpx_tm_predictor_16x16_neon: @ PROC - @ Load ytop_left = above[-1]; - sub r12, r2, #1 - vld1.8 {d0[]}, [r12] - - @ Load above 8 pixels - vld1.8 {q1}, [r2] - - @ preload 8 left into r12 - vld1.8 {d18}, [r3]! - - @ Compute above - ytop_left - vsubl.u8 q2, d2, d0 - vsubl.u8 q3, d3, d0 - - vmovl.u8 q10, d18 - - @ Load left row by row and compute left + (above - ytop_left) - @ Process 8 rows in each single loop and loop 2 times to process 16 rows. - mov r2, #2 - -loop_16x16_neon: - @ Process two rows. - vdup.16 q0, d20[0] - vdup.16 q8, d20[1] - vadd.s16 q1, q0, q2 - vadd.s16 q0, q0, q3 - vadd.s16 q11, q8, q2 - vadd.s16 q8, q8, q3 - vqmovun.s16 d2, q1 - vqmovun.s16 d3, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q8 - vdup.16 q0, d20[2] @ proload next 2 rows data - vdup.16 q8, d20[3] - vst1.64 {d2,d3}, [r0], r1 - vst1.64 {d22,d23}, [r0], r1 - - @ Process two rows. - vadd.s16 q1, q0, q2 - vadd.s16 q0, q0, q3 - vadd.s16 q11, q8, q2 - vadd.s16 q8, q8, q3 - vqmovun.s16 d2, q1 - vqmovun.s16 d3, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q8 - vdup.16 q0, d21[0] @ proload next 2 rows data - vdup.16 q8, d21[1] - vst1.64 {d2,d3}, [r0], r1 - vst1.64 {d22,d23}, [r0], r1 - - vadd.s16 q1, q0, q2 - vadd.s16 q0, q0, q3 - vadd.s16 q11, q8, q2 - vadd.s16 q8, q8, q3 - vqmovun.s16 d2, q1 - vqmovun.s16 d3, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q8 - vdup.16 q0, d21[2] @ proload next 2 rows data - vdup.16 q8, d21[3] - vst1.64 {d2,d3}, [r0], r1 - vst1.64 {d22,d23}, [r0], r1 - - - vadd.s16 q1, q0, q2 - vadd.s16 q0, q0, q3 - vadd.s16 q11, q8, q2 - vadd.s16 q8, q8, q3 - vqmovun.s16 d2, q1 - vqmovun.s16 d3, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q8 - vld1.8 {d18}, [r3]! @ preload 8 left into r12 - vmovl.u8 q10, d18 - vst1.64 {d2,d3}, [r0], r1 - vst1.64 {d22,d23}, [r0], r1 - - subs r2, r2, #1 - bgt loop_16x16_neon - - bx lr - .size vpx_tm_predictor_16x16_neon, .-vpx_tm_predictor_16x16_neon @ ENDP @ |vpx_tm_predictor_16x16_neon| - -@void vpx_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride, -@ const uint8_t *above, -@ const uint8_t *left) -@ r0 uint8_t *dst -@ r1 ptrdiff_t y_stride -@ r2 const uint8_t *above -@ r3 const uint8_t *left - -_vpx_tm_predictor_32x32_neon: - vpx_tm_predictor_32x32_neon: @ PROC - @ Load ytop_left = above[-1]; - sub r12, r2, #1 - vld1.8 {d0[]}, [r12] - - @ Load above 32 pixels - vld1.8 {q1}, [r2]! - vld1.8 {q2}, [r2] - - @ preload 8 left pixels - vld1.8 {d26}, [r3]! - - @ Compute above - ytop_left - vsubl.u8 q8, d2, d0 - vsubl.u8 q9, d3, d0 - vsubl.u8 q10, d4, d0 - vsubl.u8 q11, d5, d0 - - vmovl.u8 q3, d26 - - @ Load left row by row and compute left + (above - ytop_left) - @ Process 8 rows in each single loop and loop 4 times to process 32 rows. - mov r2, #4 - -loop_32x32_neon: - @ Process two rows. - vdup.16 q0, d6[0] - vdup.16 q2, d6[1] - vadd.s16 q12, q0, q8 - vadd.s16 q13, q0, q9 - vadd.s16 q14, q0, q10 - vadd.s16 q15, q0, q11 - vqmovun.s16 d0, q12 - vqmovun.s16 d1, q13 - vadd.s16 q12, q2, q8 - vadd.s16 q13, q2, q9 - vqmovun.s16 d2, q14 - vqmovun.s16 d3, q15 - vadd.s16 q14, q2, q10 - vadd.s16 q15, q2, q11 - vst1.64 {d0-d3}, [r0], r1 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vqmovun.s16 d26, q14 - vqmovun.s16 d27, q15 - vdup.16 q1, d6[2] - vdup.16 q2, d6[3] - vst1.64 {d24-d27}, [r0], r1 - - @ Process two rows. 
- vadd.s16 q12, q1, q8 - vadd.s16 q13, q1, q9 - vadd.s16 q14, q1, q10 - vadd.s16 q15, q1, q11 - vqmovun.s16 d0, q12 - vqmovun.s16 d1, q13 - vadd.s16 q12, q2, q8 - vadd.s16 q13, q2, q9 - vqmovun.s16 d2, q14 - vqmovun.s16 d3, q15 - vadd.s16 q14, q2, q10 - vadd.s16 q15, q2, q11 - vst1.64 {d0-d3}, [r0], r1 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vqmovun.s16 d26, q14 - vqmovun.s16 d27, q15 - vdup.16 q0, d7[0] - vdup.16 q2, d7[1] - vst1.64 {d24-d27}, [r0], r1 - - @ Process two rows. - vadd.s16 q12, q0, q8 - vadd.s16 q13, q0, q9 - vadd.s16 q14, q0, q10 - vadd.s16 q15, q0, q11 - vqmovun.s16 d0, q12 - vqmovun.s16 d1, q13 - vadd.s16 q12, q2, q8 - vadd.s16 q13, q2, q9 - vqmovun.s16 d2, q14 - vqmovun.s16 d3, q15 - vadd.s16 q14, q2, q10 - vadd.s16 q15, q2, q11 - vst1.64 {d0-d3}, [r0], r1 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vqmovun.s16 d26, q14 - vqmovun.s16 d27, q15 - vdup.16 q0, d7[2] - vdup.16 q2, d7[3] - vst1.64 {d24-d27}, [r0], r1 - - @ Process two rows. - vadd.s16 q12, q0, q8 - vadd.s16 q13, q0, q9 - vadd.s16 q14, q0, q10 - vadd.s16 q15, q0, q11 - vqmovun.s16 d0, q12 - vqmovun.s16 d1, q13 - vadd.s16 q12, q2, q8 - vadd.s16 q13, q2, q9 - vqmovun.s16 d2, q14 - vqmovun.s16 d3, q15 - vadd.s16 q14, q2, q10 - vadd.s16 q15, q2, q11 - vst1.64 {d0-d3}, [r0], r1 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vld1.8 {d0}, [r3]! @ preload 8 left pixels - vqmovun.s16 d26, q14 - vqmovun.s16 d27, q15 - vmovl.u8 q3, d0 - vst1.64 {d24-d27}, [r0], r1 - - subs r2, r2, #1 - bgt loop_32x32_neon - - bx lr - .size vpx_tm_predictor_32x32_neon, .-vpx_tm_predictor_32x32_neon @ ENDP @ |vpx_tm_predictor_32x32_neon| - - .section .note.GNU-stack,"",%progbits diff --git a/thirdparty/libvpx/vpx_dsp/arm/gas/loopfilter_mb_neon.s b/thirdparty/libvpx/vpx_dsp/arm/gas/loopfilter_mb_neon.s deleted file mode 100644 index f6b05406fb..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/gas/loopfilter_mb_neon.s +++ /dev/null @@ -1,647 +0,0 @@ -@ This file was created from a .asm file -@ using the ads2gas.pl script. - .equ DO1STROUNDING, 0 -@ -@ Copyright (c) 2013 The WebM project authors. All Rights Reserved. -@ -@ Use of this source code is governed by a BSD-style license -@ that can be found in the LICENSE file in the root of the source -@ tree. An additional intellectual property rights grant can be found -@ in the file PATENTS. All contributing project authors may -@ be found in the AUTHORS file in the root of the source tree. 
-@ - - .global vpx_lpf_horizontal_edge_8_neon - .type vpx_lpf_horizontal_edge_8_neon, function - .global vpx_lpf_horizontal_edge_16_neon - .type vpx_lpf_horizontal_edge_16_neon, function - .global vpx_lpf_vertical_16_neon - .type vpx_lpf_vertical_16_neon, function - .arm - -.text -.p2align 2 - -@ void mb_lpf_horizontal_edge(uint8_t *s, int p, -@ const uint8_t *blimit, -@ const uint8_t *limit, -@ const uint8_t *thresh, -@ int count) -@ r0 uint8_t *s, -@ r1 int p, /* pitch */ -@ r2 const uint8_t *blimit, -@ r3 const uint8_t *limit, -@ sp const uint8_t *thresh, -@ r12 int count -_mb_lpf_horizontal_edge: - mb_lpf_horizontal_edge: @ PROC - push {r4-r8, lr} - vpush {d8-d15} - ldr r4, [sp, #88] @ load thresh - -h_count: - vld1.8 {d16[]}, [r2] @ load *blimit - vld1.8 {d17[]}, [r3] @ load *limit - vld1.8 {d18[]}, [r4] @ load *thresh - - sub r8, r0, r1, lsl #3 @ move src pointer down by 8 lines - - vld1.u8 {d0}, [r8,:64], r1 @ p7 - vld1.u8 {d1}, [r8,:64], r1 @ p6 - vld1.u8 {d2}, [r8,:64], r1 @ p5 - vld1.u8 {d3}, [r8,:64], r1 @ p4 - vld1.u8 {d4}, [r8,:64], r1 @ p3 - vld1.u8 {d5}, [r8,:64], r1 @ p2 - vld1.u8 {d6}, [r8,:64], r1 @ p1 - vld1.u8 {d7}, [r8,:64], r1 @ p0 - vld1.u8 {d8}, [r8,:64], r1 @ q0 - vld1.u8 {d9}, [r8,:64], r1 @ q1 - vld1.u8 {d10}, [r8,:64], r1 @ q2 - vld1.u8 {d11}, [r8,:64], r1 @ q3 - vld1.u8 {d12}, [r8,:64], r1 @ q4 - vld1.u8 {d13}, [r8,:64], r1 @ q5 - vld1.u8 {d14}, [r8,:64], r1 @ q6 - vld1.u8 {d15}, [r8,:64], r1 @ q7 - - bl vpx_wide_mbfilter_neon - - tst r7, #1 - beq h_mbfilter - - @ flat && mask were not set for any of the channels. Just store the values - @ from filter. - sub r8, r0, r1, lsl #1 - - vst1.u8 {d25}, [r8,:64], r1 @ store op1 - vst1.u8 {d24}, [r8,:64], r1 @ store op0 - vst1.u8 {d23}, [r8,:64], r1 @ store oq0 - vst1.u8 {d26}, [r8,:64], r1 @ store oq1 - - b h_next - -h_mbfilter: - tst r7, #2 - beq h_wide_mbfilter - - @ flat2 was not set for any of the channels. Just store the values from - @ mbfilter. 
- sub r8, r0, r1, lsl #1 - sub r8, r8, r1 - - vst1.u8 {d18}, [r8,:64], r1 @ store op2 - vst1.u8 {d19}, [r8,:64], r1 @ store op1 - vst1.u8 {d20}, [r8,:64], r1 @ store op0 - vst1.u8 {d21}, [r8,:64], r1 @ store oq0 - vst1.u8 {d22}, [r8,:64], r1 @ store oq1 - vst1.u8 {d23}, [r8,:64], r1 @ store oq2 - - b h_next - -h_wide_mbfilter: - sub r8, r0, r1, lsl #3 - add r8, r8, r1 - - vst1.u8 {d16}, [r8,:64], r1 @ store op6 - vst1.u8 {d24}, [r8,:64], r1 @ store op5 - vst1.u8 {d25}, [r8,:64], r1 @ store op4 - vst1.u8 {d26}, [r8,:64], r1 @ store op3 - vst1.u8 {d27}, [r8,:64], r1 @ store op2 - vst1.u8 {d18}, [r8,:64], r1 @ store op1 - vst1.u8 {d19}, [r8,:64], r1 @ store op0 - vst1.u8 {d20}, [r8,:64], r1 @ store oq0 - vst1.u8 {d21}, [r8,:64], r1 @ store oq1 - vst1.u8 {d22}, [r8,:64], r1 @ store oq2 - vst1.u8 {d23}, [r8,:64], r1 @ store oq3 - vst1.u8 {d1}, [r8,:64], r1 @ store oq4 - vst1.u8 {d2}, [r8,:64], r1 @ store oq5 - vst1.u8 {d3}, [r8,:64], r1 @ store oq6 - -h_next: - add r0, r0, #8 - subs r12, r12, #1 - bne h_count - - vpop {d8-d15} - pop {r4-r8, pc} - - .size mb_lpf_horizontal_edge, .-mb_lpf_horizontal_edge @ ENDP @ |mb_lpf_horizontal_edge| - -@ void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch, -@ const uint8_t *blimit, -@ const uint8_t *limit, -@ const uint8_t *thresh) -@ r0 uint8_t *s, -@ r1 int pitch, -@ r2 const uint8_t *blimit, -@ r3 const uint8_t *limit, -@ sp const uint8_t *thresh -_vpx_lpf_horizontal_edge_8_neon: - vpx_lpf_horizontal_edge_8_neon: @ PROC - mov r12, #1 - b mb_lpf_horizontal_edge - .size vpx_lpf_horizontal_edge_8_neon, .-vpx_lpf_horizontal_edge_8_neon @ ENDP @ |vpx_lpf_horizontal_edge_8_neon| - -@ void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch, -@ const uint8_t *blimit, -@ const uint8_t *limit, -@ const uint8_t *thresh) -@ r0 uint8_t *s, -@ r1 int pitch, -@ r2 const uint8_t *blimit, -@ r3 const uint8_t *limit, -@ sp const uint8_t *thresh -_vpx_lpf_horizontal_edge_16_neon: - vpx_lpf_horizontal_edge_16_neon: @ PROC - mov r12, #2 - b mb_lpf_horizontal_edge - .size vpx_lpf_horizontal_edge_16_neon, .-vpx_lpf_horizontal_edge_16_neon @ ENDP @ |vpx_lpf_horizontal_edge_16_neon| - -@ void vpx_lpf_vertical_16_neon(uint8_t *s, int p, -@ const uint8_t *blimit, -@ const uint8_t *limit, -@ const uint8_t *thresh) -@ r0 uint8_t *s, -@ r1 int p, /* pitch */ -@ r2 const uint8_t *blimit, -@ r3 const uint8_t *limit, -@ sp const uint8_t *thresh, -_vpx_lpf_vertical_16_neon: - vpx_lpf_vertical_16_neon: @ PROC - push {r4-r8, lr} - vpush {d8-d15} - ldr r4, [sp, #88] @ load thresh - - vld1.8 {d16[]}, [r2] @ load *blimit - vld1.8 {d17[]}, [r3] @ load *limit - vld1.8 {d18[]}, [r4] @ load *thresh - - sub r8, r0, #8 - - vld1.8 {d0}, [r8,:64], r1 - vld1.8 {d8}, [r0,:64], r1 - vld1.8 {d1}, [r8,:64], r1 - vld1.8 {d9}, [r0,:64], r1 - vld1.8 {d2}, [r8,:64], r1 - vld1.8 {d10}, [r0,:64], r1 - vld1.8 {d3}, [r8,:64], r1 - vld1.8 {d11}, [r0,:64], r1 - vld1.8 {d4}, [r8,:64], r1 - vld1.8 {d12}, [r0,:64], r1 - vld1.8 {d5}, [r8,:64], r1 - vld1.8 {d13}, [r0,:64], r1 - vld1.8 {d6}, [r8,:64], r1 - vld1.8 {d14}, [r0,:64], r1 - vld1.8 {d7}, [r8,:64], r1 - vld1.8 {d15}, [r0,:64], r1 - - sub r0, r0, r1, lsl #3 - - vtrn.32 q0, q2 - vtrn.32 q1, q3 - vtrn.32 q4, q6 - vtrn.32 q5, q7 - - vtrn.16 q0, q1 - vtrn.16 q2, q3 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - - vtrn.8 d0, d1 - vtrn.8 d2, d3 - vtrn.8 d4, d5 - vtrn.8 d6, d7 - - vtrn.8 d8, d9 - vtrn.8 d10, d11 - vtrn.8 d12, d13 - vtrn.8 d14, d15 - - bl vpx_wide_mbfilter_neon - - tst r7, #1 - beq v_mbfilter - - @ flat && mask were not set for any of the channels. 
Just store the values - @ from filter. - sub r8, r0, #2 - - vswp d23, d25 - - vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r8], r1 - vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r8], r1 - vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r8], r1 - vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r8], r1 - vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r8], r1 - vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r8], r1 - vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r8], r1 - vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r8], r1 - - b v_end - -v_mbfilter: - tst r7, #2 - beq v_wide_mbfilter - - @ flat2 was not set for any of the channels. Just store the values from - @ mbfilter. - sub r8, r0, #3 - - vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1 - vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1 - vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1 - vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1 - vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1 - vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1 - vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1 - vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1 - vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1 - vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1 - vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1 - vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1 - vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1 - vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1 - vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1 - vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1 - - b v_end - -v_wide_mbfilter: - sub r8, r0, #8 - - vtrn.32 d0, d26 - vtrn.32 d16, d27 - vtrn.32 d24, d18 - vtrn.32 d25, d19 - - vtrn.16 d0, d24 - vtrn.16 d16, d25 - vtrn.16 d26, d18 - vtrn.16 d27, d19 - - vtrn.8 d0, d16 - vtrn.8 d24, d25 - vtrn.8 d26, d27 - vtrn.8 d18, d19 - - vtrn.32 d20, d1 - vtrn.32 d21, d2 - vtrn.32 d22, d3 - vtrn.32 d23, d15 - - vtrn.16 d20, d22 - vtrn.16 d21, d23 - vtrn.16 d1, d3 - vtrn.16 d2, d15 - - vtrn.8 d20, d21 - vtrn.8 d22, d23 - vtrn.8 d1, d2 - vtrn.8 d3, d15 - - vst1.8 {d0}, [r8,:64], r1 - vst1.8 {d20}, [r0,:64], r1 - vst1.8 {d16}, [r8,:64], r1 - vst1.8 {d21}, [r0,:64], r1 - vst1.8 {d24}, [r8,:64], r1 - vst1.8 {d22}, [r0,:64], r1 - vst1.8 {d25}, [r8,:64], r1 - vst1.8 {d23}, [r0,:64], r1 - vst1.8 {d26}, [r8,:64], r1 - vst1.8 {d1}, [r0,:64], r1 - vst1.8 {d27}, [r8,:64], r1 - vst1.8 {d2}, [r0,:64], r1 - vst1.8 {d18}, [r8,:64], r1 - vst1.8 {d3}, [r0,:64], r1 - vst1.8 {d19}, [r8,:64], r1 - vst1.8 {d15}, [r0,:64], r1 - -v_end: - vpop {d8-d15} - pop {r4-r8, pc} - - .size vpx_lpf_vertical_16_neon, .-vpx_lpf_vertical_16_neon @ ENDP @ |vpx_lpf_vertical_16_neon| - -@ void vpx_wide_mbfilter_neon(); -@ This is a helper function for the loopfilters. The invidual functions do the -@ necessary load, transpose (if necessary) and store. 
-@ -@ r0-r3 PRESERVE -@ d16 blimit -@ d17 limit -@ d18 thresh -@ d0 p7 -@ d1 p6 -@ d2 p5 -@ d3 p4 -@ d4 p3 -@ d5 p2 -@ d6 p1 -@ d7 p0 -@ d8 q0 -@ d9 q1 -@ d10 q2 -@ d11 q3 -@ d12 q4 -@ d13 q5 -@ d14 q6 -@ d15 q7 -_vpx_wide_mbfilter_neon: - vpx_wide_mbfilter_neon: @ PROC - mov r7, #0 - - @ filter_mask - vabd.u8 d19, d4, d5 @ abs(p3 - p2) - vabd.u8 d20, d5, d6 @ abs(p2 - p1) - vabd.u8 d21, d6, d7 @ abs(p1 - p0) - vabd.u8 d22, d9, d8 @ abs(q1 - q0) - vabd.u8 d23, d10, d9 @ abs(q2 - q1) - vabd.u8 d24, d11, d10 @ abs(q3 - q2) - - @ only compare the largest value to limit - vmax.u8 d19, d19, d20 @ max(abs(p3 - p2), abs(p2 - p1)) - vmax.u8 d20, d21, d22 @ max(abs(p1 - p0), abs(q1 - q0)) - vmax.u8 d23, d23, d24 @ max(abs(q2 - q1), abs(q3 - q2)) - vmax.u8 d19, d19, d20 - - vabd.u8 d24, d7, d8 @ abs(p0 - q0) - - vmax.u8 d19, d19, d23 - - vabd.u8 d23, d6, d9 @ a = abs(p1 - q1) - vqadd.u8 d24, d24, d24 @ b = abs(p0 - q0) * 2 - - @ abs () > limit - vcge.u8 d19, d17, d19 - - @ flatmask4 - vabd.u8 d25, d7, d5 @ abs(p0 - p2) - vabd.u8 d26, d8, d10 @ abs(q0 - q2) - vabd.u8 d27, d4, d7 @ abs(p3 - p0) - vabd.u8 d28, d11, d8 @ abs(q3 - q0) - - @ only compare the largest value to thresh - vmax.u8 d25, d25, d26 @ max(abs(p0 - p2), abs(q0 - q2)) - vmax.u8 d26, d27, d28 @ max(abs(p3 - p0), abs(q3 - q0)) - vmax.u8 d25, d25, d26 - vmax.u8 d20, d20, d25 - - vshr.u8 d23, d23, #1 @ a = a / 2 - vqadd.u8 d24, d24, d23 @ a = b + a - - vmov.u8 d30, #1 - vcge.u8 d24, d16, d24 @ (a > blimit * 2 + limit) * -1 - - vcge.u8 d20, d30, d20 @ flat - - vand d19, d19, d24 @ mask - - @ hevmask - vcgt.u8 d21, d21, d18 @ (abs(p1 - p0) > thresh)*-1 - vcgt.u8 d22, d22, d18 @ (abs(q1 - q0) > thresh)*-1 - vorr d21, d21, d22 @ hev - - vand d16, d20, d19 @ flat && mask - vmov r5, r6, d16 - - @ flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7) - vabd.u8 d22, d3, d7 @ abs(p4 - p0) - vabd.u8 d23, d12, d8 @ abs(q4 - q0) - vabd.u8 d24, d7, d2 @ abs(p0 - p5) - vabd.u8 d25, d8, d13 @ abs(q0 - q5) - vabd.u8 d26, d1, d7 @ abs(p6 - p0) - vabd.u8 d27, d14, d8 @ abs(q6 - q0) - vabd.u8 d28, d0, d7 @ abs(p7 - p0) - vabd.u8 d29, d15, d8 @ abs(q7 - q0) - - @ only compare the largest value to thresh - vmax.u8 d22, d22, d23 @ max(abs(p4 - p0), abs(q4 - q0)) - vmax.u8 d23, d24, d25 @ max(abs(p0 - p5), abs(q0 - q5)) - vmax.u8 d24, d26, d27 @ max(abs(p6 - p0), abs(q6 - q0)) - vmax.u8 d25, d28, d29 @ max(abs(p7 - p0), abs(q7 - q0)) - - vmax.u8 d26, d22, d23 - vmax.u8 d27, d24, d25 - vmax.u8 d23, d26, d27 - - vcge.u8 d18, d30, d23 @ flat2 - - vmov.u8 d22, #0x80 - - orrs r5, r5, r6 @ Check for 0 - orreq r7, r7, #1 @ Only do filter branch - - vand d17, d18, d16 @ flat2 && flat && mask - vmov r5, r6, d17 - - @ mbfilter() function - - @ filter() function - @ convert to signed - veor d23, d8, d22 @ qs0 - veor d24, d7, d22 @ ps0 - veor d25, d6, d22 @ ps1 - veor d26, d9, d22 @ qs1 - - vmov.u8 d27, #3 - - vsub.s8 d28, d23, d24 @ ( qs0 - ps0) - vqsub.s8 d29, d25, d26 @ filter = clamp(ps1-qs1) - vmull.s8 q15, d28, d27 @ 3 * ( qs0 - ps0) - vand d29, d29, d21 @ filter &= hev - vaddw.s8 q15, q15, d29 @ filter + 3 * (qs0 - ps0) - vmov.u8 d29, #4 - - @ filter = clamp(filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d28, q15 - - vand d28, d28, d19 @ filter &= mask - - vqadd.s8 d30, d28, d27 @ filter2 = clamp(filter+3) - vqadd.s8 d29, d28, d29 @ filter1 = clamp(filter+4) - vshr.s8 d30, d30, #3 @ filter2 >>= 3 - vshr.s8 d29, d29, #3 @ filter1 >>= 3 - - - vqadd.s8 d24, d24, d30 @ op0 = clamp(ps0 + filter2) - vqsub.s8 d23, d23, d29 @ oq0 = clamp(qs0 - filter1) - - @ outer tap adjustments: 
++filter1 >> 1 - vrshr.s8 d29, d29, #1 - vbic d29, d29, d21 @ filter &= ~hev - - vqadd.s8 d25, d25, d29 @ op1 = clamp(ps1 + filter) - vqsub.s8 d26, d26, d29 @ oq1 = clamp(qs1 - filter) - - veor d24, d24, d22 @ *f_op0 = u^0x80 - veor d23, d23, d22 @ *f_oq0 = u^0x80 - veor d25, d25, d22 @ *f_op1 = u^0x80 - veor d26, d26, d22 @ *f_oq1 = u^0x80 - - tst r7, #1 - bxne lr - - orrs r5, r5, r6 @ Check for 0 - orreq r7, r7, #2 @ Only do mbfilter branch - - @ mbfilter flat && mask branch - @ TODO(fgalligan): Can I decrease the cycles shifting to consective d's - @ and using vibt on the q's? - vmov.u8 d29, #2 - vaddl.u8 q15, d7, d8 @ op2 = p0 + q0 - vmlal.u8 q15, d4, d27 @ op2 = p0 + q0 + p3 * 3 - vmlal.u8 q15, d5, d29 @ op2 = p0 + q0 + p3 * 3 + p2 * 2 - vaddl.u8 q10, d4, d5 - vaddw.u8 q15, d6 @ op2=p1 + p0 + q0 + p3 * 3 + p2 *2 - vaddl.u8 q14, d6, d9 - vqrshrn.u16 d18, q15, #3 @ r_op2 - - vsub.i16 q15, q10 - vaddl.u8 q10, d4, d6 - vadd.i16 q15, q14 - vaddl.u8 q14, d7, d10 - vqrshrn.u16 d19, q15, #3 @ r_op1 - - vsub.i16 q15, q10 - vadd.i16 q15, q14 - vaddl.u8 q14, d8, d11 - vqrshrn.u16 d20, q15, #3 @ r_op0 - - vsubw.u8 q15, d4 @ oq0 = op0 - p3 - vsubw.u8 q15, d7 @ oq0 -= p0 - vadd.i16 q15, q14 - vaddl.u8 q14, d9, d11 - vqrshrn.u16 d21, q15, #3 @ r_oq0 - - vsubw.u8 q15, d5 @ oq1 = oq0 - p2 - vsubw.u8 q15, d8 @ oq1 -= q0 - vadd.i16 q15, q14 - vaddl.u8 q14, d10, d11 - vqrshrn.u16 d22, q15, #3 @ r_oq1 - - vsubw.u8 q15, d6 @ oq2 = oq0 - p1 - vsubw.u8 q15, d9 @ oq2 -= q1 - vadd.i16 q15, q14 - vqrshrn.u16 d27, q15, #3 @ r_oq2 - - @ Filter does not set op2 or oq2, so use p2 and q2. - vbif d18, d5, d16 @ t_op2 |= p2 & ~(flat & mask) - vbif d19, d25, d16 @ t_op1 |= f_op1 & ~(flat & mask) - vbif d20, d24, d16 @ t_op0 |= f_op0 & ~(flat & mask) - vbif d21, d23, d16 @ t_oq0 |= f_oq0 & ~(flat & mask) - vbif d22, d26, d16 @ t_oq1 |= f_oq1 & ~(flat & mask) - - vbit d23, d27, d16 @ t_oq2 |= r_oq2 & (flat & mask) - vbif d23, d10, d16 @ t_oq2 |= q2 & ~(flat & mask) - - tst r7, #2 - bxne lr - - @ wide_mbfilter flat2 && flat && mask branch - vmov.u8 d16, #7 - vaddl.u8 q15, d7, d8 @ op6 = p0 + q0 - vaddl.u8 q12, d2, d3 - vaddl.u8 q13, d4, d5 - vaddl.u8 q14, d1, d6 - vmlal.u8 q15, d0, d16 @ op6 += p7 * 3 - vadd.i16 q12, q13 - vadd.i16 q15, q14 - vaddl.u8 q14, d2, d9 - vadd.i16 q15, q12 - vaddl.u8 q12, d0, d1 - vaddw.u8 q15, d1 - vaddl.u8 q13, d0, d2 - vadd.i16 q14, q15, q14 - vqrshrn.u16 d16, q15, #4 @ w_op6 - - vsub.i16 q15, q14, q12 - vaddl.u8 q14, d3, d10 - vqrshrn.u16 d24, q15, #4 @ w_op5 - - vsub.i16 q15, q13 - vaddl.u8 q13, d0, d3 - vadd.i16 q15, q14 - vaddl.u8 q14, d4, d11 - vqrshrn.u16 d25, q15, #4 @ w_op4 - - vadd.i16 q15, q14 - vaddl.u8 q14, d0, d4 - vsub.i16 q15, q13 - vsub.i16 q14, q15, q14 - vqrshrn.u16 d26, q15, #4 @ w_op3 - - vaddw.u8 q15, q14, d5 @ op2 += p2 - vaddl.u8 q14, d0, d5 - vaddw.u8 q15, d12 @ op2 += q4 - vbif d26, d4, d17 @ op3 |= p3 & ~(f2 & f & m) - vqrshrn.u16 d27, q15, #4 @ w_op2 - - vsub.i16 q15, q14 - vaddl.u8 q14, d0, d6 - vaddw.u8 q15, d6 @ op1 += p1 - vaddw.u8 q15, d13 @ op1 += q5 - vbif d27, d18, d17 @ op2 |= t_op2 & ~(f2 & f & m) - vqrshrn.u16 d18, q15, #4 @ w_op1 - - vsub.i16 q15, q14 - vaddl.u8 q14, d0, d7 - vaddw.u8 q15, d7 @ op0 += p0 - vaddw.u8 q15, d14 @ op0 += q6 - vbif d18, d19, d17 @ op1 |= t_op1 & ~(f2 & f & m) - vqrshrn.u16 d19, q15, #4 @ w_op0 - - vsub.i16 q15, q14 - vaddl.u8 q14, d1, d8 - vaddw.u8 q15, d8 @ oq0 += q0 - vaddw.u8 q15, d15 @ oq0 += q7 - vbif d19, d20, d17 @ op0 |= t_op0 & ~(f2 & f & m) - vqrshrn.u16 d20, q15, #4 @ w_oq0 - - vsub.i16 q15, q14 - vaddl.u8 q14, d2, 
d9 - vaddw.u8 q15, d9 @ oq1 += q1 - vaddl.u8 q4, d10, d15 - vaddw.u8 q15, d15 @ oq1 += q7 - vbif d20, d21, d17 @ oq0 |= t_oq0 & ~(f2 & f & m) - vqrshrn.u16 d21, q15, #4 @ w_oq1 - - vsub.i16 q15, q14 - vaddl.u8 q14, d3, d10 - vadd.i16 q15, q4 - vaddl.u8 q4, d11, d15 - vbif d21, d22, d17 @ oq1 |= t_oq1 & ~(f2 & f & m) - vqrshrn.u16 d22, q15, #4 @ w_oq2 - - vsub.i16 q15, q14 - vaddl.u8 q14, d4, d11 - vadd.i16 q15, q4 - vaddl.u8 q4, d12, d15 - vbif d22, d23, d17 @ oq2 |= t_oq2 & ~(f2 & f & m) - vqrshrn.u16 d23, q15, #4 @ w_oq3 - - vsub.i16 q15, q14 - vaddl.u8 q14, d5, d12 - vadd.i16 q15, q4 - vaddl.u8 q4, d13, d15 - vbif d16, d1, d17 @ op6 |= p6 & ~(f2 & f & m) - vqrshrn.u16 d1, q15, #4 @ w_oq4 - - vsub.i16 q15, q14 - vaddl.u8 q14, d6, d13 - vadd.i16 q15, q4 - vaddl.u8 q4, d14, d15 - vbif d24, d2, d17 @ op5 |= p5 & ~(f2 & f & m) - vqrshrn.u16 d2, q15, #4 @ w_oq5 - - vsub.i16 q15, q14 - vbif d25, d3, d17 @ op4 |= p4 & ~(f2 & f & m) - vadd.i16 q15, q4 - vbif d23, d11, d17 @ oq3 |= q3 & ~(f2 & f & m) - vqrshrn.u16 d3, q15, #4 @ w_oq6 - vbif d1, d12, d17 @ oq4 |= q4 & ~(f2 & f & m) - vbif d2, d13, d17 @ oq5 |= q5 & ~(f2 & f & m) - vbif d3, d14, d17 @ oq6 |= q6 & ~(f2 & f & m) - - bx lr - .size vpx_wide_mbfilter_neon, .-vpx_wide_mbfilter_neon @ ENDP @ |vpx_wide_mbfilter_neon| - - .section .note.GNU-stack,"",%progbits diff --git a/thirdparty/libvpx/vpx_dsp/arm/gas/save_reg_neon.s b/thirdparty/libvpx/vpx_dsp/arm/gas/save_reg_neon.s deleted file mode 100644 index e8852fa0d0..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/gas/save_reg_neon.s +++ /dev/null @@ -1,44 +0,0 @@ -@ This file was created from a .asm file -@ using the ads2gas.pl script. - .equ DO1STROUNDING, 0 -@ -@ Copyright (c) 2010 The WebM project authors. All Rights Reserved. -@ -@ Use of this source code is governed by a BSD-style license -@ that can be found in the LICENSE file in the root of the source -@ tree. An additional intellectual property rights grant can be found -@ in the file PATENTS. All contributing project authors may -@ be found in the AUTHORS file in the root of the source tree. -@ - - - .global vpx_push_neon - .type vpx_push_neon, function - .global vpx_pop_neon - .type vpx_pop_neon, function - - .arm - .eabi_attribute 24, 1 @Tag_ABI_align_needed - .eabi_attribute 25, 1 @Tag_ABI_align_preserved - -.text -.p2align 2 - -_vpx_push_neon: - vpx_push_neon: @ PROC - vst1.i64 {d8, d9, d10, d11}, [r0]! - vst1.i64 {d12, d13, d14, d15}, [r0]! - bx lr - - .size vpx_push_neon, .-vpx_push_neon @ ENDP - -_vpx_pop_neon: - vpx_pop_neon: @ PROC - vld1.i64 {d8, d9, d10, d11}, [r0]! - vld1.i64 {d12, d13, d14, d15}, [r0]! - bx lr - - .size vpx_pop_neon, .-vpx_pop_neon @ ENDP - - - .section .note.GNU-stack,"",%progbits diff --git a/thirdparty/libvpx/vpx_dsp/arm/gas_apple/intrapred_neon_asm.s b/thirdparty/libvpx/vpx_dsp/arm/gas_apple/intrapred_neon_asm.s deleted file mode 100644 index 1c527afcff..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/gas_apple/intrapred_neon_asm.s +++ /dev/null @@ -1,660 +0,0 @@ -@ This file was created from a .asm file -@ using the ads2gas_apple.pl script. - - .set WIDE_REFERENCE, 0 - .set ARCHITECTURE, 5 - .set DO1STROUNDING, 0 - @ - @ Copyright (c) 2014 The WebM project authors. All Rights Reserved. - @ - @ Use of this source code is governed by a BSD-style license - @ that can be found in the LICENSE file in the root of the source - @ tree. An additional intellectual property rights grant can be found - @ in the file PATENTS. 
All contributing project authors may - @ be found in the AUTHORS file in the root of the source tree. - @ - - .globl _vpx_v_predictor_4x4_neon - .globl vpx_v_predictor_4x4_neon - .globl _vpx_v_predictor_8x8_neon - .globl vpx_v_predictor_8x8_neon - .globl _vpx_v_predictor_16x16_neon - .globl vpx_v_predictor_16x16_neon - .globl _vpx_v_predictor_32x32_neon - .globl vpx_v_predictor_32x32_neon - .globl _vpx_h_predictor_4x4_neon - .globl vpx_h_predictor_4x4_neon - .globl _vpx_h_predictor_8x8_neon - .globl vpx_h_predictor_8x8_neon - .globl _vpx_h_predictor_16x16_neon - .globl vpx_h_predictor_16x16_neon - .globl _vpx_h_predictor_32x32_neon - .globl vpx_h_predictor_32x32_neon - .globl _vpx_tm_predictor_4x4_neon - .globl vpx_tm_predictor_4x4_neon - .globl _vpx_tm_predictor_8x8_neon - .globl vpx_tm_predictor_8x8_neon - .globl _vpx_tm_predictor_16x16_neon - .globl vpx_tm_predictor_16x16_neon - .globl _vpx_tm_predictor_32x32_neon - .globl vpx_tm_predictor_32x32_neon - @ ARM - @ - @ PRESERVE8 - -.text -.p2align 2 - - @void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, - @ const uint8_t *above, - @ const uint8_t *left) - @ r0 uint8_t *dst - @ r1 ptrdiff_t y_stride - @ r2 const uint8_t *above - @ r3 const uint8_t *left - -_vpx_v_predictor_4x4_neon: - vpx_v_predictor_4x4_neon: @ - vld1.32 {d0[0]}, [r2] - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d0[0]}, [r0], r1 - bx lr - @ @ |vpx_v_predictor_4x4_neon| - - @void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, - @ const uint8_t *above, - @ const uint8_t *left) - @ r0 uint8_t *dst - @ r1 ptrdiff_t y_stride - @ r2 const uint8_t *above - @ r3 const uint8_t *left - -_vpx_v_predictor_8x8_neon: - vpx_v_predictor_8x8_neon: @ - vld1.8 {d0}, [r2] - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - bx lr - @ @ |vpx_v_predictor_8x8_neon| - - @void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, - @ const uint8_t *above, - @ const uint8_t *left) - @ r0 uint8_t *dst - @ r1 ptrdiff_t y_stride - @ r2 const uint8_t *above - @ r3 const uint8_t *left - -_vpx_v_predictor_16x16_neon: - vpx_v_predictor_16x16_neon: @ - vld1.8 {q0}, [r2] - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - bx lr - @ @ |vpx_v_predictor_16x16_neon| - - @void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, - @ const uint8_t *above, - @ const uint8_t *left) - @ r0 uint8_t *dst - @ r1 ptrdiff_t y_stride - @ r2 const uint8_t *above - @ r3 const uint8_t *left - -_vpx_v_predictor_32x32_neon: - vpx_v_predictor_32x32_neon: @ - vld1.8 {q0, q1}, [r2] - mov r2, #2 -loop_v: - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - subs r2, r2, #1 - bgt loop_v - 
bx lr - @ @ |vpx_v_predictor_32x32_neon| - - @void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, - @ const uint8_t *above, - @ const uint8_t *left) - @ r0 uint8_t *dst - @ r1 ptrdiff_t y_stride - @ r2 const uint8_t *above - @ r3 const uint8_t *left - -_vpx_h_predictor_4x4_neon: - vpx_h_predictor_4x4_neon: @ - vld1.32 {d1[0]}, [r3] - vdup.8 d0, d1[0] - vst1.32 {d0[0]}, [r0], r1 - vdup.8 d0, d1[1] - vst1.32 {d0[0]}, [r0], r1 - vdup.8 d0, d1[2] - vst1.32 {d0[0]}, [r0], r1 - vdup.8 d0, d1[3] - vst1.32 {d0[0]}, [r0], r1 - bx lr - @ @ |vpx_h_predictor_4x4_neon| - - @void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, - @ const uint8_t *above, - @ const uint8_t *left) - @ r0 uint8_t *dst - @ r1 ptrdiff_t y_stride - @ r2 const uint8_t *above - @ r3 const uint8_t *left - -_vpx_h_predictor_8x8_neon: - vpx_h_predictor_8x8_neon: @ - vld1.64 {d1}, [r3] - vdup.8 d0, d1[0] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[1] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[2] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[3] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[4] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[5] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[6] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[7] - vst1.64 {d0}, [r0], r1 - bx lr - @ @ |vpx_h_predictor_8x8_neon| - - @void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, - @ const uint8_t *above, - @ const uint8_t *left) - @ r0 uint8_t *dst - @ r1 ptrdiff_t y_stride - @ r2 const uint8_t *above - @ r3 const uint8_t *left - -_vpx_h_predictor_16x16_neon: - vpx_h_predictor_16x16_neon: @ - vld1.8 {q1}, [r3] - vdup.8 q0, d2[0] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[1] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[2] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[3] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[4] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[5] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[6] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[7] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[0] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[1] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[2] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[3] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[4] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[5] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[6] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[7] - vst1.8 {q0}, [r0], r1 - bx lr - @ @ |vpx_h_predictor_16x16_neon| - - @void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, - @ const uint8_t *above, - @ const uint8_t *left) - @ r0 uint8_t *dst - @ r1 ptrdiff_t y_stride - @ r2 const uint8_t *above - @ r3 const uint8_t *left - -_vpx_h_predictor_32x32_neon: - vpx_h_predictor_32x32_neon: @ - sub r1, r1, #16 - mov r2, #2 -loop_h: - vld1.8 {q1}, [r3]! - vdup.8 q0, d2[0] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[1] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[2] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[3] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[4] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[5] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[6] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[7] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[0] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[1] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[2] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[3] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[4] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[5] - vst1.8 {q0}, [r0]! 
- vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[6] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[7] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - subs r2, r2, #1 - bgt loop_h - bx lr - @ @ |vpx_h_predictor_32x32_neon| - - @void vpx_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride, - @ const uint8_t *above, - @ const uint8_t *left) - @ r0 uint8_t *dst - @ r1 ptrdiff_t y_stride - @ r2 const uint8_t *above - @ r3 const uint8_t *left - -_vpx_tm_predictor_4x4_neon: - vpx_tm_predictor_4x4_neon: @ - @ Load ytop_left = above[-1] @ - sub r12, r2, #1 - vld1.u8 {d0[]}, [r12] - - @ Load above 4 pixels - vld1.32 {d2[0]}, [r2] - - @ Compute above - ytop_left - vsubl.u8 q3, d2, d0 - - @ Load left row by row and compute left + (above - ytop_left) - @ 1st row and 2nd row - vld1.u8 {d2[]}, [r3]! - vld1.u8 {d4[]}, [r3]! - vmovl.u8 q1, d2 - vmovl.u8 q2, d4 - vadd.s16 q1, q1, q3 - vadd.s16 q2, q2, q3 - vqmovun.s16 d0, q1 - vqmovun.s16 d1, q2 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d1[0]}, [r0], r1 - - @ 3rd row and 4th row - vld1.u8 {d2[]}, [r3]! - vld1.u8 {d4[]}, [r3] - vmovl.u8 q1, d2 - vmovl.u8 q2, d4 - vadd.s16 q1, q1, q3 - vadd.s16 q2, q2, q3 - vqmovun.s16 d0, q1 - vqmovun.s16 d1, q2 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d1[0]}, [r0], r1 - bx lr - @ @ |vpx_tm_predictor_4x4_neon| - - @void vpx_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride, - @ const uint8_t *above, - @ const uint8_t *left) - @ r0 uint8_t *dst - @ r1 ptrdiff_t y_stride - @ r2 const uint8_t *above - @ r3 const uint8_t *left - -_vpx_tm_predictor_8x8_neon: - vpx_tm_predictor_8x8_neon: @ - @ Load ytop_left = above[-1] @ - sub r12, r2, #1 - vld1.8 {d0[]}, [r12] - - @ preload 8 left - vld1.8 {d30}, [r3] - - @ Load above 8 pixels - vld1.64 {d2}, [r2] - - vmovl.u8 q10, d30 - - @ Compute above - ytop_left - vsubl.u8 q3, d2, d0 - - @ Load left row by row and compute left + (above - ytop_left) - @ 1st row and 2nd row - vdup.16 q0, d20[0] - vdup.16 q1, d20[1] - vadd.s16 q0, q3, q0 - vadd.s16 q1, q3, q1 - - @ 3rd row and 4th row - vdup.16 q8, d20[2] - vdup.16 q9, d20[3] - vadd.s16 q8, q3, q8 - vadd.s16 q9, q3, q9 - - vqmovun.s16 d0, q0 - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q8 - vqmovun.s16 d3, q9 - - vst1.64 {d0}, [r0], r1 - vst1.64 {d1}, [r0], r1 - vst1.64 {d2}, [r0], r1 - vst1.64 {d3}, [r0], r1 - - @ 5th row and 6th row - vdup.16 q0, d21[0] - vdup.16 q1, d21[1] - vadd.s16 q0, q3, q0 - vadd.s16 q1, q3, q1 - - @ 7th row and 8th row - vdup.16 q8, d21[2] - vdup.16 q9, d21[3] - vadd.s16 q8, q3, q8 - vadd.s16 q9, q3, q9 - - vqmovun.s16 d0, q0 - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q8 - vqmovun.s16 d3, q9 - - vst1.64 {d0}, [r0], r1 - vst1.64 {d1}, [r0], r1 - vst1.64 {d2}, [r0], r1 - vst1.64 {d3}, [r0], r1 - - bx lr - @ @ |vpx_tm_predictor_8x8_neon| - - @void vpx_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride, - @ const uint8_t *above, - @ const uint8_t *left) - @ r0 uint8_t *dst - @ r1 ptrdiff_t y_stride - @ r2 const uint8_t *above - @ r3 const uint8_t *left - -_vpx_tm_predictor_16x16_neon: - vpx_tm_predictor_16x16_neon: @ - @ Load ytop_left = above[-1] @ - sub r12, r2, #1 - vld1.8 {d0[]}, [r12] - - @ Load above 8 pixels - vld1.8 {q1}, [r2] - - @ preload 8 left into r12 - vld1.8 {d18}, [r3]! - - @ Compute above - ytop_left - vsubl.u8 q2, d2, d0 - vsubl.u8 q3, d3, d0 - - vmovl.u8 q10, d18 - - @ Load left row by row and compute left + (above - ytop_left) - @ Process 8 rows in each single loop and loop 2 times to process 16 rows. - mov r2, #2 - -loop_16x16_neon: - @ Process two rows. 
- vdup.16 q0, d20[0] - vdup.16 q8, d20[1] - vadd.s16 q1, q0, q2 - vadd.s16 q0, q0, q3 - vadd.s16 q11, q8, q2 - vadd.s16 q8, q8, q3 - vqmovun.s16 d2, q1 - vqmovun.s16 d3, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q8 - vdup.16 q0, d20[2] @ proload next 2 rows data - vdup.16 q8, d20[3] - vst1.64 {d2,d3}, [r0], r1 - vst1.64 {d22,d23}, [r0], r1 - - @ Process two rows. - vadd.s16 q1, q0, q2 - vadd.s16 q0, q0, q3 - vadd.s16 q11, q8, q2 - vadd.s16 q8, q8, q3 - vqmovun.s16 d2, q1 - vqmovun.s16 d3, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q8 - vdup.16 q0, d21[0] @ proload next 2 rows data - vdup.16 q8, d21[1] - vst1.64 {d2,d3}, [r0], r1 - vst1.64 {d22,d23}, [r0], r1 - - vadd.s16 q1, q0, q2 - vadd.s16 q0, q0, q3 - vadd.s16 q11, q8, q2 - vadd.s16 q8, q8, q3 - vqmovun.s16 d2, q1 - vqmovun.s16 d3, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q8 - vdup.16 q0, d21[2] @ proload next 2 rows data - vdup.16 q8, d21[3] - vst1.64 {d2,d3}, [r0], r1 - vst1.64 {d22,d23}, [r0], r1 - - - vadd.s16 q1, q0, q2 - vadd.s16 q0, q0, q3 - vadd.s16 q11, q8, q2 - vadd.s16 q8, q8, q3 - vqmovun.s16 d2, q1 - vqmovun.s16 d3, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q8 - vld1.8 {d18}, [r3]! @ preload 8 left into r12 - vmovl.u8 q10, d18 - vst1.64 {d2,d3}, [r0], r1 - vst1.64 {d22,d23}, [r0], r1 - - subs r2, r2, #1 - bgt loop_16x16_neon - - bx lr - @ @ |vpx_tm_predictor_16x16_neon| - - @void vpx_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride, - @ const uint8_t *above, - @ const uint8_t *left) - @ r0 uint8_t *dst - @ r1 ptrdiff_t y_stride - @ r2 const uint8_t *above - @ r3 const uint8_t *left - -_vpx_tm_predictor_32x32_neon: - vpx_tm_predictor_32x32_neon: @ - @ Load ytop_left = above[-1] @ - sub r12, r2, #1 - vld1.8 {d0[]}, [r12] - - @ Load above 32 pixels - vld1.8 {q1}, [r2]! - vld1.8 {q2}, [r2] - - @ preload 8 left pixels - vld1.8 {d26}, [r3]! - - @ Compute above - ytop_left - vsubl.u8 q8, d2, d0 - vsubl.u8 q9, d3, d0 - vsubl.u8 q10, d4, d0 - vsubl.u8 q11, d5, d0 - - vmovl.u8 q3, d26 - - @ Load left row by row and compute left + (above - ytop_left) - @ Process 8 rows in each single loop and loop 4 times to process 32 rows. - mov r2, #4 - -loop_32x32_neon: - @ Process two rows. - vdup.16 q0, d6[0] - vdup.16 q2, d6[1] - vadd.s16 q12, q0, q8 - vadd.s16 q13, q0, q9 - vadd.s16 q14, q0, q10 - vadd.s16 q15, q0, q11 - vqmovun.s16 d0, q12 - vqmovun.s16 d1, q13 - vadd.s16 q12, q2, q8 - vadd.s16 q13, q2, q9 - vqmovun.s16 d2, q14 - vqmovun.s16 d3, q15 - vadd.s16 q14, q2, q10 - vadd.s16 q15, q2, q11 - vst1.64 {d0-d3}, [r0], r1 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vqmovun.s16 d26, q14 - vqmovun.s16 d27, q15 - vdup.16 q1, d6[2] - vdup.16 q2, d6[3] - vst1.64 {d24-d27}, [r0], r1 - - @ Process two rows. - vadd.s16 q12, q1, q8 - vadd.s16 q13, q1, q9 - vadd.s16 q14, q1, q10 - vadd.s16 q15, q1, q11 - vqmovun.s16 d0, q12 - vqmovun.s16 d1, q13 - vadd.s16 q12, q2, q8 - vadd.s16 q13, q2, q9 - vqmovun.s16 d2, q14 - vqmovun.s16 d3, q15 - vadd.s16 q14, q2, q10 - vadd.s16 q15, q2, q11 - vst1.64 {d0-d3}, [r0], r1 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vqmovun.s16 d26, q14 - vqmovun.s16 d27, q15 - vdup.16 q0, d7[0] - vdup.16 q2, d7[1] - vst1.64 {d24-d27}, [r0], r1 - - @ Process two rows. 
- vadd.s16 q12, q0, q8 - vadd.s16 q13, q0, q9 - vadd.s16 q14, q0, q10 - vadd.s16 q15, q0, q11 - vqmovun.s16 d0, q12 - vqmovun.s16 d1, q13 - vadd.s16 q12, q2, q8 - vadd.s16 q13, q2, q9 - vqmovun.s16 d2, q14 - vqmovun.s16 d3, q15 - vadd.s16 q14, q2, q10 - vadd.s16 q15, q2, q11 - vst1.64 {d0-d3}, [r0], r1 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vqmovun.s16 d26, q14 - vqmovun.s16 d27, q15 - vdup.16 q0, d7[2] - vdup.16 q2, d7[3] - vst1.64 {d24-d27}, [r0], r1 - - @ Process two rows. - vadd.s16 q12, q0, q8 - vadd.s16 q13, q0, q9 - vadd.s16 q14, q0, q10 - vadd.s16 q15, q0, q11 - vqmovun.s16 d0, q12 - vqmovun.s16 d1, q13 - vadd.s16 q12, q2, q8 - vadd.s16 q13, q2, q9 - vqmovun.s16 d2, q14 - vqmovun.s16 d3, q15 - vadd.s16 q14, q2, q10 - vadd.s16 q15, q2, q11 - vst1.64 {d0-d3}, [r0], r1 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vld1.8 {d0}, [r3]! @ preload 8 left pixels - vqmovun.s16 d26, q14 - vqmovun.s16 d27, q15 - vmovl.u8 q3, d0 - vst1.64 {d24-d27}, [r0], r1 - - subs r2, r2, #1 - bgt loop_32x32_neon - - bx lr - @ @ |vpx_tm_predictor_32x32_neon| - diff --git a/thirdparty/libvpx/vpx_dsp/arm/gas_apple/loopfilter_mb_neon.s b/thirdparty/libvpx/vpx_dsp/arm/gas_apple/loopfilter_mb_neon.s deleted file mode 100644 index 69f7e5207e..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/gas_apple/loopfilter_mb_neon.s +++ /dev/null @@ -1,649 +0,0 @@ -@ This file was created from a .asm file -@ using the ads2gas_apple.pl script. - - .set WIDE_REFERENCE, 0 - .set ARCHITECTURE, 5 - .set DO1STROUNDING, 0 - @ - @ Copyright (c) 2013 The WebM project authors. All Rights Reserved. - @ - @ Use of this source code is governed by a BSD-style license - @ that can be found in the LICENSE file in the root of the source - @ tree. An additional intellectual property rights grant can be found - @ in the file PATENTS. All contributing project authors may - @ be found in the AUTHORS file in the root of the source tree. - @ - - .globl _vpx_lpf_horizontal_edge_8_neon - .globl vpx_lpf_horizontal_edge_8_neon - .globl _vpx_lpf_horizontal_edge_16_neon - .globl vpx_lpf_horizontal_edge_16_neon - .globl _vpx_lpf_vertical_16_neon - .globl vpx_lpf_vertical_16_neon - @ ARM - -.text -.p2align 2 - - @ void mb_lpf_horizontal_edge(uint8_t *s, int p, - @ const uint8_t *blimit, - @ const uint8_t *limit, - @ const uint8_t *thresh, - @ int count) - @ r0 uint8_t *s, - @ r1 int p, /* pitch */ - @ r2 const uint8_t *blimit, - @ r3 const uint8_t *limit, - @ sp const uint8_t *thresh, - @ r12 int count -_mb_lpf_horizontal_edge: - mb_lpf_horizontal_edge: @ - push {r4-r8, lr} - vpush {d8-d15} - ldr r4, [sp, #88] @ load thresh - -h_count: - vld1.8 {d16[]}, [r2] @ load *blimit - vld1.8 {d17[]}, [r3] @ load *limit - vld1.8 {d18[]}, [r4] @ load *thresh - - sub r8, r0, r1, lsl #3 @ move src pointer down by 8 lines - - vld1.u8 {d0}, [r8,:64], r1 @ p7 - vld1.u8 {d1}, [r8,:64], r1 @ p6 - vld1.u8 {d2}, [r8,:64], r1 @ p5 - vld1.u8 {d3}, [r8,:64], r1 @ p4 - vld1.u8 {d4}, [r8,:64], r1 @ p3 - vld1.u8 {d5}, [r8,:64], r1 @ p2 - vld1.u8 {d6}, [r8,:64], r1 @ p1 - vld1.u8 {d7}, [r8,:64], r1 @ p0 - vld1.u8 {d8}, [r8,:64], r1 @ q0 - vld1.u8 {d9}, [r8,:64], r1 @ q1 - vld1.u8 {d10}, [r8,:64], r1 @ q2 - vld1.u8 {d11}, [r8,:64], r1 @ q3 - vld1.u8 {d12}, [r8,:64], r1 @ q4 - vld1.u8 {d13}, [r8,:64], r1 @ q5 - vld1.u8 {d14}, [r8,:64], r1 @ q6 - vld1.u8 {d15}, [r8,:64], r1 @ q7 - - bl vpx_wide_mbfilter_neon - - tst r7, #1 - beq h_mbfilter - - @ flat && mask were not set for any of the channels. Just store the values - @ from filter. 
- sub r8, r0, r1, lsl #1 - - vst1.u8 {d25}, [r8,:64], r1 @ store op1 - vst1.u8 {d24}, [r8,:64], r1 @ store op0 - vst1.u8 {d23}, [r8,:64], r1 @ store oq0 - vst1.u8 {d26}, [r8,:64], r1 @ store oq1 - - b h_next - -h_mbfilter: - tst r7, #2 - beq h_wide_mbfilter - - @ flat2 was not set for any of the channels. Just store the values from - @ mbfilter. - sub r8, r0, r1, lsl #1 - sub r8, r8, r1 - - vst1.u8 {d18}, [r8,:64], r1 @ store op2 - vst1.u8 {d19}, [r8,:64], r1 @ store op1 - vst1.u8 {d20}, [r8,:64], r1 @ store op0 - vst1.u8 {d21}, [r8,:64], r1 @ store oq0 - vst1.u8 {d22}, [r8,:64], r1 @ store oq1 - vst1.u8 {d23}, [r8,:64], r1 @ store oq2 - - b h_next - -h_wide_mbfilter: - sub r8, r0, r1, lsl #3 - add r8, r8, r1 - - vst1.u8 {d16}, [r8,:64], r1 @ store op6 - vst1.u8 {d24}, [r8,:64], r1 @ store op5 - vst1.u8 {d25}, [r8,:64], r1 @ store op4 - vst1.u8 {d26}, [r8,:64], r1 @ store op3 - vst1.u8 {d27}, [r8,:64], r1 @ store op2 - vst1.u8 {d18}, [r8,:64], r1 @ store op1 - vst1.u8 {d19}, [r8,:64], r1 @ store op0 - vst1.u8 {d20}, [r8,:64], r1 @ store oq0 - vst1.u8 {d21}, [r8,:64], r1 @ store oq1 - vst1.u8 {d22}, [r8,:64], r1 @ store oq2 - vst1.u8 {d23}, [r8,:64], r1 @ store oq3 - vst1.u8 {d1}, [r8,:64], r1 @ store oq4 - vst1.u8 {d2}, [r8,:64], r1 @ store oq5 - vst1.u8 {d3}, [r8,:64], r1 @ store oq6 - -h_next: - add r0, r0, #8 - subs r12, r12, #1 - bne h_count - - vpop {d8-d15} - pop {r4-r8, pc} - - @ @ |mb_lpf_horizontal_edge| - - @ void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch, - @ const uint8_t *blimit, - @ const uint8_t *limit, - @ const uint8_t *thresh) - @ r0 uint8_t *s, - @ r1 int pitch, - @ r2 const uint8_t *blimit, - @ r3 const uint8_t *limit, - @ sp const uint8_t *thresh -_vpx_lpf_horizontal_edge_8_neon: - vpx_lpf_horizontal_edge_8_neon: @ - mov r12, #1 - b mb_lpf_horizontal_edge - @ @ |vpx_lpf_horizontal_edge_8_neon| - - @ void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch, - @ const uint8_t *blimit, - @ const uint8_t *limit, - @ const uint8_t *thresh) - @ r0 uint8_t *s, - @ r1 int pitch, - @ r2 const uint8_t *blimit, - @ r3 const uint8_t *limit, - @ sp const uint8_t *thresh -_vpx_lpf_horizontal_edge_16_neon: - vpx_lpf_horizontal_edge_16_neon: @ - mov r12, #2 - b mb_lpf_horizontal_edge - @ @ |vpx_lpf_horizontal_edge_16_neon| - - @ void vpx_lpf_vertical_16_neon(uint8_t *s, int p, - @ const uint8_t *blimit, - @ const uint8_t *limit, - @ const uint8_t *thresh) - @ r0 uint8_t *s, - @ r1 int p, /* pitch */ - @ r2 const uint8_t *blimit, - @ r3 const uint8_t *limit, - @ sp const uint8_t *thresh, -_vpx_lpf_vertical_16_neon: - vpx_lpf_vertical_16_neon: @ - push {r4-r8, lr} - vpush {d8-d15} - ldr r4, [sp, #88] @ load thresh - - vld1.8 {d16[]}, [r2] @ load *blimit - vld1.8 {d17[]}, [r3] @ load *limit - vld1.8 {d18[]}, [r4] @ load *thresh - - sub r8, r0, #8 - - vld1.8 {d0}, [r8,:64], r1 - vld1.8 {d8}, [r0,:64], r1 - vld1.8 {d1}, [r8,:64], r1 - vld1.8 {d9}, [r0,:64], r1 - vld1.8 {d2}, [r8,:64], r1 - vld1.8 {d10}, [r0,:64], r1 - vld1.8 {d3}, [r8,:64], r1 - vld1.8 {d11}, [r0,:64], r1 - vld1.8 {d4}, [r8,:64], r1 - vld1.8 {d12}, [r0,:64], r1 - vld1.8 {d5}, [r8,:64], r1 - vld1.8 {d13}, [r0,:64], r1 - vld1.8 {d6}, [r8,:64], r1 - vld1.8 {d14}, [r0,:64], r1 - vld1.8 {d7}, [r8,:64], r1 - vld1.8 {d15}, [r0,:64], r1 - - sub r0, r0, r1, lsl #3 - - vtrn.32 q0, q2 - vtrn.32 q1, q3 - vtrn.32 q4, q6 - vtrn.32 q5, q7 - - vtrn.16 q0, q1 - vtrn.16 q2, q3 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - - vtrn.8 d0, d1 - vtrn.8 d2, d3 - vtrn.8 d4, d5 - vtrn.8 d6, d7 - - vtrn.8 d8, d9 - vtrn.8 d10, d11 - vtrn.8 d12, d13 
- vtrn.8 d14, d15 - - bl vpx_wide_mbfilter_neon - - tst r7, #1 - beq v_mbfilter - - @ flat && mask were not set for any of the channels. Just store the values - @ from filter. - sub r8, r0, #2 - - vswp d23, d25 - - vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r8], r1 - vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r8], r1 - vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r8], r1 - vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r8], r1 - vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r8], r1 - vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r8], r1 - vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r8], r1 - vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r8], r1 - - b v_end - -v_mbfilter: - tst r7, #2 - beq v_wide_mbfilter - - @ flat2 was not set for any of the channels. Just store the values from - @ mbfilter. - sub r8, r0, #3 - - vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1 - vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1 - vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1 - vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1 - vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1 - vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1 - vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1 - vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1 - vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1 - vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1 - vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1 - vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1 - vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1 - vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1 - vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1 - vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1 - - b v_end - -v_wide_mbfilter: - sub r8, r0, #8 - - vtrn.32 d0, d26 - vtrn.32 d16, d27 - vtrn.32 d24, d18 - vtrn.32 d25, d19 - - vtrn.16 d0, d24 - vtrn.16 d16, d25 - vtrn.16 d26, d18 - vtrn.16 d27, d19 - - vtrn.8 d0, d16 - vtrn.8 d24, d25 - vtrn.8 d26, d27 - vtrn.8 d18, d19 - - vtrn.32 d20, d1 - vtrn.32 d21, d2 - vtrn.32 d22, d3 - vtrn.32 d23, d15 - - vtrn.16 d20, d22 - vtrn.16 d21, d23 - vtrn.16 d1, d3 - vtrn.16 d2, d15 - - vtrn.8 d20, d21 - vtrn.8 d22, d23 - vtrn.8 d1, d2 - vtrn.8 d3, d15 - - vst1.8 {d0}, [r8,:64], r1 - vst1.8 {d20}, [r0,:64], r1 - vst1.8 {d16}, [r8,:64], r1 - vst1.8 {d21}, [r0,:64], r1 - vst1.8 {d24}, [r8,:64], r1 - vst1.8 {d22}, [r0,:64], r1 - vst1.8 {d25}, [r8,:64], r1 - vst1.8 {d23}, [r0,:64], r1 - vst1.8 {d26}, [r8,:64], r1 - vst1.8 {d1}, [r0,:64], r1 - vst1.8 {d27}, [r8,:64], r1 - vst1.8 {d2}, [r0,:64], r1 - vst1.8 {d18}, [r8,:64], r1 - vst1.8 {d3}, [r0,:64], r1 - vst1.8 {d19}, [r8,:64], r1 - vst1.8 {d15}, [r0,:64], r1 - -v_end: - vpop {d8-d15} - pop {r4-r8, pc} - - @ @ |vpx_lpf_vertical_16_neon| - - @ void vpx_wide_mbfilter_neon() @ - @ This is a helper function for the loopfilters. The invidual functions do the - @ necessary load, transpose (if necessary) and store. 
- @ - @ r0-r3 PRESERVE - @ d16 blimit - @ d17 limit - @ d18 thresh - @ d0 p7 - @ d1 p6 - @ d2 p5 - @ d3 p4 - @ d4 p3 - @ d5 p2 - @ d6 p1 - @ d7 p0 - @ d8 q0 - @ d9 q1 - @ d10 q2 - @ d11 q3 - @ d12 q4 - @ d13 q5 - @ d14 q6 - @ d15 q7 -_vpx_wide_mbfilter_neon: - vpx_wide_mbfilter_neon: @ - mov r7, #0 - - @ filter_mask - vabd.u8 d19, d4, d5 @ abs(p3 - p2) - vabd.u8 d20, d5, d6 @ abs(p2 - p1) - vabd.u8 d21, d6, d7 @ abs(p1 - p0) - vabd.u8 d22, d9, d8 @ abs(q1 - q0) - vabd.u8 d23, d10, d9 @ abs(q2 - q1) - vabd.u8 d24, d11, d10 @ abs(q3 - q2) - - @ only compare the largest value to limit - vmax.u8 d19, d19, d20 @ max(abs(p3 - p2), abs(p2 - p1)) - vmax.u8 d20, d21, d22 @ max(abs(p1 - p0), abs(q1 - q0)) - vmax.u8 d23, d23, d24 @ max(abs(q2 - q1), abs(q3 - q2)) - vmax.u8 d19, d19, d20 - - vabd.u8 d24, d7, d8 @ abs(p0 - q0) - - vmax.u8 d19, d19, d23 - - vabd.u8 d23, d6, d9 @ a = abs(p1 - q1) - vqadd.u8 d24, d24, d24 @ b = abs(p0 - q0) * 2 - - @ abs () > limit - vcge.u8 d19, d17, d19 - - @ flatmask4 - vabd.u8 d25, d7, d5 @ abs(p0 - p2) - vabd.u8 d26, d8, d10 @ abs(q0 - q2) - vabd.u8 d27, d4, d7 @ abs(p3 - p0) - vabd.u8 d28, d11, d8 @ abs(q3 - q0) - - @ only compare the largest value to thresh - vmax.u8 d25, d25, d26 @ max(abs(p0 - p2), abs(q0 - q2)) - vmax.u8 d26, d27, d28 @ max(abs(p3 - p0), abs(q3 - q0)) - vmax.u8 d25, d25, d26 - vmax.u8 d20, d20, d25 - - vshr.u8 d23, d23, #1 @ a = a / 2 - vqadd.u8 d24, d24, d23 @ a = b + a - - vmov.u8 d30, #1 - vcge.u8 d24, d16, d24 @ (a > blimit * 2 + limit) * -1 - - vcge.u8 d20, d30, d20 @ flat - - vand d19, d19, d24 @ mask - - @ hevmask - vcgt.u8 d21, d21, d18 @ (abs(p1 - p0) > thresh)*-1 - vcgt.u8 d22, d22, d18 @ (abs(q1 - q0) > thresh)*-1 - vorr d21, d21, d22 @ hev - - vand d16, d20, d19 @ flat && mask - vmov r5, r6, d16 - - @ flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7) - vabd.u8 d22, d3, d7 @ abs(p4 - p0) - vabd.u8 d23, d12, d8 @ abs(q4 - q0) - vabd.u8 d24, d7, d2 @ abs(p0 - p5) - vabd.u8 d25, d8, d13 @ abs(q0 - q5) - vabd.u8 d26, d1, d7 @ abs(p6 - p0) - vabd.u8 d27, d14, d8 @ abs(q6 - q0) - vabd.u8 d28, d0, d7 @ abs(p7 - p0) - vabd.u8 d29, d15, d8 @ abs(q7 - q0) - - @ only compare the largest value to thresh - vmax.u8 d22, d22, d23 @ max(abs(p4 - p0), abs(q4 - q0)) - vmax.u8 d23, d24, d25 @ max(abs(p0 - p5), abs(q0 - q5)) - vmax.u8 d24, d26, d27 @ max(abs(p6 - p0), abs(q6 - q0)) - vmax.u8 d25, d28, d29 @ max(abs(p7 - p0), abs(q7 - q0)) - - vmax.u8 d26, d22, d23 - vmax.u8 d27, d24, d25 - vmax.u8 d23, d26, d27 - - vcge.u8 d18, d30, d23 @ flat2 - - vmov.u8 d22, #0x80 - - orrs r5, r5, r6 @ Check for 0 - orreq r7, r7, #1 @ Only do filter branch - - vand d17, d18, d16 @ flat2 && flat && mask - vmov r5, r6, d17 - - @ mbfilter() function - - @ filter() function - @ convert to signed - veor d23, d8, d22 @ qs0 - veor d24, d7, d22 @ ps0 - veor d25, d6, d22 @ ps1 - veor d26, d9, d22 @ qs1 - - vmov.u8 d27, #3 - - vsub.s8 d28, d23, d24 @ ( qs0 - ps0) - vqsub.s8 d29, d25, d26 @ filter = clamp(ps1-qs1) - vmull.s8 q15, d28, d27 @ 3 * ( qs0 - ps0) - vand d29, d29, d21 @ filter &= hev - vaddw.s8 q15, q15, d29 @ filter + 3 * (qs0 - ps0) - vmov.u8 d29, #4 - - @ filter = clamp(filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d28, q15 - - vand d28, d28, d19 @ filter &= mask - - vqadd.s8 d30, d28, d27 @ filter2 = clamp(filter+3) - vqadd.s8 d29, d28, d29 @ filter1 = clamp(filter+4) - vshr.s8 d30, d30, #3 @ filter2 >>= 3 - vshr.s8 d29, d29, #3 @ filter1 >>= 3 - - - vqadd.s8 d24, d24, d30 @ op0 = clamp(ps0 + filter2) - vqsub.s8 d23, d23, d29 @ oq0 = clamp(qs0 - filter1) - - @ outer 
tap adjustments: ++filter1 >> 1 - vrshr.s8 d29, d29, #1 - vbic d29, d29, d21 @ filter &= ~hev - - vqadd.s8 d25, d25, d29 @ op1 = clamp(ps1 + filter) - vqsub.s8 d26, d26, d29 @ oq1 = clamp(qs1 - filter) - - veor d24, d24, d22 @ *f_op0 = u^0x80 - veor d23, d23, d22 @ *f_oq0 = u^0x80 - veor d25, d25, d22 @ *f_op1 = u^0x80 - veor d26, d26, d22 @ *f_oq1 = u^0x80 - - tst r7, #1 - bxne lr - - orrs r5, r5, r6 @ Check for 0 - orreq r7, r7, #2 @ Only do mbfilter branch - - @ mbfilter flat && mask branch - @ TODO(fgalligan): Can I decrease the cycles shifting to consective d's - @ and using vibt on the q's? - vmov.u8 d29, #2 - vaddl.u8 q15, d7, d8 @ op2 = p0 + q0 - vmlal.u8 q15, d4, d27 @ op2 = p0 + q0 + p3 * 3 - vmlal.u8 q15, d5, d29 @ op2 = p0 + q0 + p3 * 3 + p2 * 2 - vaddl.u8 q10, d4, d5 - vaddw.u8 q15, d6 @ op2=p1 + p0 + q0 + p3 * 3 + p2 *2 - vaddl.u8 q14, d6, d9 - vqrshrn.u16 d18, q15, #3 @ r_op2 - - vsub.i16 q15, q10 - vaddl.u8 q10, d4, d6 - vadd.i16 q15, q14 - vaddl.u8 q14, d7, d10 - vqrshrn.u16 d19, q15, #3 @ r_op1 - - vsub.i16 q15, q10 - vadd.i16 q15, q14 - vaddl.u8 q14, d8, d11 - vqrshrn.u16 d20, q15, #3 @ r_op0 - - vsubw.u8 q15, d4 @ oq0 = op0 - p3 - vsubw.u8 q15, d7 @ oq0 -= p0 - vadd.i16 q15, q14 - vaddl.u8 q14, d9, d11 - vqrshrn.u16 d21, q15, #3 @ r_oq0 - - vsubw.u8 q15, d5 @ oq1 = oq0 - p2 - vsubw.u8 q15, d8 @ oq1 -= q0 - vadd.i16 q15, q14 - vaddl.u8 q14, d10, d11 - vqrshrn.u16 d22, q15, #3 @ r_oq1 - - vsubw.u8 q15, d6 @ oq2 = oq0 - p1 - vsubw.u8 q15, d9 @ oq2 -= q1 - vadd.i16 q15, q14 - vqrshrn.u16 d27, q15, #3 @ r_oq2 - - @ Filter does not set op2 or oq2, so use p2 and q2. - vbif d18, d5, d16 @ t_op2 |= p2 & ~(flat & mask) - vbif d19, d25, d16 @ t_op1 |= f_op1 & ~(flat & mask) - vbif d20, d24, d16 @ t_op0 |= f_op0 & ~(flat & mask) - vbif d21, d23, d16 @ t_oq0 |= f_oq0 & ~(flat & mask) - vbif d22, d26, d16 @ t_oq1 |= f_oq1 & ~(flat & mask) - - vbit d23, d27, d16 @ t_oq2 |= r_oq2 & (flat & mask) - vbif d23, d10, d16 @ t_oq2 |= q2 & ~(flat & mask) - - tst r7, #2 - bxne lr - - @ wide_mbfilter flat2 && flat && mask branch - vmov.u8 d16, #7 - vaddl.u8 q15, d7, d8 @ op6 = p0 + q0 - vaddl.u8 q12, d2, d3 - vaddl.u8 q13, d4, d5 - vaddl.u8 q14, d1, d6 - vmlal.u8 q15, d0, d16 @ op6 += p7 * 3 - vadd.i16 q12, q13 - vadd.i16 q15, q14 - vaddl.u8 q14, d2, d9 - vadd.i16 q15, q12 - vaddl.u8 q12, d0, d1 - vaddw.u8 q15, d1 - vaddl.u8 q13, d0, d2 - vadd.i16 q14, q15, q14 - vqrshrn.u16 d16, q15, #4 @ w_op6 - - vsub.i16 q15, q14, q12 - vaddl.u8 q14, d3, d10 - vqrshrn.u16 d24, q15, #4 @ w_op5 - - vsub.i16 q15, q13 - vaddl.u8 q13, d0, d3 - vadd.i16 q15, q14 - vaddl.u8 q14, d4, d11 - vqrshrn.u16 d25, q15, #4 @ w_op4 - - vadd.i16 q15, q14 - vaddl.u8 q14, d0, d4 - vsub.i16 q15, q13 - vsub.i16 q14, q15, q14 - vqrshrn.u16 d26, q15, #4 @ w_op3 - - vaddw.u8 q15, q14, d5 @ op2 += p2 - vaddl.u8 q14, d0, d5 - vaddw.u8 q15, d12 @ op2 += q4 - vbif d26, d4, d17 @ op3 |= p3 & ~(f2 & f & m) - vqrshrn.u16 d27, q15, #4 @ w_op2 - - vsub.i16 q15, q14 - vaddl.u8 q14, d0, d6 - vaddw.u8 q15, d6 @ op1 += p1 - vaddw.u8 q15, d13 @ op1 += q5 - vbif d27, d18, d17 @ op2 |= t_op2 & ~(f2 & f & m) - vqrshrn.u16 d18, q15, #4 @ w_op1 - - vsub.i16 q15, q14 - vaddl.u8 q14, d0, d7 - vaddw.u8 q15, d7 @ op0 += p0 - vaddw.u8 q15, d14 @ op0 += q6 - vbif d18, d19, d17 @ op1 |= t_op1 & ~(f2 & f & m) - vqrshrn.u16 d19, q15, #4 @ w_op0 - - vsub.i16 q15, q14 - vaddl.u8 q14, d1, d8 - vaddw.u8 q15, d8 @ oq0 += q0 - vaddw.u8 q15, d15 @ oq0 += q7 - vbif d19, d20, d17 @ op0 |= t_op0 & ~(f2 & f & m) - vqrshrn.u16 d20, q15, #4 @ w_oq0 - - vsub.i16 q15, q14 - 
vaddl.u8 q14, d2, d9 - vaddw.u8 q15, d9 @ oq1 += q1 - vaddl.u8 q4, d10, d15 - vaddw.u8 q15, d15 @ oq1 += q7 - vbif d20, d21, d17 @ oq0 |= t_oq0 & ~(f2 & f & m) - vqrshrn.u16 d21, q15, #4 @ w_oq1 - - vsub.i16 q15, q14 - vaddl.u8 q14, d3, d10 - vadd.i16 q15, q4 - vaddl.u8 q4, d11, d15 - vbif d21, d22, d17 @ oq1 |= t_oq1 & ~(f2 & f & m) - vqrshrn.u16 d22, q15, #4 @ w_oq2 - - vsub.i16 q15, q14 - vaddl.u8 q14, d4, d11 - vadd.i16 q15, q4 - vaddl.u8 q4, d12, d15 - vbif d22, d23, d17 @ oq2 |= t_oq2 & ~(f2 & f & m) - vqrshrn.u16 d23, q15, #4 @ w_oq3 - - vsub.i16 q15, q14 - vaddl.u8 q14, d5, d12 - vadd.i16 q15, q4 - vaddl.u8 q4, d13, d15 - vbif d16, d1, d17 @ op6 |= p6 & ~(f2 & f & m) - vqrshrn.u16 d1, q15, #4 @ w_oq4 - - vsub.i16 q15, q14 - vaddl.u8 q14, d6, d13 - vadd.i16 q15, q4 - vaddl.u8 q4, d14, d15 - vbif d24, d2, d17 @ op5 |= p5 & ~(f2 & f & m) - vqrshrn.u16 d2, q15, #4 @ w_oq5 - - vsub.i16 q15, q14 - vbif d25, d3, d17 @ op4 |= p4 & ~(f2 & f & m) - vadd.i16 q15, q4 - vbif d23, d11, d17 @ oq3 |= q3 & ~(f2 & f & m) - vqrshrn.u16 d3, q15, #4 @ w_oq6 - vbif d1, d12, d17 @ oq4 |= q4 & ~(f2 & f & m) - vbif d2, d13, d17 @ oq5 |= q5 & ~(f2 & f & m) - vbif d3, d14, d17 @ oq6 |= q6 & ~(f2 & f & m) - - bx lr - @ @ |vpx_wide_mbfilter_neon| - diff --git a/thirdparty/libvpx/vpx_dsp/arm/gas_apple/save_reg_neon.s b/thirdparty/libvpx/vpx_dsp/arm/gas_apple/save_reg_neon.s deleted file mode 100644 index f322b698b4..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/gas_apple/save_reg_neon.s +++ /dev/null @@ -1,46 +0,0 @@ -@ This file was created from a .asm file -@ using the ads2gas_apple.pl script. - - .set WIDE_REFERENCE, 0 - .set ARCHITECTURE, 5 - .set DO1STROUNDING, 0 - @ - @ Copyright (c) 2010 The WebM project authors. All Rights Reserved. - @ - @ Use of this source code is governed by a BSD-style license - @ that can be found in the LICENSE file in the root of the source - @ tree. An additional intellectual property rights grant can be found - @ in the file PATENTS. All contributing project authors may - @ be found in the AUTHORS file in the root of the source tree. - @ - - - .globl _vpx_push_neon - .globl vpx_push_neon - .globl _vpx_pop_neon - .globl vpx_pop_neon - - @ ARM - @ - @ PRESERVE8 - -.text -.p2align 2 - -_vpx_push_neon: - vpx_push_neon: @ - vst1.i64 {d8, d9, d10, d11}, [r0]! - vst1.i64 {d12, d13, d14, d15}, [r0]! - bx lr - - @ - -_vpx_pop_neon: - vpx_pop_neon: @ - vld1.i64 {d8, d9, d10, d11}, [r0]! - vld1.i64 {d12, d13, d14, d15}, [r0]! - bx lr - - @ - - diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c deleted file mode 100644 index f734e48027..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <arm_neon.h> - -#include "vpx_dsp/inv_txfm.h" -#include "vpx_ports/mem.h" - -void vpx_idct16x16_1_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8x8_t d2u8, d3u8, d30u8, d31u8; - uint64x1_t d2u64, d3u64, d4u64, d5u64; - uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, j, a1, cospi_16_64 = 11585; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 6); - - q0s16 = vdupq_n_s16(a1); - q0u16 = vreinterpretq_u16_s16(q0s16); - - for (d1 = d2 = dest, i = 0; i < 4; i++) { - for (j = 0; j < 2; j++) { - d2u64 = vld1_u64((const uint64_t *)d1); - d3u64 = vld1_u64((const uint64_t *)(d1 + 8)); - d1 += dest_stride; - d4u64 = vld1_u64((const uint64_t *)d1); - d5u64 = vld1_u64((const uint64_t *)(d1 + 8)); - d1 += dest_stride; - - q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); - q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); - q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); - q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); - - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); - vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8)); - d2 += dest_stride; - } - } - return; -} diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct16x16_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct16x16_add_neon.c deleted file mode 100644 index 651ebb21f9..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/idct16x16_add_neon.c +++ /dev/null @@ -1,1317 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <arm_neon.h> - -#include "./vpx_config.h" -#include "vpx_dsp/txfm_common.h" - -static INLINE void TRANSPOSE8X8( - int16x8_t *q8s16, - int16x8_t *q9s16, - int16x8_t *q10s16, - int16x8_t *q11s16, - int16x8_t *q12s16, - int16x8_t *q13s16, - int16x8_t *q14s16, - int16x8_t *q15s16) { - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; - int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 - *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 - *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 - *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 - *q12s16 = vcombine_s16(d17s16, d25s16); - *q13s16 = vcombine_s16(d19s16, d27s16); - *q14s16 = vcombine_s16(d21s16, d29s16); - *q15s16 = vcombine_s16(d23s16, d31s16); - - q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16), - vreinterpretq_s32_s16(*q10s16)); - q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16), - vreinterpretq_s32_s16(*q11s16)); - q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16), - vreinterpretq_s32_s16(*q14s16)); - q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16), - vreinterpretq_s32_s16(*q15s16)); - - q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 - vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 - q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 - vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 - q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 - vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 - q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 - vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 - - *q8s16 = q0x2s16.val[0]; - *q9s16 = q0x2s16.val[1]; - *q10s16 = q1x2s16.val[0]; - *q11s16 = q1x2s16.val[1]; - *q12s16 = q2x2s16.val[0]; - *q13s16 = q2x2s16.val[1]; - *q14s16 = q3x2s16.val[0]; - *q15s16 = q3x2s16.val[1]; - return; -} - -void vpx_idct16x16_256_add_neon_pass1( - int16_t *in, - int16_t *out, - int output_stride) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(in); - q8s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q9s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q10s16 = q0x2s16.val[0]; - in += 
16; - q0x2s16 = vld2q_s16(in); - q11s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q12s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q13s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q14s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q15s16 = q0x2s16.val[0]; - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); - - // stage 3 - d0s16 = vdup_n_s16(cospi_28_64); - d1s16 = vdup_n_s16(cospi_4_64); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d18s16, d1s16); - q6s32 = vmull_s16(d19s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlal_s16(q5s32, d30s16, d0s16); - q6s32 = vmlal_s16(q6s32, d31s16, d0s16); - - d2s16 = vdup_n_s16(cospi_12_64); - d3s16 = vdup_n_s16(cospi_20_64); - - d8s16 = vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q5s32, 14); - d15s16 = vqrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - q2s32 = vmull_s16(d26s16, d2s16); - q3s32 = vmull_s16(d27s16, d2s16); - q9s32 = vmull_s16(d26s16, d3s16); - q15s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlsl_s16(q2s32, d22s16, d3s16); - q3s32 = vmlsl_s16(q3s32, d23s16, d3s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q15s32 = vmlal_s16(q15s32, d23s16, d2s16); - - d10s16 = vqrshrn_n_s32(q2s32, 14); - d11s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q15s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 4 - d30s16 = vdup_n_s16(cospi_16_64); - - q2s32 = vmull_s16(d16s16, d30s16); - q11s32 = vmull_s16(d17s16, d30s16); - q0s32 = vmull_s16(d24s16, d30s16); - q1s32 = vmull_s16(d25s16, d30s16); - - d30s16 = vdup_n_s16(cospi_24_64); - d31s16 = vdup_n_s16(cospi_8_64); - - q3s32 = vaddq_s32(q2s32, q0s32); - q12s32 = vaddq_s32(q11s32, q1s32); - q13s32 = vsubq_s32(q2s32, q0s32); - q1s32 = vsubq_s32(q11s32, q1s32); - - d16s16 = vqrshrn_n_s32(q3s32, 14); - d17s16 = vqrshrn_n_s32(q12s32, 14); - d18s16 = vqrshrn_n_s32(q13s32, 14); - d19s16 = vqrshrn_n_s32(q1s32, 14); - q8s16 = vcombine_s16(d16s16, d17s16); - q9s16 = vcombine_s16(d18s16, d19s16); - - q0s32 = vmull_s16(d20s16, d31s16); - q1s32 = vmull_s16(d21s16, d31s16); - q12s32 = vmull_s16(d20s16, d30s16); - q13s32 = vmull_s16(d21s16, d30s16); - - q0s32 = vmlal_s16(q0s32, d28s16, d30s16); - q1s32 = vmlal_s16(q1s32, d29s16, d30s16); - q12s32 = vmlsl_s16(q12s32, d28s16, d31s16); - q13s32 = vmlsl_s16(q13s32, d29s16, d31s16); - - d22s16 = vqrshrn_n_s32(q0s32, 14); - d23s16 = vqrshrn_n_s32(q1s32, 14); - d20s16 = vqrshrn_n_s32(q12s32, 14); - d21s16 = vqrshrn_n_s32(q13s32, 14); - q10s16 = vcombine_s16(d20s16, d21s16); - q11s16 = vcombine_s16(d22s16, d23s16); - - q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q15s16 = 
vaddq_s16(q6s16, q7s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - // stage 5 - q0s16 = vaddq_s16(q8s16, q11s16); - q1s16 = vaddq_s16(q9s16, q10s16); - q2s16 = vsubq_s16(q9s16, q10s16); - q3s16 = vsubq_s16(q8s16, q11s16); - - d16s16 = vdup_n_s16(cospi_16_64); - - q11s32 = vmull_s16(d26s16, d16s16); - q12s32 = vmull_s16(d27s16, d16s16); - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - - q6s32 = vsubq_s32(q9s32, q11s32); - q13s32 = vsubq_s32(q10s32, q12s32); - q9s32 = vaddq_s32(q9s32, q11s32); - q10s32 = vaddq_s32(q10s32, q12s32); - - d10s16 = vqrshrn_n_s32(q6s32, 14); - d11s16 = vqrshrn_n_s32(q13s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q10s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 6 - q8s16 = vaddq_s16(q0s16, q15s16); - q9s16 = vaddq_s16(q1s16, q6s16); - q10s16 = vaddq_s16(q2s16, q5s16); - q11s16 = vaddq_s16(q3s16, q4s16); - q12s16 = vsubq_s16(q3s16, q4s16); - q13s16 = vsubq_s16(q2s16, q5s16); - q14s16 = vsubq_s16(q1s16, q6s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); - d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); - d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); - d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); - d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - // store the data - output_stride >>= 1; // output_stride / 2, out is int16_t - vst1_u64((uint64_t *)out, d16u64); - out += output_stride; - vst1_u64((uint64_t *)out, d17u64); - out += output_stride; - vst1_u64((uint64_t *)out, d18u64); - out += output_stride; - vst1_u64((uint64_t *)out, d19u64); - out += output_stride; - vst1_u64((uint64_t *)out, d20u64); - out += output_stride; - vst1_u64((uint64_t *)out, d21u64); - out += output_stride; - vst1_u64((uint64_t *)out, d22u64); - out += output_stride; - vst1_u64((uint64_t *)out, d23u64); - out += output_stride; - vst1_u64((uint64_t *)out, d24u64); - out += output_stride; - vst1_u64((uint64_t *)out, d25u64); - out += output_stride; - vst1_u64((uint64_t *)out, d26u64); - out += output_stride; - vst1_u64((uint64_t *)out, d27u64); - out += output_stride; - vst1_u64((uint64_t *)out, d28u64); - out += output_stride; - vst1_u64((uint64_t *)out, d29u64); - out += output_stride; - vst1_u64((uint64_t *)out, d30u64); - out += output_stride; - vst1_u64((uint64_t *)out, d31u64); - return; -} - -void vpx_idct16x16_256_add_neon_pass2( - int16_t *src, - int16_t *out, - int16_t *pass1Output, - int16_t skip_adding, - uint8_t *dest, - int dest_stride) { - uint8_t *d; - uint8x8_t d12u8, d13u8; - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - 
int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - uint64x1_t d24u64, d25u64, d26u64, d27u64; - int64x1_t d12s64, d13s64; - uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16; - uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(src); - q8s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q9s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q10s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q11s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q12s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q13s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q14s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q15s16 = q0x2s16.val[0]; - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); - - // stage 3 - d12s16 = vdup_n_s16(cospi_30_64); - d13s16 = vdup_n_s16(cospi_2_64); - - q2s32 = vmull_s16(d16s16, d12s16); - q3s32 = vmull_s16(d17s16, d12s16); - q1s32 = vmull_s16(d16s16, d13s16); - q4s32 = vmull_s16(d17s16, d13s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d13s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d13s16); - q1s32 = vmlal_s16(q1s32, d30s16, d12s16); - q4s32 = vmlal_s16(q4s32, d31s16, d12s16); - - d0s16 = vqrshrn_n_s32(q2s32, 14); - d1s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q1s32, 14); - d15s16 = vqrshrn_n_s32(q4s32, 14); - q0s16 = vcombine_s16(d0s16, d1s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d30s16 = vdup_n_s16(cospi_14_64); - d31s16 = vdup_n_s16(cospi_18_64); - - q2s32 = vmull_s16(d24s16, d30s16); - q3s32 = vmull_s16(d25s16, d30s16); - q4s32 = vmull_s16(d24s16, d31s16); - q5s32 = vmull_s16(d25s16, d31s16); - - q2s32 = vmlsl_s16(q2s32, d22s16, d31s16); - q3s32 = vmlsl_s16(q3s32, d23s16, d31s16); - q4s32 = vmlal_s16(q4s32, d22s16, d30s16); - q5s32 = vmlal_s16(q5s32, d23s16, d30s16); - - d2s16 = vqrshrn_n_s32(q2s32, 14); - d3s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q4s32, 14); - d13s16 = vqrshrn_n_s32(q5s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - d30s16 = vdup_n_s16(cospi_22_64); - d31s16 = vdup_n_s16(cospi_10_64); - - q11s32 = vmull_s16(d20s16, d30s16); - q12s32 = vmull_s16(d21s16, d30s16); - q4s32 = vmull_s16(d20s16, d31s16); - q5s32 = vmull_s16(d21s16, d31s16); - - q11s32 = vmlsl_s16(q11s32, d26s16, d31s16); - q12s32 = vmlsl_s16(q12s32, d27s16, d31s16); - q4s32 = vmlal_s16(q4s32, d26s16, d30s16); - q5s32 = vmlal_s16(q5s32, d27s16, d30s16); - - d4s16 = vqrshrn_n_s32(q11s32, 14); - d5s16 = vqrshrn_n_s32(q12s32, 14); - d11s16 = vqrshrn_n_s32(q5s32, 14); - d10s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - 
q5s16 = vcombine_s16(d10s16, d11s16); - - d30s16 = vdup_n_s16(cospi_6_64); - d31s16 = vdup_n_s16(cospi_26_64); - - q10s32 = vmull_s16(d28s16, d30s16); - q11s32 = vmull_s16(d29s16, d30s16); - q12s32 = vmull_s16(d28s16, d31s16); - q13s32 = vmull_s16(d29s16, d31s16); - - q10s32 = vmlsl_s16(q10s32, d18s16, d31s16); - q11s32 = vmlsl_s16(q11s32, d19s16, d31s16); - q12s32 = vmlal_s16(q12s32, d18s16, d30s16); - q13s32 = vmlal_s16(q13s32, d19s16, d30s16); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q11s32, 14); - d8s16 = vqrshrn_n_s32(q12s32, 14); - d9s16 = vqrshrn_n_s32(q13s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); - - // stage 3 - q9s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q10s16 = vsubq_s16(q3s16, q2s16); - q11s16 = vaddq_s16(q2s16, q3s16); - q12s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q6s16, q7s16); - - // stage 4 - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - d30s16 = vdup_n_s16(cospi_8_64); - d31s16 = vdup_n_s16(cospi_24_64); - - q2s32 = vmull_s16(d18s16, d31s16); - q3s32 = vmull_s16(d19s16, d31s16); - q4s32 = vmull_s16(d28s16, d31s16); - q5s32 = vmull_s16(d29s16, d31s16); - - q2s32 = vmlal_s16(q2s32, d28s16, d30s16); - q3s32 = vmlal_s16(q3s32, d29s16, d30s16); - q4s32 = vmlsl_s16(q4s32, d18s16, d30s16); - q5s32 = vmlsl_s16(q5s32, d19s16, d30s16); - - d12s16 = vqrshrn_n_s32(q2s32, 14); - d13s16 = vqrshrn_n_s32(q3s32, 14); - d2s16 = vqrshrn_n_s32(q4s32, 14); - d3s16 = vqrshrn_n_s32(q5s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - q3s16 = q11s16; - q4s16 = q12s16; - - d30s16 = vdup_n_s16(-cospi_8_64); - q11s32 = vmull_s16(d26s16, d30s16); - q12s32 = vmull_s16(d27s16, d30s16); - q8s32 = vmull_s16(d20s16, d30s16); - q9s32 = vmull_s16(d21s16, d30s16); - - q11s32 = vmlsl_s16(q11s32, d20s16, d31s16); - q12s32 = vmlsl_s16(q12s32, d21s16, d31s16); - q8s32 = vmlal_s16(q8s32, d26s16, d31s16); - q9s32 = vmlal_s16(q9s32, d27s16, d31s16); - - d4s16 = vqrshrn_n_s32(q11s32, 14); - d5s16 = vqrshrn_n_s32(q12s32, 14); - d10s16 = vqrshrn_n_s32(q8s32, 14); - d11s16 = vqrshrn_n_s32(q9s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - // stage 5 - q8s16 = vaddq_s16(q0s16, q3s16); - q9s16 = vaddq_s16(q1s16, q2s16); - q10s16 = vsubq_s16(q1s16, q2s16); - q11s16 = vsubq_s16(q0s16, q3s16); - q12s16 = vsubq_s16(q7s16, q4s16); - q13s16 = vsubq_s16(q6s16, q5s16); - q14s16 = vaddq_s16(q6s16, q5s16); - q15s16 = vaddq_s16(q7s16, q4s16); - - // stage 6 - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - - d14s16 = vdup_n_s16(cospi_16_64); - - q3s32 = vmull_s16(d26s16, d14s16); - q4s32 = vmull_s16(d27s16, d14s16); - q0s32 = vmull_s16(d20s16, d14s16); - q1s32 = vmull_s16(d21s16, d14s16); - - q5s32 = vsubq_s32(q3s32, q0s32); - q6s32 = vsubq_s32(q4s32, q1s32); - q10s32 = vaddq_s32(q3s32, q0s32); - q4s32 = vaddq_s32(q4s32, q1s32); - - d4s16 = vqrshrn_n_s32(q5s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); - d10s16 = vqrshrn_n_s32(q10s32, 14); - 
d11s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q0s32 = vmull_s16(d22s16, d14s16); - q1s32 = vmull_s16(d23s16, d14s16); - q13s32 = vmull_s16(d24s16, d14s16); - q6s32 = vmull_s16(d25s16, d14s16); - - q10s32 = vsubq_s32(q13s32, q0s32); - q4s32 = vsubq_s32(q6s32, q1s32); - q13s32 = vaddq_s32(q13s32, q0s32); - q6s32 = vaddq_s32(q6s32, q1s32); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); - d8s16 = vqrshrn_n_s32(q13s32, 14); - d9s16 = vqrshrn_n_s32(q6s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); - - // stage 7 - if (skip_adding != 0) { - d = dest; - // load the data in pass1 - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), - vreinterpret_u8_s64(d12s64)); - q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), - vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), - vreinterpret_u8_s64(d12s64)); - q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), - vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), - vreinterpret_u8_s64(d12s64)); - q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), - vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - 
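/*
 * Editorial aside, not part of the removed file: the vrshrq_n_s16(x, 6) /
 * vaddw_u8 / vqmovun_s16 sequence repeated in this branch is the vector
 * form of the usual reconstruction step. Per pixel it computes roughly
 *
 *     dest[i] = clip_to_uint8(dest[i] + ((residual[i] + 32) >> 6));
 *
 * (clip_to_uint8 is a descriptive name, not a libvpx function): the stage-7
 * result is rounded down from its six extra fractional bits and added to
 * the prediction with saturation to the [0, 255] pixel range.
 */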
q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), - vreinterpret_u8_s64(d12s64)); - q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), - vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - // store the data out 8,9,10,11,12,13,14,15 - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q8s16 = vrshrq_n_s16(q8s16, 6); - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q9s16 = vrshrq_n_s16(q9s16, 6); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q2s16 = vrshrq_n_s16(q2s16, 6); - q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q3s16 = vrshrq_n_s16(q3s16, 6); - q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q4s16 = vrshrq_n_s16(q4s16, 6); - q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q5s16 = vrshrq_n_s16(q5s16, 6); - q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q14s16 = vrshrq_n_s16(q14s16, 6); - q14u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - q15s16 = vrshrq_n_s16(q15s16, 6); - q15u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - } else { // skip_adding_dest - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - 
vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16))); - out += 12; - vst1_u64((uint64_t *)out, 
vreinterpret_u64_s16(vget_low_s16(q15s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16))); - } - return; -} - -void vpx_idct16x16_10_add_neon_pass1( - int16_t *in, - int16_t *out, - int output_stride) { - int16x4_t d4s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q6s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q15s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(in); - q8s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q9s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q10s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q11s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q12s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q13s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q14s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q15s16 = q0x2s16.val[0]; - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - // stage 3 - q0s16 = vdupq_n_s16(cospi_28_64 * 2); - q1s16 = vdupq_n_s16(cospi_4_64 * 2); - - q4s16 = vqrdmulhq_s16(q9s16, q0s16); - q7s16 = vqrdmulhq_s16(q9s16, q1s16); - - // stage 4 - q1s16 = vdupq_n_s16(cospi_16_64 * 2); - d4s16 = vdup_n_s16(cospi_16_64); - - q8s16 = vqrdmulhq_s16(q8s16, q1s16); - - d8s16 = vget_low_s16(q4s16); - d9s16 = vget_high_s16(q4s16); - d14s16 = vget_low_s16(q7s16); - d15s16 = vget_high_s16(q7s16); - q9s32 = vmull_s16(d14s16, d4s16); - q10s32 = vmull_s16(d15s16, d4s16); - q12s32 = vmull_s16(d9s16, d4s16); - q11s32 = vmull_s16(d8s16, d4s16); - - q15s32 = vsubq_s32(q10s32, q12s32); - q6s32 = vsubq_s32(q9s32, q11s32); - q9s32 = vaddq_s32(q9s32, q11s32); - q10s32 = vaddq_s32(q10s32, q12s32); - - d11s16 = vqrshrn_n_s32(q15s32, 14); - d10s16 = vqrshrn_n_s32(q6s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q10s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 6 - q2s16 = vaddq_s16(q8s16, q7s16); - q9s16 = vaddq_s16(q8s16, q6s16); - q10s16 = vaddq_s16(q8s16, q5s16); - q11s16 = vaddq_s16(q8s16, q4s16); - q12s16 = vsubq_s16(q8s16, q4s16); - q13s16 = vsubq_s16(q8s16, q5s16); - q14s16 = vsubq_s16(q8s16, q6s16); - q15s16 = vsubq_s16(q8s16, q7s16); - - d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); - d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); - d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); - d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); - d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - // store the data - output_stride >>= 1; // output_stride / 2, out is int16_t - 
vst1_u64((uint64_t *)out, d4u64); - out += output_stride; - vst1_u64((uint64_t *)out, d5u64); - out += output_stride; - vst1_u64((uint64_t *)out, d18u64); - out += output_stride; - vst1_u64((uint64_t *)out, d19u64); - out += output_stride; - vst1_u64((uint64_t *)out, d20u64); - out += output_stride; - vst1_u64((uint64_t *)out, d21u64); - out += output_stride; - vst1_u64((uint64_t *)out, d22u64); - out += output_stride; - vst1_u64((uint64_t *)out, d23u64); - out += output_stride; - vst1_u64((uint64_t *)out, d24u64); - out += output_stride; - vst1_u64((uint64_t *)out, d25u64); - out += output_stride; - vst1_u64((uint64_t *)out, d26u64); - out += output_stride; - vst1_u64((uint64_t *)out, d27u64); - out += output_stride; - vst1_u64((uint64_t *)out, d28u64); - out += output_stride; - vst1_u64((uint64_t *)out, d29u64); - out += output_stride; - vst1_u64((uint64_t *)out, d30u64); - out += output_stride; - vst1_u64((uint64_t *)out, d31u64); - return; -} - -void vpx_idct16x16_10_add_neon_pass2( - int16_t *src, - int16_t *out, - int16_t *pass1Output, - int16_t skip_adding, - uint8_t *dest, - int dest_stride) { - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16; - uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64; - uint64x1_t d16u64, d17u64, d18u64, d19u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32; - int16x8x2_t q0x2s16; - (void)skip_adding; - (void)dest; - (void)dest_stride; - - q0x2s16 = vld2q_s16(src); - q8s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q9s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q10s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q11s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q12s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q13s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q14s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q15s16 = q0x2s16.val[0]; - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - // stage 3 - q6s16 = vdupq_n_s16(cospi_30_64 * 2); - q0s16 = vqrdmulhq_s16(q8s16, q6s16); - q6s16 = vdupq_n_s16(cospi_2_64 * 2); - q7s16 = vqrdmulhq_s16(q8s16, q6s16); - - q15s16 = vdupq_n_s16(-cospi_26_64 * 2); - q14s16 = vdupq_n_s16(cospi_6_64 * 2); - q3s16 = vqrdmulhq_s16(q9s16, q15s16); - q4s16 = vqrdmulhq_s16(q9s16, q14s16); - - // stage 4 - d0s16 = vget_low_s16(q0s16); - d1s16 = vget_high_s16(q0s16); - d6s16 = vget_low_s16(q3s16); - d7s16 = vget_high_s16(q3s16); - d8s16 = vget_low_s16(q4s16); - d9s16 = vget_high_s16(q4s16); - d14s16 = vget_low_s16(q7s16); - d15s16 = vget_high_s16(q7s16); - - d30s16 = vdup_n_s16(cospi_8_64); - d31s16 = vdup_n_s16(cospi_24_64); - - q12s32 = vmull_s16(d14s16, d31s16); - q5s32 = vmull_s16(d15s16, d31s16); - q2s32 = vmull_s16(d0s16, d31s16); - q11s32 = vmull_s16(d1s16, d31s16); - - q12s32 = vmlsl_s16(q12s32, d0s16, d30s16); - q5s32 = vmlsl_s16(q5s32, d1s16, d30s16); - q2s32 = vmlal_s16(q2s32, d14s16, d30s16); - q11s32 = vmlal_s16(q11s32, d15s16, d30s16); - - d2s16 = 
vqrshrn_n_s32(q12s32, 14); - d3s16 = vqrshrn_n_s32(q5s32, 14); - d12s16 = vqrshrn_n_s32(q2s32, 14); - d13s16 = vqrshrn_n_s32(q11s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - d30s16 = vdup_n_s16(-cospi_8_64); - q10s32 = vmull_s16(d8s16, d30s16); - q13s32 = vmull_s16(d9s16, d30s16); - q8s32 = vmull_s16(d6s16, d30s16); - q9s32 = vmull_s16(d7s16, d30s16); - - q10s32 = vmlsl_s16(q10s32, d6s16, d31s16); - q13s32 = vmlsl_s16(q13s32, d7s16, d31s16); - q8s32 = vmlal_s16(q8s32, d8s16, d31s16); - q9s32 = vmlal_s16(q9s32, d9s16, d31s16); - - d4s16 = vqrshrn_n_s32(q10s32, 14); - d5s16 = vqrshrn_n_s32(q13s32, 14); - d10s16 = vqrshrn_n_s32(q8s32, 14); - d11s16 = vqrshrn_n_s32(q9s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - // stage 5 - q8s16 = vaddq_s16(q0s16, q3s16); - q9s16 = vaddq_s16(q1s16, q2s16); - q10s16 = vsubq_s16(q1s16, q2s16); - q11s16 = vsubq_s16(q0s16, q3s16); - q12s16 = vsubq_s16(q7s16, q4s16); - q13s16 = vsubq_s16(q6s16, q5s16); - q14s16 = vaddq_s16(q6s16, q5s16); - q15s16 = vaddq_s16(q7s16, q4s16); - - // stage 6 - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - - d14s16 = vdup_n_s16(cospi_16_64); - q3s32 = vmull_s16(d26s16, d14s16); - q4s32 = vmull_s16(d27s16, d14s16); - q0s32 = vmull_s16(d20s16, d14s16); - q1s32 = vmull_s16(d21s16, d14s16); - - q5s32 = vsubq_s32(q3s32, q0s32); - q6s32 = vsubq_s32(q4s32, q1s32); - q0s32 = vaddq_s32(q3s32, q0s32); - q4s32 = vaddq_s32(q4s32, q1s32); - - d4s16 = vqrshrn_n_s32(q5s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); - d10s16 = vqrshrn_n_s32(q0s32, 14); - d11s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q0s32 = vmull_s16(d22s16, d14s16); - q1s32 = vmull_s16(d23s16, d14s16); - q13s32 = vmull_s16(d24s16, d14s16); - q6s32 = vmull_s16(d25s16, d14s16); - - q10s32 = vsubq_s32(q13s32, q0s32); - q4s32 = vsubq_s32(q6s32, q1s32); - q13s32 = vaddq_s32(q13s32, q0s32); - q6s32 = vaddq_s32(q6s32, q1s32); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); - d8s16 = vqrshrn_n_s32(q13s32, 14); - d9s16 = vqrshrn_n_s32(q6s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); - - // stage 7 - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = 
vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); - d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); - d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16)); - d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16)); - d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16)); - d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16)); - d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16)); - d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16)); - d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); - d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - vst1_u64((uint64_t *)out, d16u64); - out += 4; - vst1_u64((uint64_t *)out, d17u64); - out += 12; - vst1_u64((uint64_t *)out, d18u64); - out += 4; - vst1_u64((uint64_t *)out, d19u64); - out += 12; - vst1_u64((uint64_t *)out, d4u64); - out += 4; - vst1_u64((uint64_t *)out, d5u64); - out += 12; - vst1_u64((uint64_t *)out, d6u64); - out += 4; - vst1_u64((uint64_t *)out, d7u64); - out += 12; - vst1_u64((uint64_t *)out, d8u64); - out += 4; - vst1_u64((uint64_t *)out, d9u64); - out += 12; - vst1_u64((uint64_t *)out, d10u64); - out += 4; - vst1_u64((uint64_t *)out, d11u64); - out += 12; - vst1_u64((uint64_t *)out, d28u64); - out += 4; - vst1_u64((uint64_t *)out, d29u64); - out += 12; - vst1_u64((uint64_t *)out, d30u64); - out += 4; - vst1_u64((uint64_t *)out, d31u64); - return; -} diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct16x16_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct16x16_neon.c deleted file mode 100644 index 352979aa16..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/idct16x16_neon.c +++ /dev/null @@ -1,185 +0,0 @@ 
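Editorial aside, not part of the removed sources: the four pass functions above repeat one core pattern many times, a pair of vmull_s16 / vmlal_s16 / vmlsl_s16 multiplies against cos(k*pi/64) constants stored in Q14 fixed point, followed by vqrshrn_n_s32(x, 14) to round the 32-bit products back down to 16 bits. A minimal scalar sketch of that butterfly is shown below; idct_butterfly is a hypothetical name, and dct_const_round_shift is only a local stand-in for the libvpx helper of the same name used further down in this diff.

#include <stdint.h>

#define DCT_CONST_BITS 14

/* Rounding shift by 2^14; vqrshrn_n_s32(x, 14) computes the same value and
 * additionally saturates the result to the int16_t range. */
static int16_t dct_const_round_shift(int32_t x) {
  return (int16_t)((x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

/* One butterfly: rotate the pair (a, b) by the angle encoded in (c1, c2),
 * e.g. c1 = cospi_28_64 and c2 = cospi_4_64 in the "stage 3" code above. */
static void idct_butterfly(int16_t a, int16_t b, int16_t c1, int16_t c2,
                           int16_t *out_lo, int16_t *out_hi) {
  *out_lo = dct_const_round_shift((int32_t)a * c1 - (int32_t)b * c2);
  *out_hi = dct_const_round_shift((int32_t)a * c2 + (int32_t)b * c1);
}

The intrinsics apply the same arithmetic to eight lanes at a time, splitting each 128-bit vector into two 64-bit halves for the widening multiplies, which is why every rotation in the deleted code shows up as a group of vmull/vmlal/vmlsl calls followed by two vqrshrn_n_s32 narrowing steps.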
-/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vpx_dsp/vpx_dsp_common.h" - -void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, - int16_t *output, - int output_stride); -void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, - int16_t *output, - int16_t *pass1Output, - int16_t skip_adding, - uint8_t *dest, - int dest_stride); -void vpx_idct16x16_10_add_neon_pass1(const int16_t *input, - int16_t *output, - int output_stride); -void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, - int16_t *output, - int16_t *pass1Output, - int16_t skip_adding, - uint8_t *dest, - int dest_stride); - -#if HAVE_NEON_ASM -/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ -extern void vpx_push_neon(int64_t *store); -extern void vpx_pop_neon(int64_t *store); -#endif // HAVE_NEON_ASM - -void vpx_idct16x16_256_add_neon(const int16_t *input, - uint8_t *dest, int dest_stride) { -#if HAVE_NEON_ASM - int64_t store_reg[8]; -#endif - int16_t pass1_output[16*16] = {0}; - int16_t row_idct_output[16*16] = {0}; - -#if HAVE_NEON_ASM - // save d8-d15 register values. - vpx_push_neon(store_reg); -#endif - - /* Parallel idct on the upper 8 rows */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(input, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7 - // which will be saved into row_idct_output. - vpx_idct16x16_256_add_neon_pass2(input+1, - row_idct_output, - pass1_output, - 0, - dest, - dest_stride); - - /* Parallel idct on the lower 8 rows */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7 - // which will be saved into row_idct_output. - vpx_idct16x16_256_add_neon_pass2(input+8*16+1, - row_idct_output+8, - pass1_output, - 0, - dest, - dest_stride); - - /* Parallel idct on the left 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output+1, - row_idct_output, - pass1_output, - 1, - dest, - dest_stride); - - /* Parallel idct on the right 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. 
- // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, - row_idct_output+8, - pass1_output, - 1, - dest+8, - dest_stride); - -#if HAVE_NEON_ASM - // restore d8-d15 register values. - vpx_pop_neon(store_reg); -#endif - - return; -} - -void vpx_idct16x16_10_add_neon(const int16_t *input, - uint8_t *dest, int dest_stride) { -#if HAVE_NEON_ASM - int64_t store_reg[8]; -#endif - int16_t pass1_output[16*16] = {0}; - int16_t row_idct_output[16*16] = {0}; - -#if HAVE_NEON_ASM - // save d8-d15 register values. - vpx_push_neon(store_reg); -#endif - - /* Parallel idct on the upper 8 rows */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_10_add_neon_pass1(input, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7 - // which will be saved into row_idct_output. - vpx_idct16x16_10_add_neon_pass2(input+1, - row_idct_output, - pass1_output, - 0, - dest, - dest_stride); - - /* Skip Parallel idct on the lower 8 rows as they are all 0s */ - - /* Parallel idct on the left 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output+1, - row_idct_output, - pass1_output, - 1, - dest, - dest_stride); - - /* Parallel idct on the right 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, - row_idct_output+8, - pass1_output, - 1, - dest+8, - dest_stride); - -#if HAVE_NEON_ASM - // restore d8-d15 register values. - vpx_pop_neon(store_reg); -#endif - - return; -} diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c deleted file mode 100644 index c25c0c4a5c..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <arm_neon.h> - -#include "./vpx_config.h" - -#include "vpx_dsp/inv_txfm.h" -#include "vpx_ports/mem.h" - -static INLINE void LD_16x8( - uint8_t *d, - int d_stride, - uint8x16_t *q8u8, - uint8x16_t *q9u8, - uint8x16_t *q10u8, - uint8x16_t *q11u8, - uint8x16_t *q12u8, - uint8x16_t *q13u8, - uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vld1q_u8(d); - d += d_stride; - *q9u8 = vld1q_u8(d); - d += d_stride; - *q10u8 = vld1q_u8(d); - d += d_stride; - *q11u8 = vld1q_u8(d); - d += d_stride; - *q12u8 = vld1q_u8(d); - d += d_stride; - *q13u8 = vld1q_u8(d); - d += d_stride; - *q14u8 = vld1q_u8(d); - d += d_stride; - *q15u8 = vld1q_u8(d); - return; -} - -static INLINE void ADD_DIFF_16x8( - uint8x16_t qdiffu8, - uint8x16_t *q8u8, - uint8x16_t *q9u8, - uint8x16_t *q10u8, - uint8x16_t *q11u8, - uint8x16_t *q12u8, - uint8x16_t *q13u8, - uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vqaddq_u8(*q8u8, qdiffu8); - *q9u8 = vqaddq_u8(*q9u8, qdiffu8); - *q10u8 = vqaddq_u8(*q10u8, qdiffu8); - *q11u8 = vqaddq_u8(*q11u8, qdiffu8); - *q12u8 = vqaddq_u8(*q12u8, qdiffu8); - *q13u8 = vqaddq_u8(*q13u8, qdiffu8); - *q14u8 = vqaddq_u8(*q14u8, qdiffu8); - *q15u8 = vqaddq_u8(*q15u8, qdiffu8); - return; -} - -static INLINE void SUB_DIFF_16x8( - uint8x16_t qdiffu8, - uint8x16_t *q8u8, - uint8x16_t *q9u8, - uint8x16_t *q10u8, - uint8x16_t *q11u8, - uint8x16_t *q12u8, - uint8x16_t *q13u8, - uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vqsubq_u8(*q8u8, qdiffu8); - *q9u8 = vqsubq_u8(*q9u8, qdiffu8); - *q10u8 = vqsubq_u8(*q10u8, qdiffu8); - *q11u8 = vqsubq_u8(*q11u8, qdiffu8); - *q12u8 = vqsubq_u8(*q12u8, qdiffu8); - *q13u8 = vqsubq_u8(*q13u8, qdiffu8); - *q14u8 = vqsubq_u8(*q14u8, qdiffu8); - *q15u8 = vqsubq_u8(*q15u8, qdiffu8); - return; -} - -static INLINE void ST_16x8( - uint8_t *d, - int d_stride, - uint8x16_t *q8u8, - uint8x16_t *q9u8, - uint8x16_t *q10u8, - uint8x16_t *q11u8, - uint8x16_t *q12u8, - uint8x16_t *q13u8, - uint8x16_t *q14u8, - uint8x16_t *q15u8) { - vst1q_u8(d, *q8u8); - d += d_stride; - vst1q_u8(d, *q9u8); - d += d_stride; - vst1q_u8(d, *q10u8); - d += d_stride; - vst1q_u8(d, *q11u8); - d += d_stride; - vst1q_u8(d, *q12u8); - d += d_stride; - vst1q_u8(d, *q13u8); - d += d_stride; - vst1q_u8(d, *q14u8); - d += d_stride; - vst1q_u8(d, *q15u8); - return; -} - -void vpx_idct32x32_1_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; - int i, j, dest_stride8; - uint8_t *d; - int16_t a1, cospi_16_64 = 11585; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 6); - - dest_stride8 = dest_stride * 8; - if (a1 >= 0) { // diff_positive_32_32 - a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; - q0u8 = vdupq_n_u8(a1); - for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop - d = dest; - for (j = 0; j < 4; j++) { - LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, - &q12u8, &q13u8, &q14u8, &q15u8); - ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, - &q12u8, &q13u8, &q14u8, &q15u8); - ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, - &q12u8, &q13u8, &q14u8, &q15u8); - d += dest_stride8; - } - } - } else { // diff_negative_32_32 - a1 = -a1; - a1 = a1 < 0 ? 0 : a1 > 255 ? 
255 : a1; - q0u8 = vdupq_n_u8(a1); - for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop - d = dest; - for (j = 0; j < 4; j++) { - LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, - &q12u8, &q13u8, &q14u8, &q15u8); - SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, - &q12u8, &q13u8, &q14u8, &q15u8); - ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, - &q12u8, &q13u8, &q14u8, &q15u8); - d += dest_stride8; - } - } - } - return; -} diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct32x32_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct32x32_add_neon.c deleted file mode 100644 index 025437eb96..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/idct32x32_add_neon.c +++ /dev/null @@ -1,719 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <arm_neon.h> - -#include "./vpx_config.h" -#include "vpx_dsp/txfm_common.h" - -#define LOAD_FROM_TRANSPOSED(prev, first, second) \ - q14s16 = vld1q_s16(trans_buf + first * 8); \ - q13s16 = vld1q_s16(trans_buf + second * 8); - -#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \ - qA = vld1q_s16(out + first * 32); \ - qB = vld1q_s16(out + second * 32); - -#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \ - vst1q_s16(out + first * 32, qA); \ - vst1q_s16(out + second * 32, qB); - -#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \ - __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \ - q6s16, q7s16, q8s16, q9s16); -static INLINE void __STORE_COMBINE_CENTER_RESULTS( - uint8_t *p1, - uint8_t *p2, - int stride, - int16x8_t q6s16, - int16x8_t q7s16, - int16x8_t q8s16, - int16x8_t q9s16) { - int16x4_t d8s16, d9s16, d10s16, d11s16; - - d8s16 = vld1_s16((int16_t *)p1); - p1 += stride; - d11s16 = vld1_s16((int16_t *)p2); - p2 -= stride; - d9s16 = vld1_s16((int16_t *)p1); - d10s16 = vld1_s16((int16_t *)p2); - - q7s16 = vrshrq_n_s16(q7s16, 6); - q8s16 = vrshrq_n_s16(q8s16, 6); - q9s16 = vrshrq_n_s16(q9s16, 6); - q6s16 = vrshrq_n_s16(q6s16, 6); - - q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16), - vreinterpret_u8_s16(d9s16))); - q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_s16(d10s16))); - q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_s16(d11s16))); - q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16), - vreinterpret_u8_s16(d8s16))); - - d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); - d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16)); - d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16)); - d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); - - vst1_s16((int16_t *)p1, d9s16); - p1 -= stride; - vst1_s16((int16_t *)p2, d10s16); - p2 += stride; - vst1_s16((int16_t *)p1, d8s16); - vst1_s16((int16_t *)p2, d11s16); - return; -} - -#define STORE_COMBINE_EXTREME_RESULTS(r7, r6); \ - __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \ - q4s16, q5s16, q6s16, q7s16); -static INLINE void __STORE_COMBINE_EXTREME_RESULTS( - uint8_t *p1, - uint8_t *p2, - int stride, - int16x8_t q4s16, - int16x8_t q5s16, - int16x8_t q6s16, - int16x8_t q7s16) { - int16x4_t d4s16, d5s16, d6s16, d7s16; - - d4s16 = vld1_s16((int16_t *)p1); - p1 += stride; - d7s16 = vld1_s16((int16_t *)p2); - p2 -= 
stride; - d5s16 = vld1_s16((int16_t *)p1); - d6s16 = vld1_s16((int16_t *)p2); - - q5s16 = vrshrq_n_s16(q5s16, 6); - q6s16 = vrshrq_n_s16(q6s16, 6); - q7s16 = vrshrq_n_s16(q7s16, 6); - q4s16 = vrshrq_n_s16(q4s16, 6); - - q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16), - vreinterpret_u8_s16(d5s16))); - q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16), - vreinterpret_u8_s16(d6s16))); - q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16), - vreinterpret_u8_s16(d7s16))); - q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16), - vreinterpret_u8_s16(d4s16))); - - d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16)); - d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); - d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); - d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16)); - - vst1_s16((int16_t *)p1, d5s16); - p1 -= stride; - vst1_s16((int16_t *)p2, d6s16); - p2 += stride; - vst1_s16((int16_t *)p2, d7s16); - vst1_s16((int16_t *)p1, d4s16); - return; -} - -#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \ - DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB); -static INLINE void DO_BUTTERFLY( - int16x8_t q14s16, - int16x8_t q13s16, - int16_t first_const, - int16_t second_const, - int16x8_t *qAs16, - int16x8_t *qBs16) { - int16x4_t d30s16, d31s16; - int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32; - int16x4_t dCs16, dDs16, dAs16, dBs16; - - dCs16 = vget_low_s16(q14s16); - dDs16 = vget_high_s16(q14s16); - dAs16 = vget_low_s16(q13s16); - dBs16 = vget_high_s16(q13s16); - - d30s16 = vdup_n_s16(first_const); - d31s16 = vdup_n_s16(second_const); - - q8s32 = vmull_s16(dCs16, d30s16); - q10s32 = vmull_s16(dAs16, d31s16); - q9s32 = vmull_s16(dDs16, d30s16); - q11s32 = vmull_s16(dBs16, d31s16); - q12s32 = vmull_s16(dCs16, d31s16); - - q8s32 = vsubq_s32(q8s32, q10s32); - q9s32 = vsubq_s32(q9s32, q11s32); - - q10s32 = vmull_s16(dDs16, d31s16); - q11s32 = vmull_s16(dAs16, d30s16); - q15s32 = vmull_s16(dBs16, d30s16); - - q11s32 = vaddq_s32(q12s32, q11s32); - q10s32 = vaddq_s32(q10s32, q15s32); - - *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), - vqrshrn_n_s32(q9s32, 14)); - *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), - vqrshrn_n_s32(q10s32, 14)); - return; -} - -static INLINE void idct32_transpose_pair( - int16_t *input, - int16_t *t_buf) { - int16_t *in; - int i; - const int stride = 32; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; - int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; - - for (i = 0; i < 4; i++, input += 8) { - in = input; - q8s16 = vld1q_s16(in); - in += stride; - q9s16 = vld1q_s16(in); - in += stride; - q10s16 = vld1q_s16(in); - in += stride; - q11s16 = vld1q_s16(in); - in += stride; - q12s16 = vld1q_s16(in); - in += stride; - q13s16 = vld1q_s16(in); - in += stride; - q14s16 = vld1q_s16(in); - in += stride; - q15s16 = vld1q_s16(in); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - 
d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); - - q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 - q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 - q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 - q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 - q12s16 = vcombine_s16(d17s16, d25s16); - q13s16 = vcombine_s16(d19s16, d27s16); - q14s16 = vcombine_s16(d21s16, d29s16); - q15s16 = vcombine_s16(d23s16, d31s16); - - q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), - vreinterpretq_s32_s16(q10s16)); - q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q9s16), - vreinterpretq_s32_s16(q11s16)); - q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q12s16), - vreinterpretq_s32_s16(q14s16)); - q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q13s16), - vreinterpretq_s32_s16(q15s16)); - - q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 - vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 - q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 - vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 - q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 - vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 - q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 - vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 - - vst1q_s16(t_buf, q0x2s16.val[0]); - t_buf += 8; - vst1q_s16(t_buf, q0x2s16.val[1]); - t_buf += 8; - vst1q_s16(t_buf, q1x2s16.val[0]); - t_buf += 8; - vst1q_s16(t_buf, q1x2s16.val[1]); - t_buf += 8; - vst1q_s16(t_buf, q2x2s16.val[0]); - t_buf += 8; - vst1q_s16(t_buf, q2x2s16.val[1]); - t_buf += 8; - vst1q_s16(t_buf, q3x2s16.val[0]); - t_buf += 8; - vst1q_s16(t_buf, q3x2s16.val[1]); - t_buf += 8; - } - return; -} - -static INLINE void idct32_bands_end_1st_pass( - int16_t *out, - int16x8_t q2s16, - int16x8_t q3s16, - int16x8_t q6s16, - int16x8_t q7s16, - int16x8_t q8s16, - int16x8_t q9s16, - int16x8_t q10s16, - int16x8_t q11s16, - int16x8_t q12s16, - int16x8_t q13s16, - int16x8_t q14s16, - int16x8_t q15s16) { - int16x8_t q0s16, q1s16, q4s16, q5s16; - - STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16); - STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16); - - LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16); - STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16); - - LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16); - q2s16 = vaddq_s16(q10s16, q1s16); - q3s16 = vaddq_s16(q11s16, q0s16); - q4s16 = vsubq_s16(q11s16, q0s16); - q5s16 = vsubq_s16(q10s16, q1s16); - - LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16); - STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16); - - LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16); - STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16); - - LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16); - q2s16 = vaddq_s16(q12s16, q1s16); - q3s16 = vaddq_s16(q13s16, q0s16); - q4s16 = vsubq_s16(q13s16, q0s16); - q5s16 = vsubq_s16(q12s16, q1s16); - - LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(21, 20, 
21, q6s16, q7s16); - STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16); - - LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16); - STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16); - - LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16); - q2s16 = vaddq_s16(q14s16, q1s16); - q3s16 = vaddq_s16(q15s16, q0s16); - q4s16 = vsubq_s16(q15s16, q0s16); - q5s16 = vsubq_s16(q14s16, q1s16); - - LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16); - STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16); - - LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16); - STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16); - return; -} - -static INLINE void idct32_bands_end_2nd_pass( - int16_t *out, - uint8_t *dest, - int stride, - int16x8_t q2s16, - int16x8_t q3s16, - int16x8_t q6s16, - int16x8_t q7s16, - int16x8_t q8s16, - int16x8_t q9s16, - int16x8_t q10s16, - int16x8_t q11s16, - int16x8_t q12s16, - int16x8_t q13s16, - int16x8_t q14s16, - int16x8_t q15s16) { - uint8_t *r6 = dest + 31 * stride; - uint8_t *r7 = dest/* + 0 * stride*/; - uint8_t *r9 = dest + 15 * stride; - uint8_t *r10 = dest + 16 * stride; - int str2 = stride << 1; - int16x8_t q0s16, q1s16, q4s16, q5s16; - - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; r9 -= str2; - - LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; r6 -= str2; - - LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16) - q2s16 = vaddq_s16(q10s16, q1s16); - q3s16 = vaddq_s16(q11s16, q0s16); - q4s16 = vsubq_s16(q11s16, q0s16); - q5s16 = vsubq_s16(q10s16, q1s16); - - LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; r9 -= str2; - - LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; r6 -= str2; - - LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16) - q2s16 = vaddq_s16(q12s16, q1s16); - q3s16 = vaddq_s16(q13s16, q0s16); - q4s16 = vsubq_s16(q13s16, q0s16); - q5s16 = vsubq_s16(q12s16, q1s16); - - LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; r9 -= str2; - - LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; r6 -= str2; - - LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16) - q2s16 = vaddq_s16(q14s16, q1s16); - q3s16 = vaddq_s16(q15s16, q0s16); - q4s16 = vsubq_s16(q15s16, q0s16); - q5s16 = vsubq_s16(q14s16, q1s16); - - 
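/*
 * Editorial note, not part of the removed file: the second-pass epilogue
 * writes the 32 output rows symmetrically. STORE_COMBINE_CENTER_RESULTS
 * starts at r10 = dest + 16 * stride and r9 = dest + 15 * stride and fans
 * outward from the middle rows, while STORE_COMBINE_EXTREME_RESULTS starts
 * at r7 = dest and r6 = dest + 31 * stride and fans inward from the edges,
 * matching the order in which the preceding stages produce the rows.
 */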
LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); - - LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - return; -} - -void vpx_idct32x32_1024_add_neon( - int16_t *input, - uint8_t *dest, - int stride) { - int i, idct32_pass_loop; - int16_t trans_buf[32 * 8]; - int16_t pass1[32 * 32]; - int16_t pass2[32 * 32]; - int16_t *out; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - - for (idct32_pass_loop = 0, out = pass1; - idct32_pass_loop < 2; - idct32_pass_loop++, - input = pass1, // the input of pass2 is the result of pass1 - out = pass2) { - for (i = 0; - i < 4; i++, - input += 32 * 8, out += 8) { // idct32_bands_loop - idct32_transpose_pair(input, trans_buf); - - // ----------------------------------------- - // BLOCK A: 16-19,28-31 - // ----------------------------------------- - // generate 16,17,30,31 - // part of stage 1 - LOAD_FROM_TRANSPOSED(0, 1, 31) - DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(31, 17, 15) - DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16) - // part of stage 2 - q4s16 = vaddq_s16(q0s16, q1s16); - q13s16 = vsubq_s16(q0s16, q1s16); - q6s16 = vaddq_s16(q2s16, q3s16); - q14s16 = vsubq_s16(q2s16, q3s16); - // part of stage 3 - DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16) - - // generate 18,19,28,29 - // part of stage 1 - LOAD_FROM_TRANSPOSED(15, 9, 23) - DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(23, 25, 7) - DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16) - // part of stage 2 - q13s16 = vsubq_s16(q3s16, q2s16); - q3s16 = vaddq_s16(q3s16, q2s16); - q14s16 = vsubq_s16(q1s16, q0s16); - q2s16 = vaddq_s16(q1s16, q0s16); - // part of stage 3 - DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16) - // part of stage 4 - q8s16 = vaddq_s16(q4s16, q2s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q10s16 = vaddq_s16(q7s16, q1s16); - q15s16 = vaddq_s16(q6s16, q3s16); - q13s16 = vsubq_s16(q5s16, q0s16); - q14s16 = vsubq_s16(q7s16, q1s16); - STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16) - STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16) - // part of stage 5 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16) - STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16) - // part of stage 4 - q13s16 = vsubq_s16(q4s16, q2s16); - q14s16 = vsubq_s16(q6s16, q3s16); - // part of stage 5 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16) - STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16) - - // ----------------------------------------- - // BLOCK B: 20-23,24-27 - // ----------------------------------------- - // generate 20,21,26,27 - // part of stage 1 - LOAD_FROM_TRANSPOSED(7, 5, 27) - DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(27, 21, 11) - DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16) - // part of stage 2 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); - // part of stage 3 - DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) - - // generate 22,23,24,25 - // part of stage 1 - LOAD_FROM_TRANSPOSED(11, 13, 19) - 
DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(19, 29, 3) - DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16) - // part of stage 2 - q14s16 = vsubq_s16(q4s16, q5s16); - q5s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q6s16, q7s16); - q6s16 = vaddq_s16(q6s16, q7s16); - // part of stage 3 - DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16) - // part of stage 4 - q10s16 = vaddq_s16(q7s16, q1s16); - q11s16 = vaddq_s16(q5s16, q0s16); - q12s16 = vaddq_s16(q6s16, q2s16); - q15s16 = vaddq_s16(q4s16, q3s16); - // part of stage 6 - LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16) - q8s16 = vaddq_s16(q14s16, q11s16); - q9s16 = vaddq_s16(q13s16, q10s16); - q13s16 = vsubq_s16(q13s16, q10s16); - q11s16 = vsubq_s16(q14s16, q11s16); - STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16) - LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16) - q8s16 = vsubq_s16(q9s16, q12s16); - q10s16 = vaddq_s16(q14s16, q15s16); - q14s16 = vsubq_s16(q14s16, q15s16); - q12s16 = vaddq_s16(q9s16, q12s16); - STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16) - // part of stage 7 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16) - q13s16 = q11s16; - q14s16 = q8s16; - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16) - // part of stage 4 - q14s16 = vsubq_s16(q5s16, q0s16); - q13s16 = vsubq_s16(q6s16, q2s16); - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16); - q14s16 = vsubq_s16(q7s16, q1s16); - q13s16 = vsubq_s16(q4s16, q3s16); - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16); - // part of stage 6 - LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16) - q8s16 = vaddq_s16(q14s16, q1s16); - q9s16 = vaddq_s16(q13s16, q6s16); - q13s16 = vsubq_s16(q13s16, q6s16); - q1s16 = vsubq_s16(q14s16, q1s16); - STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16) - LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16) - q14s16 = vsubq_s16(q8s16, q5s16); - q10s16 = vaddq_s16(q8s16, q5s16); - q11s16 = vaddq_s16(q9s16, q0s16); - q0s16 = vsubq_s16(q9s16, q0s16); - STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16) - // part of stage 7 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16) - DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, - &q1s16, &q0s16); - STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16) - - // ----------------------------------------- - // BLOCK C: 8-10,11-15 - // ----------------------------------------- - // generate 8,9,14,15 - // part of stage 2 - LOAD_FROM_TRANSPOSED(3, 2, 30) - DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(30, 18, 14) - DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16) - // part of stage 3 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); - // part of stage 4 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16) - - // generate 10,11,12,13 - // part of stage 2 - LOAD_FROM_TRANSPOSED(14, 10, 22) - DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(22, 26, 6) - DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16) - // part of stage 3 - q14s16 = vsubq_s16(q4s16, q5s16); - q5s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q6s16, q7s16); - q6s16 = vaddq_s16(q6s16, q7s16); - // part of stage 4 - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16) - // part of stage 5 - q8s16 = vaddq_s16(q0s16, q5s16); - q9s16 = vaddq_s16(q1s16, q7s16); - 
q13s16 = vsubq_s16(q1s16, q7s16); - q14s16 = vsubq_s16(q3s16, q4s16); - q10s16 = vaddq_s16(q3s16, q4s16); - q15s16 = vaddq_s16(q2s16, q6s16); - STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16) - STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16) - // part of stage 6 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) - STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16) - q13s16 = vsubq_s16(q0s16, q5s16); - q14s16 = vsubq_s16(q2s16, q6s16); - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) - STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16) - - // ----------------------------------------- - // BLOCK D: 0-3,4-7 - // ----------------------------------------- - // generate 4,5,6,7 - // part of stage 3 - LOAD_FROM_TRANSPOSED(6, 4, 28) - DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(28, 20, 12) - DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) - // part of stage 4 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); - // part of stage 5 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) - - // generate 0,1,2,3 - // part of stage 4 - LOAD_FROM_TRANSPOSED(12, 0, 16) - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(16, 8, 24) - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16) - // part of stage 5 - q4s16 = vaddq_s16(q7s16, q6s16); - q7s16 = vsubq_s16(q7s16, q6s16); - q6s16 = vsubq_s16(q5s16, q14s16); - q5s16 = vaddq_s16(q5s16, q14s16); - // part of stage 6 - q8s16 = vaddq_s16(q4s16, q2s16); - q9s16 = vaddq_s16(q5s16, q3s16); - q10s16 = vaddq_s16(q6s16, q1s16); - q11s16 = vaddq_s16(q7s16, q0s16); - q12s16 = vsubq_s16(q7s16, q0s16); - q13s16 = vsubq_s16(q6s16, q1s16); - q14s16 = vsubq_s16(q5s16, q3s16); - q15s16 = vsubq_s16(q4s16, q2s16); - // part of stage 7 - LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16) - q2s16 = vaddq_s16(q8s16, q1s16); - q3s16 = vaddq_s16(q9s16, q0s16); - q4s16 = vsubq_s16(q9s16, q0s16); - q5s16 = vsubq_s16(q8s16, q1s16); - LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - - if (idct32_pass_loop == 0) { - idct32_bands_end_1st_pass(out, - q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, - q10s16, q11s16, q12s16, q13s16, q14s16, q15s16); - } else { - idct32_bands_end_2nd_pass(out, dest, stride, - q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, - q10s16, q11s16, q12s16, q13s16, q14s16, q15s16); - dest += 8; - } - } - } - return; -} diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c deleted file mode 100644 index ea618700c9..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <arm_neon.h> - -#include "vpx_dsp/inv_txfm.h" -#include "vpx_ports/mem.h" - -void vpx_idct4x4_1_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8x8_t d6u8; - uint32x2_t d2u32 = vdup_n_u32(0); - uint16x8_t q8u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, a1, cospi_16_64 = 11585; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 4); - - q0s16 = vdupq_n_s16(a1); - - // dc_only_idct_add - d1 = d2 = dest; - for (i = 0; i < 2; i++) { - d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0); - d1 += dest_stride; - d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), - vreinterpret_u8_u32(d2u32)); - d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - - vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0); - d2 += dest_stride; - vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1); - d2 += dest_stride; - } - return; -} diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct4x4_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct4x4_add_neon.c deleted file mode 100644 index 3c975c99b7..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/idct4x4_add_neon.c +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <arm_neon.h> - -void vpx_idct4x4_16_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8x8_t d26u8, d27u8; - uint32x2_t d26u32, d27u32; - uint16x8_t q8u16, q9u16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16; - int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16; - int16x8_t q8s16, q9s16, q13s16, q14s16; - int32x4_t q1s32, q13s32, q14s32, q15s32; - int16x4x2_t d0x2s16, d1x2s16; - int32x4x2_t q0x2s32; - uint8_t *d; - int16_t cospi_8_64 = 15137; - int16_t cospi_16_64 = 11585; - int16_t cospi_24_64 = 6270; - - d26u32 = d27u32 = vdup_n_u32(0); - - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - - d0x2s16 = vtrn_s16(d16s16, d17s16); - d1x2s16 = vtrn_s16(d18s16, d19s16); - q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); - q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); - - d20s16 = vdup_n_s16(cospi_8_64); - d21s16 = vdup_n_s16(cospi_16_64); - - q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), - vreinterpretq_s32_s16(q9s16)); - d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - - d22s16 = vdup_n_s16(cospi_24_64); - - // stage 1 - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, d22s16); - q1s32 = vmull_s16(d17s16, d20s16); - q13s32 = vmull_s16(d23s16, d21s16); - q14s32 = vmull_s16(d24s16, d21s16); - - q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); - q1s32 = vmlal_s16(q1s32, d19s16, d22s16); - - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = 
vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q1s32, 14); - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - - // stage 2 - q8s16 = vaddq_s16(q13s16, q14s16); - q9s16 = vsubq_s16(q13s16, q14s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_high_s16(q9s16); // vswp d18 d19 - d19s16 = vget_low_s16(q9s16); - - d0x2s16 = vtrn_s16(d16s16, d17s16); - d1x2s16 = vtrn_s16(d18s16, d19s16); - q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); - q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); - - q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), - vreinterpretq_s32_s16(q9s16)); - d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - - // do the transform on columns - // stage 1 - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, d22s16); - q1s32 = vmull_s16(d17s16, d20s16); - q13s32 = vmull_s16(d23s16, d21s16); - q14s32 = vmull_s16(d24s16, d21s16); - - q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); - q1s32 = vmlal_s16(q1s32, d19s16, d22s16); - - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q1s32, 14); - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - - // stage 2 - q8s16 = vaddq_s16(q13s16, q14s16); - q9s16 = vsubq_s16(q13s16, q14s16); - - q8s16 = vrshrq_n_s16(q8s16, 4); - q9s16 = vrshrq_n_s16(q9s16, 4); - - d = dest; - d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0); - d += dest_stride; - d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1); - d += dest_stride; - d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1); - d += dest_stride; - d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0); - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_u32(d26u32)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_u32(d27u32)); - - d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - - d = dest; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0); - return; -} diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c deleted file mode 100644 index c1b801fad5..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <arm_neon.h> - -#include "vpx_dsp/inv_txfm.h" -#include "vpx_ports/mem.h" - -void vpx_idct8x8_1_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8x8_t d2u8, d3u8, d30u8, d31u8; - uint64x1_t d2u64, d3u64, d4u64, d5u64; - uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, a1, cospi_16_64 = 11585; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 5); - - q0s16 = vdupq_n_s16(a1); - q0u16 = vreinterpretq_u16_s16(q0s16); - - d1 = d2 = dest; - for (i = 0; i < 2; i++) { - d2u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d4u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d5u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - - q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); - q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); - q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); - q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); - - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8)); - d2 += dest_stride; - } - return; -} diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct8x8_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct8x8_add_neon.c deleted file mode 100644 index 4b2c2a6f83..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/idct8x8_add_neon.c +++ /dev/null @@ -1,540 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <arm_neon.h> - -#include "./vpx_config.h" -#include "vpx_dsp/txfm_common.h" - -static INLINE void TRANSPOSE8X8( - int16x8_t *q8s16, - int16x8_t *q9s16, - int16x8_t *q10s16, - int16x8_t *q11s16, - int16x8_t *q12s16, - int16x8_t *q13s16, - int16x8_t *q14s16, - int16x8_t *q15s16) { - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; - int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 - *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 - *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 - *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 - *q12s16 = vcombine_s16(d17s16, d25s16); - *q13s16 = vcombine_s16(d19s16, d27s16); - *q14s16 = vcombine_s16(d21s16, d29s16); - *q15s16 = vcombine_s16(d23s16, d31s16); - - q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16), - vreinterpretq_s32_s16(*q10s16)); - q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16), - vreinterpretq_s32_s16(*q11s16)); - q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16), - vreinterpretq_s32_s16(*q14s16)); - q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16), - vreinterpretq_s32_s16(*q15s16)); - - q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 - vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 - q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 - vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 - q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 - vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 - q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 - vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 - - *q8s16 = q0x2s16.val[0]; - *q9s16 = q0x2s16.val[1]; - *q10s16 = q1x2s16.val[0]; - *q11s16 = q1x2s16.val[1]; - *q12s16 = q2x2s16.val[0]; - *q13s16 = q2x2s16.val[1]; - *q14s16 = q3x2s16.val[0]; - *q15s16 = q3x2s16.val[1]; - return; -} - -static INLINE void IDCT8x8_1D( - int16x8_t *q8s16, - int16x8_t *q9s16, - int16x8_t *q10s16, - int16x8_t *q11s16, - int16x8_t *q12s16, - int16x8_t *q13s16, - int16x8_t *q14s16, - int16x8_t *q15s16) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - - d0s16 = vdup_n_s16(cospi_28_64); - d1s16 = vdup_n_s16(cospi_4_64); - d2s16 = vdup_n_s16(cospi_12_64); - d3s16 = vdup_n_s16(cospi_20_64); - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - 
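// Editor's annotation (not part of the deleted libvpx file): the d0s16..d3s16
// constants loaded above hold DCT cosines in Q14 fixed point -- cospi_k_64 is
// approximately cos(k*pi/64) * (1 << 14), e.g. cospi_16_64 = 11585 ~ 16384/sqrt(2).
// Each vmull_s16 / vmlal_s16 / vmlsl_s16 group below therefore accumulates
// a*c1 +/- b*c2 in 32-bit intermediates, and vqrshrn_n_s32(acc, 14) narrows back
// to 16 bits with rounding and saturation, roughly:
//   result = saturate_int16((acc + (1 << 13)) >> 14);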
d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d26s16, d2s16); - q6s32 = vmull_s16(d27s16, d2s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); - q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); - - d8s16 = vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d10s16 = vqrshrn_n_s32(q5s32, 14); - d11s16 = vqrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q2s32 = vmull_s16(d18s16, d1s16); - q3s32 = vmull_s16(d19s16, d1s16); - q9s32 = vmull_s16(d26s16, d3s16); - q13s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlal_s16(q2s32, d30s16, d0s16); - q3s32 = vmlal_s16(q3s32, d31s16, d0s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q13s32 = vmlal_s16(q13s32, d23s16, d2s16); - - d14s16 = vqrshrn_n_s32(q2s32, 14); - d15s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q13s32, 14); - q6s16 = vcombine_s16(d12s16, d13s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d0s16 = vdup_n_s16(cospi_16_64); - - q2s32 = vmull_s16(d16s16, d0s16); - q3s32 = vmull_s16(d17s16, d0s16); - q13s32 = vmull_s16(d16s16, d0s16); - q15s32 = vmull_s16(d17s16, d0s16); - - q2s32 = vmlal_s16(q2s32, d24s16, d0s16); - q3s32 = vmlal_s16(q3s32, d25s16, d0s16); - q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); - q15s32 = vmlsl_s16(q15s32, d25s16, d0s16); - - d0s16 = vdup_n_s16(cospi_24_64); - d1s16 = vdup_n_s16(cospi_8_64); - - d18s16 = vqrshrn_n_s32(q2s32, 14); - d19s16 = vqrshrn_n_s32(q3s32, 14); - d22s16 = vqrshrn_n_s32(q13s32, 14); - d23s16 = vqrshrn_n_s32(q15s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - *q11s16 = vcombine_s16(d22s16, d23s16); - - q2s32 = vmull_s16(d20s16, d0s16); - q3s32 = vmull_s16(d21s16, d0s16); - q8s32 = vmull_s16(d20s16, d1s16); - q12s32 = vmull_s16(d21s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d29s16, d1s16); - q8s32 = vmlal_s16(q8s32, d28s16, d0s16); - q12s32 = vmlal_s16(q12s32, d29s16, d0s16); - - d26s16 = vqrshrn_n_s32(q2s32, 14); - d27s16 = vqrshrn_n_s32(q3s32, 14); - d30s16 = vqrshrn_n_s32(q8s32, 14); - d31s16 = vqrshrn_n_s32(q12s32, 14); - *q13s16 = vcombine_s16(d26s16, d27s16); - *q15s16 = vcombine_s16(d30s16, d31s16); - - q0s16 = vaddq_s16(*q9s16, *q15s16); - q1s16 = vaddq_s16(*q11s16, *q13s16); - q2s16 = vsubq_s16(*q11s16, *q13s16); - q3s16 = vsubq_s16(*q9s16, *q15s16); - - *q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - *q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - - d16s16 = vdup_n_s16(cospi_16_64); - - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = 
vqrshrn_n_s32(q10s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - *q8s16 = vaddq_s16(q0s16, q7s16); - *q9s16 = vaddq_s16(q1s16, q6s16); - *q10s16 = vaddq_s16(q2s16, q5s16); - *q11s16 = vaddq_s16(q3s16, q4s16); - *q12s16 = vsubq_s16(q3s16, q4s16); - *q13s16 = vsubq_s16(q2s16, q5s16); - *q14s16 = vsubq_s16(q1s16, q6s16); - *q15s16 = vsubq_s16(q0s16, q7s16); - return; -} - -void vpx_idct8x8_64_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 16); - q11s16 = vld1q_s16(input + 24); - q12s16 = vld1q_s16(input + 32); - q13s16 = vld1q_s16(input + 40); - q14s16 = vld1q_s16(input + 48); - q15s16 = vld1q_s16(input + 56); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - d1 = d2 = dest; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), - vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), - vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - - q8s16 = q12s16; - q9s16 = q13s16; - q10s16 = q14s16; - q11s16 = q15s16; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), - vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), - vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = 
vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - return; -} - -void vpx_idct8x8_12_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16; - int16x4_t d26s16, d27s16, d28s16, d29s16; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - int32x4_t q9s32, q10s32, q11s32, q12s32; - - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 16); - q11s16 = vld1q_s16(input + 24); - q12s16 = vld1q_s16(input + 32); - q13s16 = vld1q_s16(input + 40); - q14s16 = vld1q_s16(input + 48); - q15s16 = vld1q_s16(input + 56); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - // First transform rows - // stage 1 - q0s16 = vdupq_n_s16(cospi_28_64 * 2); - q1s16 = vdupq_n_s16(cospi_4_64 * 2); - - q4s16 = vqrdmulhq_s16(q9s16, q0s16); - - q0s16 = vdupq_n_s16(-cospi_20_64 * 2); - - q7s16 = vqrdmulhq_s16(q9s16, q1s16); - - q1s16 = vdupq_n_s16(cospi_12_64 * 2); - - q5s16 = vqrdmulhq_s16(q11s16, q0s16); - - q0s16 = vdupq_n_s16(cospi_16_64 * 2); - - q6s16 = vqrdmulhq_s16(q11s16, q1s16); - - // stage 2 & stage 3 - even half - q1s16 = vdupq_n_s16(cospi_24_64 * 2); - - q9s16 = vqrdmulhq_s16(q8s16, q0s16); - - q0s16 = vdupq_n_s16(cospi_8_64 * 2); - - q13s16 = vqrdmulhq_s16(q10s16, q1s16); - - q15s16 = vqrdmulhq_s16(q10s16, q0s16); - - // stage 3 -odd half - q0s16 = vaddq_s16(q9s16, q15s16); - q1s16 = vaddq_s16(q9s16, q13s16); - q2s16 = vsubq_s16(q9s16, q13s16); - q3s16 = vsubq_s16(q9s16, q15s16); - - // stage 2 - odd half - q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - d16s16 = vdup_n_s16(cospi_16_64); - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = vqrshrn_n_s32(q10s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 4 - q8s16 = vaddq_s16(q0s16, q7s16); - q9s16 = vaddq_s16(q1s16, q6s16); - q10s16 = vaddq_s16(q2s16, q5s16); - q11s16 = vaddq_s16(q3s16, q4s16); - q12s16 = vsubq_s16(q3s16, q4s16); - q13s16 = vsubq_s16(q2s16, q5s16); - q14s16 = vsubq_s16(q1s16, q6s16); - q15s16 = vsubq_s16(q0s16, q7s16); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = 
vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - d1 = d2 = dest; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), - vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), - vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - - q8s16 = q12s16; - q9s16 = q13s16; - q10s16 = q14s16; - q11s16 = q15s16; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), - vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), - vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - return; -} diff --git a/thirdparty/libvpx/vpx_dsp/arm/intrapred_neon.c b/thirdparty/libvpx/vpx_dsp/arm/intrapred_neon.c deleted file mode 100644 index 0a376104d2..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/intrapred_neon.c +++ /dev/null @@ -1,822 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <arm_neon.h> - -#include "./vpx_config.h" -#include "./vpx_dsp_rtcd.h" -#include "vpx/vpx_integer.h" - -//------------------------------------------------------------------------------ -// DC 4x4 - -// 'do_above' and 'do_left' facilitate branch removal when inlined. 
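// Editor's sketch (not part of the deleted libvpx file): a scalar reference for
// the dc_4x4() helper that follows, under the same do_above/do_left contract.
// It spells out what the NEON vpaddl/vpadd reductions and the vrshrn_n_u16
// shifts (3 when both borders are used, 2 when only one is) compute: a rounded
// mean of the available border pixels, with 0x80 as the fallback. The helper
// name is hypothetical; types come from the file's existing includes.
static void dc_4x4_scalar_ref(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left,
                              int do_above, int do_left) {
  int sum = 0, count = 0, dc, r, c;
  if (do_above) { for (c = 0; c < 4; ++c) sum += above[c]; count += 4; }
  if (do_left)  { for (r = 0; r < 4; ++r) sum += left[r];  count += 4; }
  dc = count ? (sum + count / 2) / count : 0x80;  // rounded average, 0x80 if no borders
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      dst[r * stride + c] = (uint8_t)dc;
}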
-static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; - - if (do_above) { - const uint8x8_t A = vld1_u8(above); // top row - const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top - const uint16x4_t p1 = vpadd_u16(p0, p0); - sum_top = vcombine_u16(p1, p1); - } - - if (do_left) { - const uint8x8_t L = vld1_u8(left); // left border - const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left - const uint16x4_t p1 = vpadd_u16(p0, p0); - sum_left = vcombine_u16(p1, p1); - } - - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 3); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 2); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 2); - } else { - dc0 = vdup_n_u8(0x80); - } - - { - const uint8x8_t dc = vdup_lane_u8(dc0, 0); - int i; - for (i = 0; i < 4; ++i) { - vst1_lane_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc), 0); - } - } -} - -void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_4x4(dst, stride, above, left, 1, 1); -} - -void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - dc_4x4(dst, stride, NULL, left, 0, 1); -} - -void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - dc_4x4(dst, stride, above, NULL, 1, 0); -} - -void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - dc_4x4(dst, stride, NULL, NULL, 0, 0); -} - -//------------------------------------------------------------------------------ -// DC 8x8 - -// 'do_above' and 'do_left' facilitate branch removal when inlined. 
-static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; - - if (do_above) { - const uint8x8_t A = vld1_u8(above); // top row - const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top - const uint16x4_t p1 = vpadd_u16(p0, p0); - const uint16x4_t p2 = vpadd_u16(p1, p1); - sum_top = vcombine_u16(p2, p2); - } - - if (do_left) { - const uint8x8_t L = vld1_u8(left); // left border - const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left - const uint16x4_t p1 = vpadd_u16(p0, p0); - const uint16x4_t p2 = vpadd_u16(p1, p1); - sum_left = vcombine_u16(p2, p2); - } - - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 4); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 3); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 3); - } else { - dc0 = vdup_n_u8(0x80); - } - - { - const uint8x8_t dc = vdup_lane_u8(dc0, 0); - int i; - for (i = 0; i < 8; ++i) { - vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc)); - } - } -} - -void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_8x8(dst, stride, above, left, 1, 1); -} - -void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - dc_8x8(dst, stride, NULL, left, 0, 1); -} - -void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - dc_8x8(dst, stride, above, NULL, 1, 0); -} - -void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - dc_8x8(dst, stride, NULL, NULL, 0, 0); -} - -//------------------------------------------------------------------------------ -// DC 16x16 - -// 'do_above' and 'do_left' facilitate branch removal when inlined. 
-static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; - - if (do_above) { - const uint8x16_t A = vld1q_u8(above); // top row - const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top - const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - const uint16x4_t p2 = vpadd_u16(p1, p1); - const uint16x4_t p3 = vpadd_u16(p2, p2); - sum_top = vcombine_u16(p3, p3); - } - - if (do_left) { - const uint8x16_t L = vld1q_u8(left); // left row - const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left - const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - const uint16x4_t p2 = vpadd_u16(p1, p1); - const uint16x4_t p3 = vpadd_u16(p2, p2); - sum_left = vcombine_u16(p3, p3); - } - - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 5); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 4); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 4); - } else { - dc0 = vdup_n_u8(0x80); - } - - { - const uint8x16_t dc = vdupq_lane_u8(dc0, 0); - int i; - for (i = 0; i < 16; ++i) { - vst1q_u8(dst + i * stride, dc); - } - } -} - -void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_16x16(dst, stride, above, left, 1, 1); -} - -void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - dc_16x16(dst, stride, NULL, left, 0, 1); -} - -void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)left; - dc_16x16(dst, stride, above, NULL, 1, 0); -} - -void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - (void)left; - dc_16x16(dst, stride, NULL, NULL, 0, 0); -} - -//------------------------------------------------------------------------------ -// DC 32x32 - -// 'do_above' and 'do_left' facilitate branch removal when inlined. 
-static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; - - if (do_above) { - const uint8x16_t A0 = vld1q_u8(above); // top row - const uint8x16_t A1 = vld1q_u8(above + 16); - const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top - const uint16x8_t p1 = vpaddlq_u8(A1); - const uint16x8_t p2 = vaddq_u16(p0, p1); - const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - const uint16x4_t p4 = vpadd_u16(p3, p3); - const uint16x4_t p5 = vpadd_u16(p4, p4); - sum_top = vcombine_u16(p5, p5); - } - - if (do_left) { - const uint8x16_t L0 = vld1q_u8(left); // left row - const uint8x16_t L1 = vld1q_u8(left + 16); - const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left - const uint16x8_t p1 = vpaddlq_u8(L1); - const uint16x8_t p2 = vaddq_u16(p0, p1); - const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - const uint16x4_t p4 = vpadd_u16(p3, p3); - const uint16x4_t p5 = vpadd_u16(p4, p4); - sum_left = vcombine_u16(p5, p5); - } - - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 6); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 5); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 5); - } else { - dc0 = vdup_n_u8(0x80); - } - - { - const uint8x16_t dc = vdupq_lane_u8(dc0, 0); - int i; - for (i = 0; i < 32; ++i) { - vst1q_u8(dst + i * stride, dc); - vst1q_u8(dst + i * stride + 16, dc); - } - } -} - -void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_32x32(dst, stride, above, left, 1, 1); -} - -void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - dc_32x32(dst, stride, NULL, left, 0, 1); -} - -void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)left; - dc_32x32(dst, stride, above, NULL, 1, 0); -} - -void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - (void)left; - dc_32x32(dst, stride, NULL, NULL, 0, 0); -} - -// ----------------------------------------------------------------------------- - -void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above)); // top row - const uint64x1_t A1 = vshr_n_u64(A0, 8); - const uint64x1_t A2 = vshr_n_u64(A0, 16); - const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0); - const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1); - const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2); - const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00); - const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0); - const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); - const uint32x2_t r0 = vreinterpret_u32_u8(avg2); - const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); - const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); - const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); - (void)left; - vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); - vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); - vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); - vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); - dst[3 * stride + 3] = above[7]; -} - -void 
vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 }; - static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 }; - const uint8x8_t sh_12345677 = vld1_u8(shuffle1); - const uint8x8_t sh_23456777 = vld1_u8(shuffle2); - const uint8x8_t A0 = vld1_u8(above); // top row - const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677); - const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777); - const uint8x8_t avg1 = vhadd_u8(A0, A2); - uint8x8_t row = vrhadd_u8(avg1, A1); - int i; - (void)left; - for (i = 0; i < 7; ++i) { - vst1_u8(dst + i * stride, row); - row = vtbl1_u8(row, sh_12345677); - } - vst1_u8(dst + i * stride, row); -} - -void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const uint8x16_t A0 = vld1q_u8(above); // top row - const uint8x16_t above_right = vld1q_dup_u8(above + 15); - const uint8x16_t A1 = vextq_u8(A0, above_right, 1); - const uint8x16_t A2 = vextq_u8(A0, above_right, 2); - const uint8x16_t avg1 = vhaddq_u8(A0, A2); - uint8x16_t row = vrhaddq_u8(avg1, A1); - int i; - (void)left; - for (i = 0; i < 15; ++i) { - vst1q_u8(dst + i * stride, row); - row = vextq_u8(row, above_right, 1); - } - vst1q_u8(dst + i * stride, row); -} - -// ----------------------------------------------------------------------------- - -void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const uint8x8_t XABCD_u8 = vld1_u8(above - 1); - const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8); - const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32); - const uint32x2_t zero = vdup_n_u32(0); - const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0); - const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL); - const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8)); - const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC); - const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8)); - const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16)); - const uint8_t D = vget_lane_u8(XABCD_u8, 4); - const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6); - const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC); - const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8); - const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_); - const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); - const uint32x2_t r3 = vreinterpret_u32_u8(avg2); - const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); - const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); - const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); - vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); - vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); - vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); - vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); -} - -#if !HAVE_NEON_ASM - -void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int i; - uint32x2_t d0u32 = vdup_n_u32(0); - (void)left; - - d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0); - for (i = 0; i < 4; i++, dst += stride) - vst1_lane_u32((uint32_t *)dst, d0u32, 0); -} - -void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int i; - uint8x8_t d0u8 = vdup_n_u8(0); - (void)left; - - d0u8 = vld1_u8(above); - for (i = 0; i < 8; i++, dst += stride) - vst1_u8(dst, 
d0u8); -} - -void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int i; - uint8x16_t q0u8 = vdupq_n_u8(0); - (void)left; - - q0u8 = vld1q_u8(above); - for (i = 0; i < 16; i++, dst += stride) - vst1q_u8(dst, q0u8); -} - -void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int i; - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); - (void)left; - - q0u8 = vld1q_u8(above); - q1u8 = vld1q_u8(above + 16); - for (i = 0; i < 32; i++, dst += stride) { - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - } -} - -void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - uint8x8_t d0u8 = vdup_n_u8(0); - uint32x2_t d1u32 = vdup_n_u32(0); - (void)above; - - d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0); - - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); -} - -void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - uint8x8_t d0u8 = vdup_n_u8(0); - uint64x1_t d1u64 = vdup_n_u64(0); - (void)above; - - d1u64 = vld1_u64((const uint64_t *)left); - - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7); - vst1_u8(dst, d0u8); -} - -void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int j; - uint8x8_t d2u8 = vdup_n_u8(0); - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); - (void)above; - - q1u8 = vld1q_u8(left); - d2u8 = vget_low_u8(q1u8); - for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { - q0u8 = vdupq_lane_u8(d2u8, 0); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 1); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 2); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 3); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 4); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 5); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 6); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 7); - vst1q_u8(dst, q0u8); - dst += stride; - } -} - -void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int j, k; - uint8x8_t d2u8 = vdup_n_u8(0); - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t 
q1u8 = vdupq_n_u8(0); - (void)above; - - for (k = 0; k < 2; k++, left += 16) { - q1u8 = vld1q_u8(left); - d2u8 = vget_low_u8(q1u8); - for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { - q0u8 = vdupq_lane_u8(d2u8, 0); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 1); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 2); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 3); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 4); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 5); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 6); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 7); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - } - } -} - -void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int i; - uint16x8_t q1u16, q3u16; - int16x8_t q1s16; - uint8x8_t d0u8 = vdup_n_u8(0); - uint32x2_t d2u32 = vdup_n_u32(0); - - d0u8 = vld1_dup_u8(above - 1); - d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0); - q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); - for (i = 0; i < 4; i++, dst += stride) { - q1u16 = vdupq_n_u16((uint16_t)left[i]); - q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16), - vreinterpretq_s16_u16(q3u16)); - d0u8 = vqmovun_s16(q1s16); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - } -} - -void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int j; - uint16x8_t q0u16, q3u16, q10u16; - int16x8_t q0s16; - uint16x4_t d20u16; - uint8x8_t d0u8, d2u8, d30u8; - - d0u8 = vld1_dup_u8(above - 1); - d30u8 = vld1_u8(left); - d2u8 = vld1_u8(above); - q10u16 = vmovl_u8(d30u8); - q3u16 = vsubl_u8(d2u8, d0u8); - d20u16 = vget_low_u16(q10u16); - for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { - q0u16 = vdupq_lane_u16(d20u16, 0); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), - vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 1); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), - vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 2); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), - vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 3); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), - vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - } -} - -void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int j, k; - uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16; - uint8x16_t q0u8, q1u8; - int16x8_t q0s16, q1s16, q8s16, q11s16; - uint16x4_t d20u16; - uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8; - - q0u8 = vld1q_dup_u8(above - 1); - q1u8 = vld1q_u8(above); - q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); - q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); - for (k = 0; k < 2; k++, left += 8) { - 
d18u8 = vld1_u8(left); - q10u16 = vmovl_u8(d18u8); - d20u16 = vget_low_u16(q10u16); - for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { - q0u16 = vdupq_lane_u16(d20u16, 0); - q8u16 = vdupq_lane_u16(d20u16, 1); - q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q2u16)); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q3u16)); - q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), - vreinterpretq_s16_u16(q2u16)); - q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), - vreinterpretq_s16_u16(q3u16)); - d2u8 = vqmovun_s16(q1s16); - d3u8 = vqmovun_s16(q0s16); - d22u8 = vqmovun_s16(q11s16); - d23u8 = vqmovun_s16(q8s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); - dst += stride; - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d20u16, 2); - q8u16 = vdupq_lane_u16(d20u16, 3); - q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q2u16)); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q3u16)); - q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), - vreinterpretq_s16_u16(q2u16)); - q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), - vreinterpretq_s16_u16(q3u16)); - d2u8 = vqmovun_s16(q1s16); - d3u8 = vqmovun_s16(q0s16); - d22u8 = vqmovun_s16(q11s16); - d23u8 = vqmovun_s16(q8s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); - dst += stride; - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); - dst += stride; - } - } -} - -void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int j, k; - uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16; - uint8x16_t q0u8, q1u8, q2u8; - int16x8_t q12s16, q13s16, q14s16, q15s16; - uint16x4_t d6u16; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8; - - q0u8 = vld1q_dup_u8(above - 1); - q1u8 = vld1q_u8(above); - q2u8 = vld1q_u8(above + 16); - q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); - q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); - q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8)); - q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8)); - for (k = 0; k < 4; k++, left += 8) { - d26u8 = vld1_u8(left); - q3u16 = vmovl_u8(d26u8); - d6u16 = vget_low_u16(q3u16); - for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) { - q0u16 = vdupq_lane_u16(d6u16, 0); - q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q8u16)); - q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d6u16, 1); - q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q8u16)); - q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - 
vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d6u16, 2); - q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q8u16)); - q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d6u16, 3); - q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q8u16)); - q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; - } - } -} -#endif // !HAVE_NEON_ASM diff --git a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_16_neon.c b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_16_neon.c deleted file mode 100644 index d24e6adc8a..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_16_neon.c +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <arm_neon.h> - -#include "./vpx_dsp_rtcd.h" -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" - -static INLINE void loop_filter_neon_16( - uint8x16_t qblimit, // blimit - uint8x16_t qlimit, // limit - uint8x16_t qthresh, // thresh - uint8x16_t q3, // p3 - uint8x16_t q4, // p2 - uint8x16_t q5, // p1 - uint8x16_t q6, // p0 - uint8x16_t q7, // q0 - uint8x16_t q8, // q1 - uint8x16_t q9, // q2 - uint8x16_t q10, // q3 - uint8x16_t *q5r, // p1 - uint8x16_t *q6r, // p0 - uint8x16_t *q7r, // q0 - uint8x16_t *q8r) { // q1 - uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8; - int16x8_t q2s16, q11s16; - uint16x8_t q4u16; - int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8; - int8x8_t d2s8, d3s8; - - q11u8 = vabdq_u8(q3, q4); - q12u8 = vabdq_u8(q4, q5); - q13u8 = vabdq_u8(q5, q6); - q14u8 = vabdq_u8(q8, q7); - q3 = vabdq_u8(q9, q8); - q4 = vabdq_u8(q10, q9); - - q11u8 = vmaxq_u8(q11u8, q12u8); - q12u8 = vmaxq_u8(q13u8, q14u8); - q3 = vmaxq_u8(q3, q4); - q15u8 = vmaxq_u8(q11u8, q12u8); - - q9 = vabdq_u8(q6, q7); - - // vp8_hevmask - q13u8 = vcgtq_u8(q13u8, qthresh); - q14u8 = vcgtq_u8(q14u8, qthresh); - q15u8 = vmaxq_u8(q15u8, q3); - - q2u8 = vabdq_u8(q5, q8); - q9 = vqaddq_u8(q9, q9); - - q15u8 = vcgeq_u8(qlimit, q15u8); - - // vp8_filter() function - // convert to signed - q10 = vdupq_n_u8(0x80); - q8 = veorq_u8(q8, q10); - q7 = veorq_u8(q7, q10); - q6 = veorq_u8(q6, q10); - q5 = veorq_u8(q5, q10); - - q2u8 = vshrq_n_u8(q2u8, 1); - q9 = vqaddq_u8(q9, q2u8); - - q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), - vget_low_s8(vreinterpretq_s8_u8(q6))); - q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), - vget_high_s8(vreinterpretq_s8_u8(q6))); - - q9 = vcgeq_u8(qblimit, q9); - - q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), - vreinterpretq_s8_u8(q8)); - - q14u8 = vorrq_u8(q13u8, q14u8); - - q4u16 = vdupq_n_u16(3); - q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16)); - q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16)); - - q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8); - q15u8 = vandq_u8(q15u8, q9); - - q1s8 = vreinterpretq_s8_u8(q1u8); - q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); - q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8)); - - q4 = vdupq_n_u8(3); - q9 = vdupq_n_u8(4); - // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) - d2s8 = vqmovn_s16(q2s16); - d3s8 = vqmovn_s16(q11s16); - q1s8 = vcombine_s8(d2s8, d3s8); - q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8); - q1s8 = vreinterpretq_s8_u8(q1u8); - - q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4)); - q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9)); - q2s8 = vshrq_n_s8(q2s8, 3); - q1s8 = vshrq_n_s8(q1s8, 3); - - q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8); - q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8); - - q1s8 = vrshrq_n_s8(q1s8, 1); - q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); - - q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8); - q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8); - - *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10); - *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10); - *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10); - *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10); - return; -} - -void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1; - uint8x16_t qblimit, qlimit, qthresh; - uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, 
q8u8, q9u8, q10u8; - - dblimit0 = vld1_u8(blimit0); - dlimit0 = vld1_u8(limit0); - dthresh0 = vld1_u8(thresh0); - dblimit1 = vld1_u8(blimit1); - dlimit1 = vld1_u8(limit1); - dthresh1 = vld1_u8(thresh1); - qblimit = vcombine_u8(dblimit0, dblimit1); - qlimit = vcombine_u8(dlimit0, dlimit1); - qthresh = vcombine_u8(dthresh0, dthresh1); - - s -= (p << 2); - - q3u8 = vld1q_u8(s); - s += p; - q4u8 = vld1q_u8(s); - s += p; - q5u8 = vld1q_u8(s); - s += p; - q6u8 = vld1q_u8(s); - s += p; - q7u8 = vld1q_u8(s); - s += p; - q8u8 = vld1q_u8(s); - s += p; - q9u8 = vld1q_u8(s); - s += p; - q10u8 = vld1q_u8(s); - - loop_filter_neon_16(qblimit, qlimit, qthresh, - q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, - &q5u8, &q6u8, &q7u8, &q8u8); - - s -= (p * 5); - vst1q_u8(s, q5u8); - s += p; - vst1q_u8(s, q6u8); - s += p; - vst1q_u8(s, q7u8); - s += p; - vst1q_u8(s, q8u8); - return; -} diff --git a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_4_neon.c b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_4_neon.c deleted file mode 100644 index 7f3ee70b94..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_4_neon.c +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <arm_neon.h> - -#include "./vpx_dsp_rtcd.h" - -static INLINE void loop_filter_neon( - uint8x8_t dblimit, // flimit - uint8x8_t dlimit, // limit - uint8x8_t dthresh, // thresh - uint8x8_t d3u8, // p3 - uint8x8_t d4u8, // p2 - uint8x8_t d5u8, // p1 - uint8x8_t d6u8, // p0 - uint8x8_t d7u8, // q0 - uint8x8_t d16u8, // q1 - uint8x8_t d17u8, // q2 - uint8x8_t d18u8, // q3 - uint8x8_t *d4ru8, // p1 - uint8x8_t *d5ru8, // p0 - uint8x8_t *d6ru8, // q0 - uint8x8_t *d7ru8) { // q1 - uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8; - int16x8_t q12s16; - int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8; - - d19u8 = vabd_u8(d3u8, d4u8); - d20u8 = vabd_u8(d4u8, d5u8); - d21u8 = vabd_u8(d5u8, d6u8); - d22u8 = vabd_u8(d16u8, d7u8); - d3u8 = vabd_u8(d17u8, d16u8); - d4u8 = vabd_u8(d18u8, d17u8); - - d19u8 = vmax_u8(d19u8, d20u8); - d20u8 = vmax_u8(d21u8, d22u8); - d3u8 = vmax_u8(d3u8, d4u8); - d23u8 = vmax_u8(d19u8, d20u8); - - d17u8 = vabd_u8(d6u8, d7u8); - - d21u8 = vcgt_u8(d21u8, dthresh); - d22u8 = vcgt_u8(d22u8, dthresh); - d23u8 = vmax_u8(d23u8, d3u8); - - d28u8 = vabd_u8(d5u8, d16u8); - d17u8 = vqadd_u8(d17u8, d17u8); - - d23u8 = vcge_u8(dlimit, d23u8); - - d18u8 = vdup_n_u8(0x80); - d5u8 = veor_u8(d5u8, d18u8); - d6u8 = veor_u8(d6u8, d18u8); - d7u8 = veor_u8(d7u8, d18u8); - d16u8 = veor_u8(d16u8, d18u8); - - d28u8 = vshr_n_u8(d28u8, 1); - d17u8 = vqadd_u8(d17u8, d28u8); - - d19u8 = vdup_n_u8(3); - - d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), - vreinterpret_s8_u8(d6u8)); - - d17u8 = vcge_u8(dblimit, d17u8); - - d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), - vreinterpret_s8_u8(d16u8)); - - d22u8 = vorr_u8(d21u8, d22u8); - - q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8)); - - d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8); - d23u8 = vand_u8(d23u8, d17u8); - - q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8)); - - d17u8 = vdup_n_u8(4); - - d27s8 = vqmovn_s16(q12s16); - d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8); - d27s8 = 
vreinterpret_s8_u8(d27u8); - - d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8)); - d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8)); - d28s8 = vshr_n_s8(d28s8, 3); - d27s8 = vshr_n_s8(d27s8, 3); - - d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8); - d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8); - - d27s8 = vrshr_n_s8(d27s8, 1); - d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8)); - - d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8); - d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8); - - *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8); - *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8); - *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8); - *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8); - return; -} - -void vpx_lpf_horizontal_4_neon( - uint8_t *src, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - int i; - uint8_t *s, *psrc; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - psrc = src - (pitch << 2); - for (i = 0; i < 1; i++) { - s = psrc + i * 8; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - loop_filter_neon(dblimit, dlimit, dthresh, - d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, - &d4u8, &d5u8, &d6u8, &d7u8); - - s -= (pitch * 5); - vst1_u8(s, d4u8); - s += pitch; - vst1_u8(s, d5u8); - s += pitch; - vst1_u8(s, d6u8); - s += pitch; - vst1_u8(s, d7u8); - } - return; -} - -void vpx_lpf_vertical_4_neon( - uint8_t *src, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - int i, pitch8; - uint8_t *s; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; - uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; - uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; - uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; - uint8x8x4_t d4Result; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - pitch8 = pitch * 8; - for (i = 0; i < 1; i++, src += pitch8) { - s = src - (i + 1) * 4; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), - vreinterpret_u32_u8(d7u8)); - d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), - vreinterpret_u32_u8(d16u8)); - d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), - vreinterpret_u32_u8(d17u8)); - d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), - vreinterpret_u32_u8(d18u8)); - - d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), - vreinterpret_u16_u32(d2tmp2.val[0])); - d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), - vreinterpret_u16_u32(d2tmp3.val[0])); - d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), - vreinterpret_u16_u32(d2tmp2.val[1])); - d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), - vreinterpret_u16_u32(d2tmp3.val[1])); - - d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), - vreinterpret_u8_u16(d2tmp5.val[0])); - d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), - vreinterpret_u8_u16(d2tmp5.val[1])); - d2tmp10 = 
vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), - vreinterpret_u8_u16(d2tmp7.val[0])); - d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), - vreinterpret_u8_u16(d2tmp7.val[1])); - - d3u8 = d2tmp8.val[0]; - d4u8 = d2tmp8.val[1]; - d5u8 = d2tmp9.val[0]; - d6u8 = d2tmp9.val[1]; - d7u8 = d2tmp10.val[0]; - d16u8 = d2tmp10.val[1]; - d17u8 = d2tmp11.val[0]; - d18u8 = d2tmp11.val[1]; - - loop_filter_neon(dblimit, dlimit, dthresh, - d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, - &d4u8, &d5u8, &d6u8, &d7u8); - - d4Result.val[0] = d4u8; - d4Result.val[1] = d5u8; - d4Result.val[2] = d6u8; - d4Result.val[3] = d7u8; - - src -= 2; - vst4_lane_u8(src, d4Result, 0); - src += pitch; - vst4_lane_u8(src, d4Result, 1); - src += pitch; - vst4_lane_u8(src, d4Result, 2); - src += pitch; - vst4_lane_u8(src, d4Result, 3); - src += pitch; - vst4_lane_u8(src, d4Result, 4); - src += pitch; - vst4_lane_u8(src, d4Result, 5); - src += pitch; - vst4_lane_u8(src, d4Result, 6); - src += pitch; - vst4_lane_u8(src, d4Result, 7); - } - return; -} diff --git a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_8_neon.c b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_8_neon.c deleted file mode 100644 index ec3757380d..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_8_neon.c +++ /dev/null @@ -1,445 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <arm_neon.h> - -#include "./vpx_dsp_rtcd.h" - -static INLINE void mbloop_filter_neon( - uint8x8_t dblimit, // mblimit - uint8x8_t dlimit, // limit - uint8x8_t dthresh, // thresh - uint8x8_t d3u8, // p2 - uint8x8_t d4u8, // p2 - uint8x8_t d5u8, // p1 - uint8x8_t d6u8, // p0 - uint8x8_t d7u8, // q0 - uint8x8_t d16u8, // q1 - uint8x8_t d17u8, // q2 - uint8x8_t d18u8, // q3 - uint8x8_t *d0ru8, // p1 - uint8x8_t *d1ru8, // p1 - uint8x8_t *d2ru8, // p0 - uint8x8_t *d3ru8, // q0 - uint8x8_t *d4ru8, // q1 - uint8x8_t *d5ru8) { // q1 - uint32_t flat; - uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8; - uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; - int16x8_t q15s16; - uint16x8_t q10u16, q14u16; - int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8; - - d19u8 = vabd_u8(d3u8, d4u8); - d20u8 = vabd_u8(d4u8, d5u8); - d21u8 = vabd_u8(d5u8, d6u8); - d22u8 = vabd_u8(d16u8, d7u8); - d23u8 = vabd_u8(d17u8, d16u8); - d24u8 = vabd_u8(d18u8, d17u8); - - d19u8 = vmax_u8(d19u8, d20u8); - d20u8 = vmax_u8(d21u8, d22u8); - - d25u8 = vabd_u8(d6u8, d4u8); - - d23u8 = vmax_u8(d23u8, d24u8); - - d26u8 = vabd_u8(d7u8, d17u8); - - d19u8 = vmax_u8(d19u8, d20u8); - - d24u8 = vabd_u8(d6u8, d7u8); - d27u8 = vabd_u8(d3u8, d6u8); - d28u8 = vabd_u8(d18u8, d7u8); - - d19u8 = vmax_u8(d19u8, d23u8); - - d23u8 = vabd_u8(d5u8, d16u8); - d24u8 = vqadd_u8(d24u8, d24u8); - - - d19u8 = vcge_u8(dlimit, d19u8); - - - d25u8 = vmax_u8(d25u8, d26u8); - d26u8 = vmax_u8(d27u8, d28u8); - - d23u8 = vshr_n_u8(d23u8, 1); - - d25u8 = vmax_u8(d25u8, d26u8); - - d24u8 = vqadd_u8(d24u8, d23u8); - - d20u8 = vmax_u8(d20u8, d25u8); - - d23u8 = vdup_n_u8(1); - d24u8 = vcge_u8(dblimit, d24u8); - - d21u8 = vcgt_u8(d21u8, dthresh); - - d20u8 = vcge_u8(d23u8, d20u8); - - d19u8 = vand_u8(d19u8, d24u8); - - d23u8 = vcgt_u8(d22u8, dthresh); - 
- d20u8 = vand_u8(d20u8, d19u8); - - d22u8 = vdup_n_u8(0x80); - - d23u8 = vorr_u8(d21u8, d23u8); - - q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), - vreinterpret_u16_u8(d21u8)); - - d30u8 = vshrn_n_u16(q10u16, 4); - flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0); - - if (flat == 0xffffffff) { // Check for all 1's, power_branch_only - d27u8 = vdup_n_u8(3); - d21u8 = vdup_n_u8(2); - q14u16 = vaddl_u8(d6u8, d7u8); - q14u16 = vmlal_u8(q14u16, d3u8, d27u8); - q14u16 = vmlal_u8(q14u16, d4u8, d21u8); - q14u16 = vaddw_u8(q14u16, d5u8); - *d0ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vaddw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d16u8); - *d1ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d17u8); - *d2ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d7u8); - q14u16 = vaddw_u8(q14u16, d18u8); - *d3ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vsubw_u8(q14u16, d7u8); - q14u16 = vaddw_u8(q14u16, d16u8); - q14u16 = vaddw_u8(q14u16, d18u8); - *d4ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vsubw_u8(q14u16, d16u8); - q14u16 = vaddw_u8(q14u16, d17u8); - q14u16 = vaddw_u8(q14u16, d18u8); - *d5ru8 = vqrshrn_n_u16(q14u16, 3); - } else { - d21u8 = veor_u8(d7u8, d22u8); - d24u8 = veor_u8(d6u8, d22u8); - d25u8 = veor_u8(d5u8, d22u8); - d26u8 = veor_u8(d16u8, d22u8); - - d27u8 = vdup_n_u8(3); - - d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8)); - d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8)); - - q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8)); - - d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8)); - - q15s16 = vaddw_s8(q15s16, d29s8); - - d29u8 = vdup_n_u8(4); - - d28s8 = vqmovn_s16(q15s16); - - d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8)); - - d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8)); - d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8)); - d30s8 = vshr_n_s8(d30s8, 3); - d29s8 = vshr_n_s8(d29s8, 3); - - d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8); - d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8); - - d29s8 = vrshr_n_s8(d29s8, 1); - d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8)); - - d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8); - d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8); - - if (flat == 0) { // filter_branch_only - *d0ru8 = d4u8; - *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); - *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); - *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); - *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); - *d5ru8 = d17u8; - return; - } - - d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); - d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); - d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); - d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); - - d23u8 = vdup_n_u8(2); - q14u16 = vaddl_u8(d6u8, d7u8); - q14u16 = vmlal_u8(q14u16, d3u8, d27u8); - q14u16 = vmlal_u8(q14u16, d4u8, d23u8); - - d0u8 = vbsl_u8(d20u8, dblimit, d4u8); - - q14u16 = vaddw_u8(q14u16, d5u8); - - d1u8 = vbsl_u8(d20u8, dlimit, d25u8); - - d30u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vaddw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d16u8); - - d2u8 = vbsl_u8(d20u8, dthresh, d24u8); - - 
d31u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d17u8); - - *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8); - - d23u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d7u8); - - *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8); - - q14u16 = vaddw_u8(q14u16, d18u8); - - *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8); - - d22u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vsubw_u8(q14u16, d7u8); - q14u16 = vaddw_u8(q14u16, d16u8); - - d3u8 = vbsl_u8(d20u8, d3u8, d21u8); - - q14u16 = vaddw_u8(q14u16, d18u8); - - d4u8 = vbsl_u8(d20u8, d4u8, d26u8); - - d6u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vsubw_u8(q14u16, d16u8); - q14u16 = vaddw_u8(q14u16, d17u8); - q14u16 = vaddw_u8(q14u16, d18u8); - - d5u8 = vbsl_u8(d20u8, d5u8, d17u8); - - d7u8 = vqrshrn_n_u16(q14u16, 3); - - *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8); - *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8); - *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8); - } - return; -} - -void vpx_lpf_horizontal_8_neon( - uint8_t *src, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - int i; - uint8_t *s, *psrc; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - uint8x8_t d16u8, d17u8, d18u8; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - psrc = src - (pitch << 2); - for (i = 0; i < 1; i++) { - s = psrc + i * 8; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - mbloop_filter_neon(dblimit, dlimit, dthresh, - d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, - &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8); - - s -= (pitch * 6); - vst1_u8(s, d0u8); - s += pitch; - vst1_u8(s, d1u8); - s += pitch; - vst1_u8(s, d2u8); - s += pitch; - vst1_u8(s, d3u8); - s += pitch; - vst1_u8(s, d4u8); - s += pitch; - vst1_u8(s, d5u8); - } - return; -} - -void vpx_lpf_vertical_8_neon( - uint8_t *src, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - int i; - uint8_t *s; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - uint8x8_t d16u8, d17u8, d18u8; - uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; - uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; - uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; - uint8x8x4_t d4Result; - uint8x8x2_t d2Result; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - for (i = 0; i < 1; i++) { - s = src + (i * (pitch << 3)) - 4; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), - vreinterpret_u32_u8(d7u8)); - d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), - vreinterpret_u32_u8(d16u8)); - d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), - vreinterpret_u32_u8(d17u8)); - d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), - vreinterpret_u32_u8(d18u8)); - - d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), - 
vreinterpret_u16_u32(d2tmp2.val[0])); - d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), - vreinterpret_u16_u32(d2tmp3.val[0])); - d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), - vreinterpret_u16_u32(d2tmp2.val[1])); - d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), - vreinterpret_u16_u32(d2tmp3.val[1])); - - d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), - vreinterpret_u8_u16(d2tmp5.val[0])); - d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), - vreinterpret_u8_u16(d2tmp5.val[1])); - d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), - vreinterpret_u8_u16(d2tmp7.val[0])); - d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), - vreinterpret_u8_u16(d2tmp7.val[1])); - - d3u8 = d2tmp8.val[0]; - d4u8 = d2tmp8.val[1]; - d5u8 = d2tmp9.val[0]; - d6u8 = d2tmp9.val[1]; - d7u8 = d2tmp10.val[0]; - d16u8 = d2tmp10.val[1]; - d17u8 = d2tmp11.val[0]; - d18u8 = d2tmp11.val[1]; - - mbloop_filter_neon(dblimit, dlimit, dthresh, - d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, - &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8); - - d4Result.val[0] = d0u8; - d4Result.val[1] = d1u8; - d4Result.val[2] = d2u8; - d4Result.val[3] = d3u8; - - d2Result.val[0] = d4u8; - d2Result.val[1] = d5u8; - - s = src - 3; - vst4_lane_u8(s, d4Result, 0); - s += pitch; - vst4_lane_u8(s, d4Result, 1); - s += pitch; - vst4_lane_u8(s, d4Result, 2); - s += pitch; - vst4_lane_u8(s, d4Result, 3); - s += pitch; - vst4_lane_u8(s, d4Result, 4); - s += pitch; - vst4_lane_u8(s, d4Result, 5); - s += pitch; - vst4_lane_u8(s, d4Result, 6); - s += pitch; - vst4_lane_u8(s, d4Result, 7); - - s = src + 1; - vst2_lane_u8(s, d2Result, 0); - s += pitch; - vst2_lane_u8(s, d2Result, 1); - s += pitch; - vst2_lane_u8(s, d2Result, 2); - s += pitch; - vst2_lane_u8(s, d2Result, 3); - s += pitch; - vst2_lane_u8(s, d2Result, 4); - s += pitch; - vst2_lane_u8(s, d2Result, 5); - s += pitch; - vst2_lane_u8(s, d2Result, 6); - s += pitch; - vst2_lane_u8(s, d2Result, 7); - } - return; -} diff --git a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_neon.c b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_neon.c deleted file mode 100644 index aa31f29358..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_neon.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <arm_neon.h> - -#include "./vpx_dsp_rtcd.h" -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" - -void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0); - vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1); -} - -#if HAVE_NEON_ASM -void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0); - vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1); -} - -void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0); - vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1); -} - -void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh); - vpx_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh); -} -#endif // HAVE_NEON_ASM diff --git a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c deleted file mode 100644 index 8632250138..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c +++ /dev/null @@ -1,373 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <arm_neon.h> -#include <assert.h> - -#include "./vpx_config.h" -#include "./vpx_dsp_rtcd.h" -#include "vpx/vpx_integer.h" -#include "vpx_ports/mem.h" - -static INLINE int32x4_t MULTIPLY_BY_Q0( - int16x4_t dsrc0, - int16x4_t dsrc1, - int16x4_t dsrc2, - int16x4_t dsrc3, - int16x4_t dsrc4, - int16x4_t dsrc5, - int16x4_t dsrc6, - int16x4_t dsrc7, - int16x8_t q0s16) { - int32x4_t qdst; - int16x4_t d0s16, d1s16; - - d0s16 = vget_low_s16(q0s16); - d1s16 = vget_high_s16(q0s16); - - qdst = vmull_lane_s16(dsrc0, d0s16, 0); - qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1); - qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2); - qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3); - qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0); - qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1); - qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2); - qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3); - return qdst; -} - -void vpx_convolve8_avg_horiz_neon( - const uint8_t *src, - ptrdiff_t src_stride, - uint8_t *dst, - ptrdiff_t dst_stride, - const int16_t *filter_x, - int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused - int w, - int h) { - int width; - const uint8_t *s; - uint8_t *d; - uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8; - uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32; - uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16; - uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; - int16x8_t q0s16; - uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; - int32x4_t q1s32, q2s32, q14s32, q15s32; - uint16x8x2_t q0x2u16; - uint8x8x2_t d0x2u8, d1x2u8; - uint32x2x2_t d0x2u32; - uint16x4x2_t d0x2u16, d1x2u16; - uint32x4x2_t q0x2u32; - - assert(x_step_q4 == 16); - - q0s16 = vld1q_s16(filter_x); - - src -= 3; // adjust for taps - for (; h > 0; h -= 4) { // loop_horiz_v - s = src; - d24u8 = vld1_u8(s); - s += src_stride; - d25u8 = vld1_u8(s); - s += src_stride; - d26u8 = vld1_u8(s); - s += src_stride; - d27u8 = vld1_u8(s); - - q12u8 = vcombine_u8(d24u8, d25u8); - q13u8 = vcombine_u8(d26u8, d27u8); - - q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8), - vreinterpretq_u16_u8(q13u8)); - d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); - d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); - d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); - d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1])); - d0x2u8 = vtrn_u8(d24u8, d25u8); - d1x2u8 = vtrn_u8(d26u8, d27u8); - - __builtin_prefetch(src + src_stride * 4); - __builtin_prefetch(src + src_stride * 5); - - q8u16 = vmovl_u8(d0x2u8.val[0]); - q9u16 = vmovl_u8(d0x2u8.val[1]); - q10u16 = vmovl_u8(d1x2u8.val[0]); - q11u16 = vmovl_u8(d1x2u8.val[1]); - - src += 7; - d16u16 = vget_low_u16(q8u16); - d17u16 = vget_high_u16(q8u16); - d18u16 = vget_low_u16(q9u16); - d19u16 = vget_high_u16(q9u16); - q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18 - q9u16 = vcombine_u16(d17u16, d19u16); - - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 - for (width = w; - width > 0; - width -= 4, src += 4, dst += 4) { // loop_horiz - s = src; - d28u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d29u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d31u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d30u32 = vld1_dup_u32((const uint32_t *)s); - - 
__builtin_prefetch(src + 64); - - d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32), - vreinterpret_u16_u32(d31u32)); - d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32), - vreinterpret_u16_u32(d30u32)); - d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 - vreinterpret_u8_u16(d1x2u16.val[0])); // d29 - d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 - vreinterpret_u8_u16(d1x2u16.val[1])); // d30 - - __builtin_prefetch(src + 64 + src_stride); - - q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); - q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); - q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8), - vreinterpretq_u32_u8(q15u8)); - - d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); - d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); - q12u16 = vmovl_u8(d28u8); - q13u16 = vmovl_u8(d29u8); - - __builtin_prefetch(src + 64 + src_stride * 2); - - d = dst; - d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0); - d += dst_stride; - d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0); - d += dst_stride; - d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1); - d += dst_stride; - d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1); - - d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); - d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, - d18s16, d19s16, d23s16, d24s16, q0s16); - q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, - d19s16, d23s16, d24s16, d26s16, q0s16); - q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, - d23s16, d24s16, d26s16, d27s16, q0s16); - q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, - d24s16, d26s16, d27s16, d25s16, q0s16); - - __builtin_prefetch(src + 64 + src_stride * 3); - - d2u16 = vqrshrun_n_s32(q1s32, 7); - d3u16 = vqrshrun_n_s32(q2s32, 7); - d4u16 = vqrshrun_n_s32(q14s32, 7); - d5u16 = vqrshrun_n_s32(q15s32, 7); - - q1u16 = vcombine_u16(d2u16, d3u16); - q2u16 = vcombine_u16(d4u16, d5u16); - - d2u8 = vqmovn_u16(q1u16); - d3u8 = vqmovn_u16(q2u16); - - d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), - vreinterpret_u16_u8(d3u8)); - d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), - vreinterpret_u32_u16(d0x2u16.val[1])); - d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), - vreinterpret_u8_u32(d0x2u32.val[1])); - - q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); - q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32)); - - q1u8 = vrhaddq_u8(q1u8, q3u8); - - d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8)); - d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8)); - - d = dst; - vst1_lane_u32((uint32_t *)d, d2u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d2u32, 1); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 1); - - q8u16 = q9u16; - d20s16 = d23s16; - q11u16 = q12u16; - q9u16 = q13u16; - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - } - src += src_stride * 4 - w - 7; - dst += dst_stride * 4 - w; - } - return; -} - -void vpx_convolve8_avg_vert_neon( - const uint8_t *src, - ptrdiff_t src_stride, - uint8_t *dst, - ptrdiff_t dst_stride, - const int16_t *filter_x, // unused 
- int x_step_q4, // unused - const int16_t *filter_y, - int y_step_q4, - int w, - int h) { - int height; - const uint8_t *s; - uint8_t *d; - uint8x8_t d2u8, d3u8; - uint32x2_t d2u32, d3u32, d6u32, d7u32; - uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; - uint8x16_t q1u8, q3u8; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; - int16x4_t d24s16, d25s16, d26s16, d27s16; - uint16x4_t d2u16, d3u16, d4u16, d5u16; - int16x8_t q0s16; - uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; - int32x4_t q1s32, q2s32, q14s32, q15s32; - - assert(y_step_q4 == 16); - - src -= src_stride * 3; - q0s16 = vld1q_s16(filter_y); - for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h - s = src; - d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); - s += src_stride; - d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); - s += src_stride; - d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); - s += src_stride; - d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1); - s += src_stride; - d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0); - s += src_stride; - d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1); - s += src_stride; - d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); - s += src_stride; - - q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); - q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); - q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); - q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); - - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d = dst; - for (height = h; height > 0; height -= 4) { // loop_vert - d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0); - s += src_stride; - d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0); - s += src_stride; - d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1); - s += src_stride; - d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1); - s += src_stride; - - q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32)); - q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32)); - - d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0); - d += dst_stride; - d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1); - d += dst_stride; - d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0); - d += dst_stride; - d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1); - d -= dst_stride * 3; - - d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); - d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - - __builtin_prefetch(s); - __builtin_prefetch(s + src_stride); - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, - d20s16, d21s16, d22s16, d24s16, q0s16); - __builtin_prefetch(s + src_stride * 2); - __builtin_prefetch(s + src_stride * 3); - q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, - d21s16, d22s16, d24s16, d26s16, q0s16); - __builtin_prefetch(d); - __builtin_prefetch(d + dst_stride); - q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, - d22s16, d24s16, d26s16, d27s16, q0s16); - __builtin_prefetch(d + dst_stride * 2); - __builtin_prefetch(d + dst_stride * 3); - q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, - d24s16, d26s16, d27s16, 
d25s16, q0s16); - - d2u16 = vqrshrun_n_s32(q1s32, 7); - d3u16 = vqrshrun_n_s32(q2s32, 7); - d4u16 = vqrshrun_n_s32(q14s32, 7); - d5u16 = vqrshrun_n_s32(q15s32, 7); - - q1u16 = vcombine_u16(d2u16, d3u16); - q2u16 = vcombine_u16(d4u16, d5u16); - - d2u8 = vqmovn_u16(q1u16); - d3u8 = vqmovn_u16(q2u16); - - q1u8 = vcombine_u8(d2u8, d3u8); - q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32)); - - q1u8 = vrhaddq_u8(q1u8, q3u8); - - d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8)); - d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8)); - - vst1_lane_u32((uint32_t *)d, d2u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d2u32, 1); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 1); - d += dst_stride; - - q8u16 = q10u16; - d18s16 = d22s16; - d19s16 = d24s16; - q10u16 = q13u16; - d22s16 = d25s16; - } - } - return; -} diff --git a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c deleted file mode 100644 index 9bd715e2c6..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c +++ /dev/null @@ -1,340 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <arm_neon.h> -#include <assert.h> - -#include "./vpx_config.h" -#include "./vpx_dsp_rtcd.h" -#include "vpx/vpx_integer.h" -#include "vpx_ports/mem.h" - -static INLINE int32x4_t MULTIPLY_BY_Q0( - int16x4_t dsrc0, - int16x4_t dsrc1, - int16x4_t dsrc2, - int16x4_t dsrc3, - int16x4_t dsrc4, - int16x4_t dsrc5, - int16x4_t dsrc6, - int16x4_t dsrc7, - int16x8_t q0s16) { - int32x4_t qdst; - int16x4_t d0s16, d1s16; - - d0s16 = vget_low_s16(q0s16); - d1s16 = vget_high_s16(q0s16); - - qdst = vmull_lane_s16(dsrc0, d0s16, 0); - qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1); - qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2); - qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3); - qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0); - qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1); - qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2); - qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3); - return qdst; -} - -void vpx_convolve8_horiz_neon( - const uint8_t *src, - ptrdiff_t src_stride, - uint8_t *dst, - ptrdiff_t dst_stride, - const int16_t *filter_x, - int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused - int w, - int h) { - int width; - const uint8_t *s, *psrc; - uint8_t *d, *pdst; - uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8; - uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32; - uint8x16_t q12u8, q13u8, q14u8, q15u8; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16; - uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; - int16x8_t q0s16; - uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; - int32x4_t q1s32, q2s32, q14s32, q15s32; - uint16x8x2_t q0x2u16; - uint8x8x2_t d0x2u8, d1x2u8; - uint32x2x2_t d0x2u32; - uint16x4x2_t d0x2u16, d1x2u16; - uint32x4x2_t q0x2u32; - - assert(x_step_q4 == 16); - - q0s16 = vld1q_s16(filter_x); - - src -= 3; // adjust for taps - for (; h > 0; h -= 4, - src += src_stride * 4, - dst += dst_stride * 4) { 
// loop_horiz_v - s = src; - d24u8 = vld1_u8(s); - s += src_stride; - d25u8 = vld1_u8(s); - s += src_stride; - d26u8 = vld1_u8(s); - s += src_stride; - d27u8 = vld1_u8(s); - - q12u8 = vcombine_u8(d24u8, d25u8); - q13u8 = vcombine_u8(d26u8, d27u8); - - q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8), - vreinterpretq_u16_u8(q13u8)); - d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); - d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); - d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); - d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1])); - d0x2u8 = vtrn_u8(d24u8, d25u8); - d1x2u8 = vtrn_u8(d26u8, d27u8); - - __builtin_prefetch(src + src_stride * 4); - __builtin_prefetch(src + src_stride * 5); - __builtin_prefetch(src + src_stride * 6); - - q8u16 = vmovl_u8(d0x2u8.val[0]); - q9u16 = vmovl_u8(d0x2u8.val[1]); - q10u16 = vmovl_u8(d1x2u8.val[0]); - q11u16 = vmovl_u8(d1x2u8.val[1]); - - d16u16 = vget_low_u16(q8u16); - d17u16 = vget_high_u16(q8u16); - d18u16 = vget_low_u16(q9u16); - d19u16 = vget_high_u16(q9u16); - q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18 - q9u16 = vcombine_u16(d17u16, d19u16); - - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 - for (width = w, psrc = src + 7, pdst = dst; - width > 0; - width -= 4, psrc += 4, pdst += 4) { // loop_horiz - s = psrc; - d28u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d29u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d31u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d30u32 = vld1_dup_u32((const uint32_t *)s); - - __builtin_prefetch(psrc + 64); - - d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32), - vreinterpret_u16_u32(d31u32)); - d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32), - vreinterpret_u16_u32(d30u32)); - d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 - vreinterpret_u8_u16(d1x2u16.val[0])); // d29 - d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 - vreinterpret_u8_u16(d1x2u16.val[1])); // d30 - - __builtin_prefetch(psrc + 64 + src_stride); - - q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); - q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); - q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8), - vreinterpretq_u32_u8(q15u8)); - - d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); - d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); - q12u16 = vmovl_u8(d28u8); - q13u16 = vmovl_u8(d29u8); - - __builtin_prefetch(psrc + 64 + src_stride * 2); - - d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); - d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, - d18s16, d19s16, d23s16, d24s16, q0s16); - q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, - d19s16, d23s16, d24s16, d26s16, q0s16); - q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, - d23s16, d24s16, d26s16, d27s16, q0s16); - q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, - d24s16, d26s16, d27s16, d25s16, q0s16); - - __builtin_prefetch(psrc + 60 + src_stride * 3); - - d2u16 = vqrshrun_n_s32(q1s32, 7); - d3u16 = 
vqrshrun_n_s32(q2s32, 7); - d4u16 = vqrshrun_n_s32(q14s32, 7); - d5u16 = vqrshrun_n_s32(q15s32, 7); - - q1u16 = vcombine_u16(d2u16, d3u16); - q2u16 = vcombine_u16(d4u16, d5u16); - - d2u8 = vqmovn_u16(q1u16); - d3u8 = vqmovn_u16(q2u16); - - d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), - vreinterpret_u16_u8(d3u8)); - d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), - vreinterpret_u32_u16(d0x2u16.val[1])); - d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), - vreinterpret_u8_u32(d0x2u32.val[1])); - - d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]); - d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]); - - d = pdst; - vst1_lane_u32((uint32_t *)d, d2u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d2u32, 1); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 1); - - q8u16 = q9u16; - d20s16 = d23s16; - q11u16 = q12u16; - q9u16 = q13u16; - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - } - } - return; -} - -void vpx_convolve8_vert_neon( - const uint8_t *src, - ptrdiff_t src_stride, - uint8_t *dst, - ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, - int y_step_q4, - int w, - int h) { - int height; - const uint8_t *s; - uint8_t *d; - uint32x2_t d2u32, d3u32; - uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; - int16x4_t d24s16, d25s16, d26s16, d27s16; - uint16x4_t d2u16, d3u16, d4u16, d5u16; - int16x8_t q0s16; - uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; - int32x4_t q1s32, q2s32, q14s32, q15s32; - - assert(y_step_q4 == 16); - - src -= src_stride * 3; - q0s16 = vld1q_s16(filter_y); - for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h - s = src; - d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); - s += src_stride; - d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); - s += src_stride; - d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); - s += src_stride; - d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1); - s += src_stride; - d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0); - s += src_stride; - d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1); - s += src_stride; - d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); - s += src_stride; - - q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); - q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); - q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); - q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); - - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d = dst; - for (height = h; height > 0; height -= 4) { // loop_vert - d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0); - s += src_stride; - d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0); - s += src_stride; - d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1); - s += src_stride; - d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1); - s += src_stride; - - q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32)); - q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32)); - - d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); - d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - d26s16 = 
vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - - __builtin_prefetch(d); - __builtin_prefetch(d + dst_stride); - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, - d20s16, d21s16, d22s16, d24s16, q0s16); - __builtin_prefetch(d + dst_stride * 2); - __builtin_prefetch(d + dst_stride * 3); - q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, - d21s16, d22s16, d24s16, d26s16, q0s16); - __builtin_prefetch(s); - __builtin_prefetch(s + src_stride); - q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, - d22s16, d24s16, d26s16, d27s16, q0s16); - __builtin_prefetch(s + src_stride * 2); - __builtin_prefetch(s + src_stride * 3); - q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, - d24s16, d26s16, d27s16, d25s16, q0s16); - - d2u16 = vqrshrun_n_s32(q1s32, 7); - d3u16 = vqrshrun_n_s32(q2s32, 7); - d4u16 = vqrshrun_n_s32(q14s32, 7); - d5u16 = vqrshrun_n_s32(q15s32, 7); - - q1u16 = vcombine_u16(d2u16, d3u16); - q2u16 = vcombine_u16(d4u16, d5u16); - - d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16)); - d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16)); - - vst1_lane_u32((uint32_t *)d, d2u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d2u32, 1); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 1); - d += dst_stride; - - q8u16 = q10u16; - d18s16 = d22s16; - d19s16 = d24s16; - q10u16 = q13u16; - d22s16 = d25s16; - } - } - return; -} diff --git a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c deleted file mode 100644 index dc58a332f8..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <arm_neon.h> - -#include "./vpx_dsp_rtcd.h" -#include "vpx/vpx_integer.h" - -void vpx_convolve_avg_neon( - const uint8_t *src, // r0 - ptrdiff_t src_stride, // r1 - uint8_t *dst, // r2 - ptrdiff_t dst_stride, // r3 - const int16_t *filter_x, - int filter_x_stride, - const int16_t *filter_y, - int filter_y_stride, - int w, - int h) { - uint8_t *d; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - uint32x2_t d0u32, d2u32; - uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8; - (void)filter_x; (void)filter_x_stride; - (void)filter_y; (void)filter_y_stride; - - d = dst; - if (w > 32) { // avg64 - for (; h > 0; h -= 1) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); - q2u8 = vld1q_u8(src + 32); - q3u8 = vld1q_u8(src + 48); - src += src_stride; - q8u8 = vld1q_u8(d); - q9u8 = vld1q_u8(d + 16); - q10u8 = vld1q_u8(d + 32); - q11u8 = vld1q_u8(d + 48); - d += dst_stride; - - q0u8 = vrhaddq_u8(q0u8, q8u8); - q1u8 = vrhaddq_u8(q1u8, q9u8); - q2u8 = vrhaddq_u8(q2u8, q10u8); - q3u8 = vrhaddq_u8(q3u8, q11u8); - - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - vst1q_u8(dst + 32, q2u8); - vst1q_u8(dst + 48, q3u8); - dst += dst_stride; - } - } else if (w == 32) { // avg32 - for (; h > 0; h -= 2) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); - src += src_stride; - q2u8 = vld1q_u8(src); - q3u8 = vld1q_u8(src + 16); - src += src_stride; - q8u8 = vld1q_u8(d); - q9u8 = vld1q_u8(d + 16); - d += dst_stride; - q10u8 = vld1q_u8(d); - q11u8 = vld1q_u8(d + 16); - d += dst_stride; - - q0u8 = vrhaddq_u8(q0u8, q8u8); - q1u8 = vrhaddq_u8(q1u8, q9u8); - q2u8 = vrhaddq_u8(q2u8, q10u8); - q3u8 = vrhaddq_u8(q3u8, q11u8); - - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - dst += dst_stride; - vst1q_u8(dst, q2u8); - vst1q_u8(dst + 16, q3u8); - dst += dst_stride; - } - } else if (w > 8) { // avg16 - for (; h > 0; h -= 2) { - q0u8 = vld1q_u8(src); - src += src_stride; - q1u8 = vld1q_u8(src); - src += src_stride; - q2u8 = vld1q_u8(d); - d += dst_stride; - q3u8 = vld1q_u8(d); - d += dst_stride; - - q0u8 = vrhaddq_u8(q0u8, q2u8); - q1u8 = vrhaddq_u8(q1u8, q3u8); - - vst1q_u8(dst, q0u8); - dst += dst_stride; - vst1q_u8(dst, q1u8); - dst += dst_stride; - } - } else if (w == 8) { // avg8 - for (; h > 0; h -= 2) { - d0u8 = vld1_u8(src); - src += src_stride; - d1u8 = vld1_u8(src); - src += src_stride; - d2u8 = vld1_u8(d); - d += dst_stride; - d3u8 = vld1_u8(d); - d += dst_stride; - - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - q0u8 = vrhaddq_u8(q0u8, q1u8); - - vst1_u8(dst, vget_low_u8(q0u8)); - dst += dst_stride; - vst1_u8(dst, vget_high_u8(q0u8)); - dst += dst_stride; - } - } else { // avg4 - for (; h > 0; h -= 2) { - d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0); - src += src_stride; - d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1); - src += src_stride; - d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0); - d += dst_stride; - d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1); - d += dst_stride; - - d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32), - vreinterpret_u8_u32(d2u32)); - - d0u32 = vreinterpret_u32_u8(d0u8); - vst1_lane_u32((uint32_t *)dst, d0u32, 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, d0u32, 1); - dst += dst_stride; - } - } - return; -} diff --git a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c deleted file mode 100644 index d8fb97a861..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2014 
The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <arm_neon.h> - -#include "./vpx_dsp_rtcd.h" -#include "vpx/vpx_integer.h" - -void vpx_convolve_copy_neon( - const uint8_t *src, // r0 - ptrdiff_t src_stride, // r1 - uint8_t *dst, // r2 - ptrdiff_t dst_stride, // r3 - const int16_t *filter_x, - int filter_x_stride, - const int16_t *filter_y, - int filter_y_stride, - int w, - int h) { - uint8x8_t d0u8, d2u8; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - (void)filter_x; (void)filter_x_stride; - (void)filter_y; (void)filter_y_stride; - - if (w > 32) { // copy64 - for (; h > 0; h--) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); - q2u8 = vld1q_u8(src + 32); - q3u8 = vld1q_u8(src + 48); - src += src_stride; - - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - vst1q_u8(dst + 32, q2u8); - vst1q_u8(dst + 48, q3u8); - dst += dst_stride; - } - } else if (w == 32) { // copy32 - for (; h > 0; h -= 2) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); - src += src_stride; - q2u8 = vld1q_u8(src); - q3u8 = vld1q_u8(src + 16); - src += src_stride; - - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - dst += dst_stride; - vst1q_u8(dst, q2u8); - vst1q_u8(dst + 16, q3u8); - dst += dst_stride; - } - } else if (w > 8) { // copy16 - for (; h > 0; h -= 2) { - q0u8 = vld1q_u8(src); - src += src_stride; - q1u8 = vld1q_u8(src); - src += src_stride; - - vst1q_u8(dst, q0u8); - dst += dst_stride; - vst1q_u8(dst, q1u8); - dst += dst_stride; - } - } else if (w == 8) { // copy8 - for (; h > 0; h -= 2) { - d0u8 = vld1_u8(src); - src += src_stride; - d2u8 = vld1_u8(src); - src += src_stride; - - vst1_u8(dst, d0u8); - dst += dst_stride; - vst1_u8(dst, d2u8); - dst += dst_stride; - } - } else { // copy4 - for (; h > 0; h--) { - *(uint32_t *)dst = *(const uint32_t *)src; - src += src_stride; - dst += dst_stride; - } - } - return; -} diff --git a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_neon.c deleted file mode 100644 index 1506ce6203..0000000000 --- a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_neon.c +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <assert.h> - -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/vpx_dsp_common.h" -#include "vpx_ports/mem.h" - -void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the - * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4). 
- */ - DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); - - // Account for the vertical phase needing 3 lines prior and 4 lines post - int intermediate_height = h + 7; - - assert(y_step_q4 == 16); - assert(x_step_q4 == 16); - - /* Filter starting 3 lines back. The neon implementation will ignore the - * given height and filter a multiple of 4 lines. Since this goes in to - * the temp buffer which has lots of extra room and is subsequently discarded - * this is safe if somewhat less than ideal. - */ - vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, - temp, 64, - filter_x, x_step_q4, filter_y, y_step_q4, - w, intermediate_height); - - /* Step into the temp buffer 3 lines to get the actual frame data */ - vpx_convolve8_vert_neon(temp + 64 * 3, 64, - dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); -} - -void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); - int intermediate_height = h + 7; - - assert(y_step_q4 == 16); - assert(x_step_q4 == 16); - - /* This implementation has the same issues as above. In addition, we only want - * to average the values after both passes. - */ - vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, - temp, 64, - filter_x, x_step_q4, filter_y, y_step_q4, - w, intermediate_height); - vpx_convolve8_avg_vert_neon(temp + 64 * 3, - 64, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); -} diff --git a/thirdparty/libvpx/vpx_dsp/bitreader.c b/thirdparty/libvpx/vpx_dsp/bitreader.c deleted file mode 100644 index 8140e78e70..0000000000 --- a/thirdparty/libvpx/vpx_dsp/bitreader.c +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ -#include <stdlib.h> - -#include "./vpx_config.h" - -#include "vpx_dsp/bitreader.h" -#include "vpx_dsp/prob.h" -#include "vpx_dsp/vpx_dsp_common.h" -#include "vpx_ports/mem.h" -#include "vpx_mem/vpx_mem.h" -#include "vpx_util/endian_inl.h" - -int vpx_reader_init(vpx_reader *r, - const uint8_t *buffer, - size_t size, - vpx_decrypt_cb decrypt_cb, - void *decrypt_state) { - if (size && !buffer) { - return 1; - } else { - r->buffer_end = buffer + size; - r->buffer = buffer; - r->value = 0; - r->count = -8; - r->range = 255; - r->decrypt_cb = decrypt_cb; - r->decrypt_state = decrypt_state; - vpx_reader_fill(r); - return vpx_read_bit(r) != 0; // marker bit - } -} - -void vpx_reader_fill(vpx_reader *r) { - const uint8_t *const buffer_end = r->buffer_end; - const uint8_t *buffer = r->buffer; - const uint8_t *buffer_start = buffer; - BD_VALUE value = r->value; - int count = r->count; - const size_t bytes_left = buffer_end - buffer; - const size_t bits_left = bytes_left * CHAR_BIT; - int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT); - - if (r->decrypt_cb) { - size_t n = VPXMIN(sizeof(r->clear_buffer), bytes_left); - r->decrypt_cb(r->decrypt_state, buffer, r->clear_buffer, (int)n); - buffer = r->clear_buffer; - buffer_start = r->clear_buffer; - } - if (bits_left > BD_VALUE_SIZE) { - const int bits = (shift & 0xfffffff8) + CHAR_BIT; - BD_VALUE nv; - BD_VALUE big_endian_values; - memcpy(&big_endian_values, buffer, sizeof(BD_VALUE)); -#if SIZE_MAX == 0xffffffffffffffffULL - big_endian_values = HToBE64(big_endian_values); -#else - big_endian_values = HToBE32(big_endian_values); -#endif - nv = big_endian_values >> (BD_VALUE_SIZE - bits); - count += bits; - buffer += (bits >> 3); - value = r->value | (nv << (shift & 0x7)); - } else { - const int bits_over = (int)(shift + CHAR_BIT - (int)bits_left); - int loop_end = 0; - if (bits_over >= 0) { - count += LOTS_OF_BITS; - loop_end = bits_over; - } - - if (bits_over < 0 || bits_left) { - while (shift >= loop_end) { - count += CHAR_BIT; - value |= (BD_VALUE)*buffer++ << shift; - shift -= CHAR_BIT; - } - } - } - - // NOTE: Variable 'buffer' may not relate to 'r->buffer' after decryption, - // so we increase 'r->buffer' by the amount that 'buffer' moved, rather than - // assign 'buffer' to 'r->buffer'. - r->buffer += buffer - buffer_start; - r->value = value; - r->count = count; -} - -const uint8_t *vpx_reader_find_end(vpx_reader *r) { - // Find the end of the coded buffer - while (r->count > CHAR_BIT && r->count < BD_VALUE_SIZE) { - r->count -= CHAR_BIT; - r->buffer--; - } - return r->buffer; -} diff --git a/thirdparty/libvpx/vpx_dsp/bitreader.h b/thirdparty/libvpx/vpx_dsp/bitreader.h deleted file mode 100644 index 9a441b4107..0000000000 --- a/thirdparty/libvpx/vpx_dsp/bitreader.h +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef VPX_DSP_BITREADER_H_ -#define VPX_DSP_BITREADER_H_ - -#include <stddef.h> -#include <limits.h> - -#include "./vpx_config.h" -#include "vpx_ports/mem.h" -#include "vpx/vp8dx.h" -#include "vpx/vpx_integer.h" -#include "vpx_dsp/prob.h" - -#ifdef __cplusplus -extern "C" { -#endif - -typedef size_t BD_VALUE; - -#define BD_VALUE_SIZE ((int)sizeof(BD_VALUE) * CHAR_BIT) - -// This is meant to be a large, positive constant that can still be efficiently -// loaded as an immediate (on platforms like ARM, for example). -// Even relatively modest values like 100 would work fine. -#define LOTS_OF_BITS 0x40000000 - -typedef struct { - // Be careful when reordering this struct, it may impact the cache negatively. - BD_VALUE value; - unsigned int range; - int count; - const uint8_t *buffer_end; - const uint8_t *buffer; - vpx_decrypt_cb decrypt_cb; - void *decrypt_state; - uint8_t clear_buffer[sizeof(BD_VALUE) + 1]; -} vpx_reader; - -int vpx_reader_init(vpx_reader *r, - const uint8_t *buffer, - size_t size, - vpx_decrypt_cb decrypt_cb, - void *decrypt_state); - -void vpx_reader_fill(vpx_reader *r); - -const uint8_t *vpx_reader_find_end(vpx_reader *r); - -static INLINE int vpx_reader_has_error(vpx_reader *r) { - // Check if we have reached the end of the buffer. - // - // Variable 'count' stores the number of bits in the 'value' buffer, minus - // 8. The top byte is part of the algorithm, and the remainder is buffered - // to be shifted into it. So if count == 8, the top 16 bits of 'value' are - // occupied, 8 for the algorithm and 8 in the buffer. - // - // When reading a byte from the user's buffer, count is filled with 8 and - // one byte is filled into the value buffer. When we reach the end of the - // data, count is additionally filled with LOTS_OF_BITS. So when - // count == LOTS_OF_BITS - 1, the user's data has been exhausted. - // - // 1 if we have tried to decode bits after the end of stream was encountered. - // 0 No error. 
- return r->count > BD_VALUE_SIZE && r->count < LOTS_OF_BITS; -} - -static INLINE int vpx_read(vpx_reader *r, int prob) { - unsigned int bit = 0; - BD_VALUE value; - BD_VALUE bigsplit; - int count; - unsigned int range; - unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT; - - if (r->count < 0) - vpx_reader_fill(r); - - value = r->value; - count = r->count; - - bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT); - - range = split; - - if (value >= bigsplit) { - range = r->range - split; - value = value - bigsplit; - bit = 1; - } - - { - register int shift = vpx_norm[range]; - range <<= shift; - value <<= shift; - count -= shift; - } - r->value = value; - r->count = count; - r->range = range; - - return bit; -} - -static INLINE int vpx_read_bit(vpx_reader *r) { - return vpx_read(r, 128); // vpx_prob_half -} - -static INLINE int vpx_read_literal(vpx_reader *r, int bits) { - int literal = 0, bit; - - for (bit = bits - 1; bit >= 0; bit--) - literal |= vpx_read_bit(r) << bit; - - return literal; -} - -static INLINE int vpx_read_tree(vpx_reader *r, const vpx_tree_index *tree, - const vpx_prob *probs) { - vpx_tree_index i = 0; - - while ((i = tree[i + vpx_read(r, probs[i >> 1])]) > 0) - continue; - - return -i; -} - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VPX_DSP_BITREADER_H_ diff --git a/thirdparty/libvpx/vpx_dsp/bitreader_buffer.c b/thirdparty/libvpx/vpx_dsp/bitreader_buffer.c deleted file mode 100644 index d7b55cf9f4..0000000000 --- a/thirdparty/libvpx/vpx_dsp/bitreader_buffer.c +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#include "./vpx_config.h" -#include "./bitreader_buffer.h" - -size_t vpx_rb_bytes_read(struct vpx_read_bit_buffer *rb) { - return (rb->bit_offset + 7) >> 3; -} - -int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb) { - const size_t off = rb->bit_offset; - const size_t p = off >> 3; - const int q = 7 - (int)(off & 0x7); - if (rb->bit_buffer + p < rb->bit_buffer_end) { - const int bit = (rb->bit_buffer[p] >> q) & 1; - rb->bit_offset = off + 1; - return bit; - } else { - rb->error_handler(rb->error_handler_data); - return 0; - } -} - -int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits) { - int value = 0, bit; - for (bit = bits - 1; bit >= 0; bit--) - value |= vpx_rb_read_bit(rb) << bit; - return value; -} - -int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, - int bits) { - const int value = vpx_rb_read_literal(rb, bits); - return vpx_rb_read_bit(rb) ? -value : value; -} - -int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, - int bits) { -#if CONFIG_MISC_FIXES - const int nbits = sizeof(unsigned) * 8 - bits - 1; - const unsigned value = (unsigned)vpx_rb_read_literal(rb, bits + 1) << nbits; - return ((int) value) >> nbits; -#else - return vpx_rb_read_signed_literal(rb, bits); -#endif -} diff --git a/thirdparty/libvpx/vpx_dsp/bitreader_buffer.h b/thirdparty/libvpx/vpx_dsp/bitreader_buffer.h deleted file mode 100644 index 8a48a95ed1..0000000000 --- a/thirdparty/libvpx/vpx_dsp/bitreader_buffer.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 
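To make the arithmetic decoder above concrete, here is a minimal illustrative sketch (not taken from the removed files) of the interval split that vpx_read() computes on a freshly initialised reader, where the range is 255 and vpx_read_bit() passes in the even probability 128:

#include <stdio.h>

/* Sketch only: repeats the split arithmetic from vpx_read() for one step. */
int main(void) {
  const unsigned range = 255;  /* initial range set by vpx_reader_init() */
  const int prob = 128;        /* vpx_prob_half, as used by vpx_read_bit() */
  const unsigned split = (range * prob + (256 - prob)) >> 8;  /* CHAR_BIT == 8 */

  /* A decoded 0 keeps the lower sub-interval, a 1 keeps the remainder. */
  printf("split = %u, range after 0: %u, range after 1: %u\n",
         split, split, range - split);
  return 0;  /* prints: split = 128, range after 0: 128, range after 1: 127 */
}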
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VPX_DSP_BITREADER_BUFFER_H_ -#define VPX_DSP_BITREADER_BUFFER_H_ - -#include <limits.h> - -#include "vpx/vpx_integer.h" - -#ifdef __cplusplus -extern "C" { -#endif - -typedef void (*vpx_rb_error_handler)(void *data); - -struct vpx_read_bit_buffer { - const uint8_t *bit_buffer; - const uint8_t *bit_buffer_end; - size_t bit_offset; - - void *error_handler_data; - vpx_rb_error_handler error_handler; -}; - -size_t vpx_rb_bytes_read(struct vpx_read_bit_buffer *rb); - -int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb); - -int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits); - -int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, int bits); - -int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, int bits); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VPX_DSP_BITREADER_BUFFER_H_ diff --git a/thirdparty/libvpx/vpx_dsp/intrapred.c b/thirdparty/libvpx/vpx_dsp/intrapred.c deleted file mode 100644 index cc4a74bd26..0000000000 --- a/thirdparty/libvpx/vpx_dsp/intrapred.c +++ /dev/null @@ -1,870 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_config.h" -#include "./vpx_dsp_rtcd.h" - -#include "vpx_dsp/vpx_dsp_common.h" -#include "vpx_mem/vpx_mem.h" - -#define DST(x, y) dst[(x) + (y) * stride] -#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2) -#define AVG2(a, b) (((a) + (b) + 1) >> 1) - -static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r, c; - (void) above; - // first column - for (r = 0; r < bs - 1; ++r) - dst[r * stride] = AVG2(left[r], left[r + 1]); - dst[(bs - 1) * stride] = left[bs - 1]; - dst++; - - // second column - for (r = 0; r < bs - 2; ++r) - dst[r * stride] = AVG3(left[r], left[r + 1], left[r + 2]); - dst[(bs - 2) * stride] = AVG3(left[bs - 2], left[bs - 1], left[bs - 1]); - dst[(bs - 1) * stride] = left[bs - 1]; - dst++; - - // rest of last row - for (c = 0; c < bs - 2; ++c) - dst[(bs - 1) * stride + c] = left[bs - 1]; - - for (r = bs - 2; r >= 0; --r) - for (c = 0; c < bs - 2; ++c) - dst[r * stride + c] = dst[(r + 1) * stride + c - 2]; -} - -#if CONFIG_MISC_FIXES -static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r, c; - (void) above; - - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = c & 1 ? 
AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1], - left[(c >> 1) + r + 2]) - : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); - } - dst += stride; - } -} -#endif // CONFIG_MISC_FIXES - -static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r, c; - int size; - (void)left; - for (c = 0; c < bs; ++c) { - dst[c] = AVG2(above[c], above[c + 1]); - dst[stride + c] = AVG3(above[c], above[c + 1], above[c + 2]); - } - for (r = 2, size = bs - 2; r < bs; r += 2, --size) { - memcpy(dst + (r + 0) * stride, dst + (r >> 1), size); - memset(dst + (r + 0) * stride + size, above[bs - 1], bs - size); - memcpy(dst + (r + 1) * stride, dst + stride + (r >> 1), size); - memset(dst + (r + 1) * stride + size, above[bs - 1], bs - size); - } -} - -#if CONFIG_MISC_FIXES -static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r, c; - (void) left; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], - above[(r >> 1) + c + 2]) - : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); - } - dst += stride; - } -} -#endif // CONFIG_MISC_FIXES - -static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - const uint8_t above_right = above[bs - 1]; - const uint8_t *const dst_row0 = dst; - int x, size; - (void)left; - - for (x = 0; x < bs - 1; ++x) { - dst[x] = AVG3(above[x], above[x + 1], above[x + 2]); - } - dst[bs - 1] = above_right; - dst += stride; - for (x = 1, size = bs - 2; x < bs; ++x, --size) { - memcpy(dst, dst_row0 + x, size); - memset(dst + size, above_right, x + 1); - dst += stride; - } -} - -#if CONFIG_MISC_FIXES -static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r, c; - (void) left; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = AVG3(above[r + c], above[r + c + 1], - above[r + c + 1 + (r + c + 2 < bs * 2)]); - } - dst += stride; - } -} -#endif // CONFIG_MISC_FIXES - -static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r, c; - - // first row - for (c = 0; c < bs; c++) - dst[c] = AVG2(above[c - 1], above[c]); - dst += stride; - - // second row - dst[0] = AVG3(left[0], above[-1], above[0]); - for (c = 1; c < bs; c++) - dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); - dst += stride; - - // the rest of first col - dst[0] = AVG3(above[-1], left[0], left[1]); - for (r = 3; r < bs; ++r) - dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]); - - // the rest of the block - for (r = 2; r < bs; ++r) { - for (c = 1; c < bs; c++) - dst[c] = dst[-2 * stride + c - 1]; - dst += stride; - } -} - -static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int i; -#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7 - // silence a spurious -Warray-bounds warning, possibly related to: - // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273 - uint8_t border[69]; -#else - uint8_t border[32 + 32 - 1]; // outer border from bottom-left to top-right -#endif - - // dst(bs, bs - 2)[0], i.e., border starting at bottom-left - for (i = 0; i < bs - 2; ++i) { - border[i] = AVG3(left[bs - 3 - i], left[bs - 2 - i], left[bs - 1 - i]); - } - border[bs - 2] = AVG3(above[-1], left[0], left[1]); - border[bs - 1] 
= AVG3(left[0], above[-1], above[0]); - border[bs - 0] = AVG3(above[-1], above[0], above[1]); - // dst[0][2, size), i.e., remaining top border ascending - for (i = 0; i < bs - 2; ++i) { - border[bs + 1 + i] = AVG3(above[i], above[i + 1], above[i + 2]); - } - - for (i = 0; i < bs; ++i) { - memcpy(dst + i * stride, border + bs - 1 - i, bs); - } -} - -static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r, c; - dst[0] = AVG2(above[-1], left[0]); - for (r = 1; r < bs; r++) - dst[r * stride] = AVG2(left[r - 1], left[r]); - dst++; - - dst[0] = AVG3(left[0], above[-1], above[0]); - dst[stride] = AVG3(above[-1], left[0], left[1]); - for (r = 2; r < bs; r++) - dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); - dst++; - - for (c = 0; c < bs - 2; c++) - dst[c] = AVG3(above[c - 1], above[c], above[c + 1]); - dst += stride; - - for (r = 1; r < bs; ++r) { - for (c = 0; c < bs - 2; c++) - dst[c] = dst[-stride + c - 2]; - dst += stride; - } -} - -static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r; - (void) left; - - for (r = 0; r < bs; r++) { - memcpy(dst, above, bs); - dst += stride; - } -} - -static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r; - (void) above; - - for (r = 0; r < bs; r++) { - memset(dst, left[r], bs); - dst += stride; - } -} - -static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r, c; - int ytop_left = above[-1]; - - for (r = 0; r < bs; r++) { - for (c = 0; c < bs; c++) - dst[c] = clip_pixel(left[r] + above[c] - ytop_left); - dst += stride; - } -} - -static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r; - (void) above; - (void) left; - - for (r = 0; r < bs; r++) { - memset(dst, 128, bs); - dst += stride; - } -} - -static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, - const uint8_t *left) { - int i, r, expected_dc, sum = 0; - (void) above; - - for (i = 0; i < bs; i++) - sum += left[i]; - expected_dc = (sum + (bs >> 1)) / bs; - - for (r = 0; r < bs; r++) { - memset(dst, expected_dc, bs); - dst += stride; - } -} - -static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int i, r, expected_dc, sum = 0; - (void) left; - - for (i = 0; i < bs; i++) - sum += above[i]; - expected_dc = (sum + (bs >> 1)) / bs; - - for (r = 0; r < bs; r++) { - memset(dst, expected_dc, bs); - dst += stride; - } -} - -static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int i, r, expected_dc, sum = 0; - const int count = 2 * bs; - - for (i = 0; i < bs; i++) { - sum += above[i]; - sum += left[i]; - } - - expected_dc = (sum + (count >> 1)) / count; - - for (r = 0; r < bs; r++) { - memset(dst, expected_dc, bs); - dst += stride; - } -} - -void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const int H = above[-1]; - const int I = left[0]; - const int J = left[1]; - const int K = left[2]; - const int L = left[3]; - - memset(dst + stride * 0, AVG3(H, I, J), 4); - memset(dst + stride * 1, AVG3(I, J, K), 4); - memset(dst + stride * 2, AVG3(J, K, L), 4); - memset(dst + stride * 3, AVG3(K, L, L), 4); -} - -void 
vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const int H = above[-1]; - const int I = above[0]; - const int J = above[1]; - const int K = above[2]; - const int L = above[3]; - const int M = above[4]; - (void)left; - - dst[0] = AVG3(H, I, J); - dst[1] = AVG3(I, J, K); - dst[2] = AVG3(J, K, L); - dst[3] = AVG3(K, L, M); - memcpy(dst + stride * 1, dst, 4); - memcpy(dst + stride * 2, dst, 4); - memcpy(dst + stride * 3, dst, 4); -} - -void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const int I = left[0]; - const int J = left[1]; - const int K = left[2]; - const int L = left[3]; - (void)above; - DST(0, 0) = AVG2(I, J); - DST(2, 0) = DST(0, 1) = AVG2(J, K); - DST(2, 1) = DST(0, 2) = AVG2(K, L); - DST(1, 0) = AVG3(I, J, K); - DST(3, 0) = DST(1, 1) = AVG3(J, K, L); - DST(3, 1) = DST(1, 2) = AVG3(K, L, L); - DST(3, 2) = DST(2, 2) = - DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; -} - -void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const int A = above[0]; - const int B = above[1]; - const int C = above[2]; - const int D = above[3]; - const int E = above[4]; - const int F = above[5]; - const int G = above[6]; - (void)left; - DST(0, 0) = AVG2(A, B); - DST(1, 0) = DST(0, 2) = AVG2(B, C); - DST(2, 0) = DST(1, 2) = AVG2(C, D); - DST(3, 0) = DST(2, 2) = AVG2(D, E); - DST(3, 2) = AVG2(E, F); // differs from vp8 - - DST(0, 1) = AVG3(A, B, C); - DST(1, 1) = DST(0, 3) = AVG3(B, C, D); - DST(2, 1) = DST(1, 3) = AVG3(C, D, E); - DST(3, 1) = DST(2, 3) = AVG3(D, E, F); - DST(3, 3) = AVG3(E, F, G); // differs from vp8 -} - -void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const int A = above[0]; - const int B = above[1]; - const int C = above[2]; - const int D = above[3]; - const int E = above[4]; - const int F = above[5]; - const int G = above[6]; - const int H = above[7]; - (void)left; - DST(0, 0) = AVG2(A, B); - DST(1, 0) = DST(0, 2) = AVG2(B, C); - DST(2, 0) = DST(1, 2) = AVG2(C, D); - DST(3, 0) = DST(2, 2) = AVG2(D, E); - DST(3, 2) = AVG3(E, F, G); - - DST(0, 1) = AVG3(A, B, C); - DST(1, 1) = DST(0, 3) = AVG3(B, C, D); - DST(2, 1) = DST(1, 3) = AVG3(C, D, E); - DST(3, 1) = DST(2, 3) = AVG3(D, E, F); - DST(3, 3) = AVG3(F, G, H); -} - -void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const int A = above[0]; - const int B = above[1]; - const int C = above[2]; - const int D = above[3]; - const int E = above[4]; - const int F = above[5]; - const int G = above[6]; - const int H = above[7]; - (void)stride; - (void)left; - DST(0, 0) = AVG3(A, B, C); - DST(1, 0) = DST(0, 1) = AVG3(B, C, D); - DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); - DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F); - DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); - DST(3, 2) = DST(2, 3) = AVG3(F, G, H); - DST(3, 3) = H; // differs from vp8 -} - -void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const int A = above[0]; - const int B = above[1]; - const int C = above[2]; - const int D = above[3]; - const int E = above[4]; - const int F = above[5]; - const int G = above[6]; - const int H = above[7]; - (void)stride; - (void)left; - DST(0, 0) = AVG3(A, B, C); - DST(1, 0) = DST(0, 1) = AVG3(B, C, D); - DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); - DST(3, 
0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F); - DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); - DST(3, 2) = DST(2, 3) = AVG3(F, G, H); - DST(3, 3) = AVG3(G, H, H); -} - -void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const int I = left[0]; - const int J = left[1]; - const int K = left[2]; - const int X = above[-1]; - const int A = above[0]; - const int B = above[1]; - const int C = above[2]; - const int D = above[3]; - DST(0, 0) = DST(1, 2) = AVG2(X, A); - DST(1, 0) = DST(2, 2) = AVG2(A, B); - DST(2, 0) = DST(3, 2) = AVG2(B, C); - DST(3, 0) = AVG2(C, D); - - DST(0, 3) = AVG3(K, J, I); - DST(0, 2) = AVG3(J, I, X); - DST(0, 1) = DST(1, 3) = AVG3(I, X, A); - DST(1, 1) = DST(2, 3) = AVG3(X, A, B); - DST(2, 1) = DST(3, 3) = AVG3(A, B, C); - DST(3, 1) = AVG3(B, C, D); -} - -void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const int I = left[0]; - const int J = left[1]; - const int K = left[2]; - const int L = left[3]; - const int X = above[-1]; - const int A = above[0]; - const int B = above[1]; - const int C = above[2]; - const int D = above[3]; - (void)stride; - DST(0, 3) = AVG3(J, K, L); - DST(1, 3) = DST(0, 2) = AVG3(I, J, K); - DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J); - DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I); - DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X); - DST(3, 1) = DST(2, 0) = AVG3(C, B, A); - DST(3, 0) = AVG3(D, C, B); -} - -void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const int I = left[0]; - const int J = left[1]; - const int K = left[2]; - const int L = left[3]; - const int X = above[-1]; - const int A = above[0]; - const int B = above[1]; - const int C = above[2]; - - DST(0, 0) = DST(2, 1) = AVG2(I, X); - DST(0, 1) = DST(2, 2) = AVG2(J, I); - DST(0, 2) = DST(2, 3) = AVG2(K, J); - DST(0, 3) = AVG2(L, K); - - DST(3, 0) = AVG3(A, B, C); - DST(2, 0) = AVG3(X, A, B); - DST(1, 0) = DST(3, 1) = AVG3(I, X, A); - DST(1, 1) = DST(3, 2) = AVG3(J, I, X); - DST(1, 2) = DST(3, 3) = AVG3(K, J, I); - DST(1, 3) = AVG3(L, K, J); -} - -#if CONFIG_VP9_HIGHBITDEPTH -static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void) above; - (void) bd; - - // First column. - for (r = 0; r < bs - 1; ++r) { - dst[r * stride] = AVG2(left[r], left[r + 1]); - } - dst[(bs - 1) * stride] = left[bs - 1]; - dst++; - - // Second column. - for (r = 0; r < bs - 2; ++r) { - dst[r * stride] = AVG3(left[r], left[r + 1], left[r + 2]); - } - dst[(bs - 2) * stride] = AVG3(left[bs - 2], left[bs - 1], left[bs - 1]); - dst[(bs - 1) * stride] = left[bs - 1]; - dst++; - - // Rest of last row. - for (c = 0; c < bs - 2; ++c) - dst[(bs - 1) * stride + c] = left[bs - 1]; - - for (r = bs - 2; r >= 0; --r) { - for (c = 0; c < bs - 2; ++c) - dst[r * stride + c] = dst[(r + 1) * stride + c - 2]; - } -} - -#if CONFIG_MISC_FIXES -static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void) above; - (void) bd; - - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = c & 1 ? 
AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1], - left[(c >> 1) + r + 2]) - : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); - } - dst += stride; - } -} -#endif // CONFIG_MISC_FIXES - -static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void) left; - (void) bd; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], - above[(r >> 1) + c + 2]) - : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); - } - dst += stride; - } -} - -#define highbd_d63e_predictor highbd_d63_predictor - -static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs, - const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void) left; - (void) bd; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = r + c + 2 < bs * 2 ? AVG3(above[r + c], above[r + c + 1], - above[r + c + 2]) - : above[bs * 2 - 1]; - } - dst += stride; - } -} - -#if CONFIG_MISC_FIXES -static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void) left; - (void) bd; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = AVG3(above[r + c], above[r + c + 1], - above[r + c + 1 + (r + c + 2 < bs * 2)]); - } - dst += stride; - } -} -#endif // CONFIG_MISC_FIXES - -static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void) bd; - - // first row - for (c = 0; c < bs; c++) - dst[c] = AVG2(above[c - 1], above[c]); - dst += stride; - - // second row - dst[0] = AVG3(left[0], above[-1], above[0]); - for (c = 1; c < bs; c++) - dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); - dst += stride; - - // the rest of first col - dst[0] = AVG3(above[-1], left[0], left[1]); - for (r = 3; r < bs; ++r) - dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]); - - // the rest of the block - for (r = 2; r < bs; ++r) { - for (c = 1; c < bs; c++) - dst[c] = dst[-2 * stride + c - 1]; - dst += stride; - } -} - -static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void) bd; - dst[0] = AVG3(left[0], above[-1], above[0]); - for (c = 1; c < bs; c++) - dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); - - dst[stride] = AVG3(above[-1], left[0], left[1]); - for (r = 2; r < bs; ++r) - dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); - - dst += stride; - for (r = 1; r < bs; ++r) { - for (c = 1; c < bs; c++) - dst[c] = dst[-stride + c - 1]; - dst += stride; - } -} - -static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void) bd; - dst[0] = AVG2(above[-1], left[0]); - for (r = 1; r < bs; r++) - dst[r * stride] = AVG2(left[r - 1], left[r]); - dst++; - - dst[0] = AVG3(left[0], above[-1], above[0]); - dst[stride] = AVG3(above[-1], left[0], left[1]); - for (r = 2; r < bs; r++) - dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); - dst++; - - for (c = 0; c < bs - 2; c++) - dst[c] = AVG3(above[c - 1], above[c], above[c + 1]); - dst += stride; - - for (r = 1; r < bs; ++r) { - for (c = 0; c < bs - 2; c++) - dst[c] = dst[-stride + c - 2]; - dst += stride; - } -} - -static INLINE void highbd_v_predictor(uint16_t *dst, 
ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r; - (void) left; - (void) bd; - for (r = 0; r < bs; r++) { - memcpy(dst, above, bs * sizeof(uint16_t)); - dst += stride; - } -} - -static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r; - (void) above; - (void) bd; - for (r = 0; r < bs; r++) { - vpx_memset16(dst, left[r], bs); - dst += stride; - } -} - -static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - int ytop_left = above[-1]; - (void) bd; - - for (r = 0; r < bs; r++) { - for (c = 0; c < bs; c++) - dst[c] = clip_pixel_highbd(left[r] + above[c] - ytop_left, bd); - dst += stride; - } -} - -static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r; - (void) above; - (void) left; - - for (r = 0; r < bs; r++) { - vpx_memset16(dst, 128 << (bd - 8), bs); - dst += stride; - } -} - -static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int i, r, expected_dc, sum = 0; - (void) above; - (void) bd; - - for (i = 0; i < bs; i++) - sum += left[i]; - expected_dc = (sum + (bs >> 1)) / bs; - - for (r = 0; r < bs; r++) { - vpx_memset16(dst, expected_dc, bs); - dst += stride; - } -} - -static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int i, r, expected_dc, sum = 0; - (void) left; - (void) bd; - - for (i = 0; i < bs; i++) - sum += above[i]; - expected_dc = (sum + (bs >> 1)) / bs; - - for (r = 0; r < bs; r++) { - vpx_memset16(dst, expected_dc, bs); - dst += stride; - } -} - -static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int i, r, expected_dc, sum = 0; - const int count = 2 * bs; - (void) bd; - - for (i = 0; i < bs; i++) { - sum += above[i]; - sum += left[i]; - } - - expected_dc = (sum + (count >> 1)) / count; - - for (r = 0; r < bs; r++) { - vpx_memset16(dst, expected_dc, bs); - dst += stride; - } -} -#endif // CONFIG_VP9_HIGHBITDEPTH - -// This serves as a wrapper function, so that all the prediction functions -// can be unified and accessed as a pointer array. Note that the boundary -// above and left are not necessarily used all the time. 
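To illustrate the wrapper scheme described in the comment above, the size-instantiation macro defined just below expands, for the DC predictor at 8x8 for example, to nothing more than a thin forwarding function (expansion shown for illustration only; the file generates it via the macro rather than writing it out):

void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left) {
  /* intra_pred_sized(dc, 8) generates exactly this body. */
  dc_predictor(dst, stride, 8, above, left);
}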
-#define intra_pred_sized(type, size) \ - void vpx_##type##_predictor_##size##x##size##_c(uint8_t *dst, \ - ptrdiff_t stride, \ - const uint8_t *above, \ - const uint8_t *left) { \ - type##_predictor(dst, stride, size, above, left); \ - } - -#if CONFIG_VP9_HIGHBITDEPTH -#define intra_pred_highbd_sized(type, size) \ - void vpx_highbd_##type##_predictor_##size##x##size##_c( \ - uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ - const uint16_t *left, int bd) { \ - highbd_##type##_predictor(dst, stride, size, above, left, bd); \ - } - -#define intra_pred_allsizes(type) \ - intra_pred_sized(type, 4) \ - intra_pred_sized(type, 8) \ - intra_pred_sized(type, 16) \ - intra_pred_sized(type, 32) \ - intra_pred_highbd_sized(type, 4) \ - intra_pred_highbd_sized(type, 8) \ - intra_pred_highbd_sized(type, 16) \ - intra_pred_highbd_sized(type, 32) - -#define intra_pred_no_4x4(type) \ - intra_pred_sized(type, 8) \ - intra_pred_sized(type, 16) \ - intra_pred_sized(type, 32) \ - intra_pred_highbd_sized(type, 4) \ - intra_pred_highbd_sized(type, 8) \ - intra_pred_highbd_sized(type, 16) \ - intra_pred_highbd_sized(type, 32) - -#else -#define intra_pred_allsizes(type) \ - intra_pred_sized(type, 4) \ - intra_pred_sized(type, 8) \ - intra_pred_sized(type, 16) \ - intra_pred_sized(type, 32) - -#define intra_pred_no_4x4(type) \ - intra_pred_sized(type, 8) \ - intra_pred_sized(type, 16) \ - intra_pred_sized(type, 32) -#endif // CONFIG_VP9_HIGHBITDEPTH - -intra_pred_no_4x4(d207) -intra_pred_no_4x4(d63) -intra_pred_no_4x4(d45) -#if CONFIG_MISC_FIXES -intra_pred_allsizes(d207e) -intra_pred_allsizes(d63e) -intra_pred_no_4x4(d45e) -#endif -intra_pred_no_4x4(d117) -intra_pred_no_4x4(d135) -intra_pred_no_4x4(d153) -intra_pred_allsizes(v) -intra_pred_allsizes(h) -intra_pred_allsizes(tm) -intra_pred_allsizes(dc_128) -intra_pred_allsizes(dc_left) -intra_pred_allsizes(dc_top) -intra_pred_allsizes(dc) -#undef intra_pred_allsizes diff --git a/thirdparty/libvpx/vpx_dsp/inv_txfm.c b/thirdparty/libvpx/vpx_dsp/inv_txfm.c deleted file mode 100644 index e18d31d7aa..0000000000 --- a/thirdparty/libvpx/vpx_dsp/inv_txfm.c +++ /dev/null @@ -1,2518 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <math.h> -#include <string.h> - -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/inv_txfm.h" - -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { -/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, - 0.5 shifts per pixel. 
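   (Each 1-D pass below performs 7 additions/subtractions and 1 shift per group
   of four samples, i.e. 1.75 adds and 0.25 shifts per pixel; the row pass plus
   the column pass account for the 3.5 adds and 0.5 shifts quoted above.)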
*/ - int i; - tran_low_t output[16]; - tran_high_t a1, b1, c1, d1, e1; - const tran_low_t *ip = input; - tran_low_t *op = output; - - for (i = 0; i < 4; i++) { - a1 = ip[0] >> UNIT_QUANT_SHIFT; - c1 = ip[1] >> UNIT_QUANT_SHIFT; - d1 = ip[2] >> UNIT_QUANT_SHIFT; - b1 = ip[3] >> UNIT_QUANT_SHIFT; - a1 += c1; - d1 -= b1; - e1 = (a1 - d1) >> 1; - b1 = e1 - b1; - c1 = e1 - c1; - a1 -= b1; - d1 += c1; - op[0] = WRAPLOW(a1); - op[1] = WRAPLOW(b1); - op[2] = WRAPLOW(c1); - op[3] = WRAPLOW(d1); - ip += 4; - op += 4; - } - - ip = output; - for (i = 0; i < 4; i++) { - a1 = ip[4 * 0]; - c1 = ip[4 * 1]; - d1 = ip[4 * 2]; - b1 = ip[4 * 3]; - a1 += c1; - d1 -= b1; - e1 = (a1 - d1) >> 1; - b1 = e1 - b1; - c1 = e1 - c1; - a1 -= b1; - d1 += c1; - dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1)); - dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1)); - dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1)); - dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1)); - - ip++; - dest++; - } -} - -void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { - int i; - tran_high_t a1, e1; - tran_low_t tmp[4]; - const tran_low_t *ip = in; - tran_low_t *op = tmp; - - a1 = ip[0] >> UNIT_QUANT_SHIFT; - e1 = a1 >> 1; - a1 -= e1; - op[0] = WRAPLOW(a1); - op[1] = op[2] = op[3] = WRAPLOW(e1); - - ip = tmp; - for (i = 0; i < 4; i++) { - e1 = ip[0] >> 1; - a1 = ip[0] - e1; - dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1); - dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1); - dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1); - dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1); - ip++; - dest++; - } -} - -void idct4_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step[4]; - tran_high_t temp1, temp2; - // stage 1 - temp1 = (input[0] + input[2]) * cospi_16_64; - temp2 = (input[0] - input[2]) * cospi_16_64; - step[0] = WRAPLOW(dct_const_round_shift(temp1)); - step[1] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step[2] = WRAPLOW(dct_const_round_shift(temp1)); - step[3] = WRAPLOW(dct_const_round_shift(temp2)); - - // stage 2 - output[0] = WRAPLOW(step[0] + step[3]); - output[1] = WRAPLOW(step[1] + step[2]); - output[2] = WRAPLOW(step[1] - step[2]); - output[3] = WRAPLOW(step[0] - step[3]); -} - -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[4], temp_out[4]; - - // Rows - for (i = 0; i < 4; ++i) { - idct4_c(input, outptr); - input += 4; - outptr += 4; - } - - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = out[j * 4 + i]; - idct4_c(temp_in, temp_out); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 4)); - } - } -} - -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, - int dest_stride) { - int i; - tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); - a1 = ROUND_POWER_OF_TWO(out, 4); - - for (i = 0; i < 4; i++) { - dest[0] = clip_pixel_add(dest[0], a1); - dest[1] = clip_pixel_add(dest[1], a1); - dest[2] = clip_pixel_add(dest[2], a1); - dest[3] = clip_pixel_add(dest[3], a1); - dest += dest_stride; - } -} - -void idct8_c(const tran_low_t 
*input, tran_low_t *output) { - tran_low_t step1[8], step2[8]; - tran_high_t temp1, temp2; - // stage 1 - step1[0] = input[0]; - step1[2] = input[4]; - step1[1] = input[2]; - step1[3] = input[6]; - temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; - temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1)); - step1[7] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; - temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - - // stage 2 - temp1 = (step1[0] + step1[2]) * cospi_16_64; - temp2 = (step1[0] - step1[2]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1)); - step2[1] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1)); - step2[3] = WRAPLOW(dct_const_round_shift(temp2)); - step2[4] = WRAPLOW(step1[4] + step1[5]); - step2[5] = WRAPLOW(step1[4] - step1[5]); - step2[6] = WRAPLOW(-step1[6] + step1[7]); - step2[7] = WRAPLOW(step1[6] + step1[7]); - - // stage 3 - step1[0] = WRAPLOW(step2[0] + step2[3]); - step1[1] = WRAPLOW(step2[1] + step2[2]); - step1[2] = WRAPLOW(step2[1] - step2[2]); - step1[3] = WRAPLOW(step2[0] - step2[3]); - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - step1[7] = step2[7]; - - // stage 4 - output[0] = WRAPLOW(step1[0] + step1[7]); - output[1] = WRAPLOW(step1[1] + step1[6]); - output[2] = WRAPLOW(step1[2] + step1[5]); - output[3] = WRAPLOW(step1[3] + step1[4]); - output[4] = WRAPLOW(step1[3] - step1[4]); - output[5] = WRAPLOW(step1[2] - step1[5]); - output[6] = WRAPLOW(step1[1] - step1[6]); - output[7] = WRAPLOW(step1[0] - step1[7]); -} - -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - tran_low_t out[8 * 8]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[8], temp_out[8]; - - // First transform rows - for (i = 0; i < 8; ++i) { - idct8_c(input, outptr); - input += 8; - outptr += 8; - } - - // Then transform columns - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; - idct8_c(temp_in, temp_out); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 5)); - } - } -} - -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - int i, j; - tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); - a1 = ROUND_POWER_OF_TWO(out, 5); - for (j = 0; j < 8; ++j) { - for (i = 0; i < 8; ++i) - dest[i] = clip_pixel_add(dest[i], a1); - dest += stride; - } -} - -void iadst4_c(const tran_low_t *input, tran_low_t *output) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - - tran_low_t x0 = input[0]; - tran_low_t x1 = input[1]; - tran_low_t x2 = input[2]; - tran_low_t x3 = input[3]; - - if (!(x0 | x1 | x2 | x3)) { - output[0] = output[1] = output[2] = output[3] = 0; - return; - } - - s0 = sinpi_1_9 * x0; - s1 = sinpi_2_9 * x0; - s2 = sinpi_3_9 * x1; - s3 = sinpi_4_9 * x2; - s4 = sinpi_1_9 * x2; - s5 = sinpi_2_9 * x3; - s6 = sinpi_4_9 * x3; - s7 = WRAPLOW(x0 - x2 + x3); - - s0 
= s0 + s3 + s5; - s1 = s1 - s4 - s6; - s3 = s2; - s2 = sinpi_3_9 * s7; - - // 1-D transform scaling factor is sqrt(2). - // The overall dynamic range is 14b (input) + 14b (multiplication scaling) - // + 1b (addition) = 29b. - // Hence the output bit depth is 15b. - output[0] = WRAPLOW(dct_const_round_shift(s0 + s3)); - output[1] = WRAPLOW(dct_const_round_shift(s1 + s3)); - output[2] = WRAPLOW(dct_const_round_shift(s2)); - output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3)); -} - -void iadst8_c(const tran_low_t *input, tran_low_t *output) { - int s0, s1, s2, s3, s4, s5, s6, s7; - - tran_high_t x0 = input[7]; - tran_high_t x1 = input[0]; - tran_high_t x2 = input[5]; - tran_high_t x3 = input[2]; - tran_high_t x4 = input[3]; - tran_high_t x5 = input[4]; - tran_high_t x6 = input[1]; - tran_high_t x7 = input[6]; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { - output[0] = output[1] = output[2] = output[3] = output[4] - = output[5] = output[6] = output[7] = 0; - return; - } - - // stage 1 - s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1); - s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1); - s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3); - s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3); - s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5); - s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5); - s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7); - s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7); - - x0 = WRAPLOW(dct_const_round_shift(s0 + s4)); - x1 = WRAPLOW(dct_const_round_shift(s1 + s5)); - x2 = WRAPLOW(dct_const_round_shift(s2 + s6)); - x3 = WRAPLOW(dct_const_round_shift(s3 + s7)); - x4 = WRAPLOW(dct_const_round_shift(s0 - s4)); - x5 = WRAPLOW(dct_const_round_shift(s1 - s5)); - x6 = WRAPLOW(dct_const_round_shift(s2 - s6)); - x7 = WRAPLOW(dct_const_round_shift(s3 - s7)); - - // stage 2 - s0 = (int)x0; - s1 = (int)x1; - s2 = (int)x2; - s3 = (int)x3; - s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5); - s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5); - s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7); - s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7); - - x0 = WRAPLOW(s0 + s2); - x1 = WRAPLOW(s1 + s3); - x2 = WRAPLOW(s0 - s2); - x3 = WRAPLOW(s1 - s3); - x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); - x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); - x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); - x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); - - // stage 3 - s2 = (int)(cospi_16_64 * (x2 + x3)); - s3 = (int)(cospi_16_64 * (x2 - x3)); - s6 = (int)(cospi_16_64 * (x6 + x7)); - s7 = (int)(cospi_16_64 * (x6 - x7)); - - x2 = WRAPLOW(dct_const_round_shift(s2)); - x3 = WRAPLOW(dct_const_round_shift(s3)); - x6 = WRAPLOW(dct_const_round_shift(s6)); - x7 = WRAPLOW(dct_const_round_shift(s7)); - - output[0] = WRAPLOW(x0); - output[1] = WRAPLOW(-x4); - output[2] = WRAPLOW(x6); - output[3] = WRAPLOW(-x2); - output[4] = WRAPLOW(x3); - output[5] = WRAPLOW(-x7); - output[6] = WRAPLOW(x5); - output[7] = WRAPLOW(-x1); -} - -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[8], temp_out[8]; - - // First transform rows - // only first 4 row has non-zero coefs - for (i = 0; i < 4; ++i) { - idct8_c(input, outptr); - input += 8; - outptr += 8; - } - - // Then transform columns - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; - idct8_c(temp_in, temp_out); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - 
ROUND_POWER_OF_TWO(temp_out[j], 5)); - } - } -} - -void idct16_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[16], step2[16]; - tran_high_t temp1, temp2; - - // stage 1 - step1[0] = input[0/2]; - step1[1] = input[16/2]; - step1[2] = input[8/2]; - step1[3] = input[24/2]; - step1[4] = input[4/2]; - step1[5] = input[20/2]; - step1[6] = input[12/2]; - step1[7] = input[28/2]; - step1[8] = input[2/2]; - step1[9] = input[18/2]; - step1[10] = input[10/2]; - step1[11] = input[26/2]; - step1[12] = input[6/2]; - step1[13] = input[22/2]; - step1[14] = input[14/2]; - step1[15] = input[30/2]; - - // stage 2 - step2[0] = step1[0]; - step2[1] = step1[1]; - step2[2] = step1[2]; - step2[3] = step1[3]; - step2[4] = step1[4]; - step2[5] = step1[5]; - step2[6] = step1[6]; - step2[7] = step1[7]; - - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(dct_const_round_shift(temp1)); - step2[15] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1)); - step2[14] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1)); - step2[12] = WRAPLOW(dct_const_round_shift(temp2)); - - // stage 3 - step1[0] = step2[0]; - step1[1] = step2[1]; - step1[2] = step2[2]; - step1[3] = step2[3]; - - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1)); - step1[7] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - - step1[8] = WRAPLOW(step2[8] + step2[9]); - step1[9] = WRAPLOW(step2[8] - step2[9]); - step1[10] = WRAPLOW(-step2[10] + step2[11]); - step1[11] = WRAPLOW(step2[10] + step2[11]); - step1[12] = WRAPLOW(step2[12] + step2[13]); - step1[13] = WRAPLOW(step2[12] - step2[13]); - step1[14] = WRAPLOW(-step2[14] + step2[15]); - step1[15] = WRAPLOW(step2[14] + step2[15]); - - // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1)); - step2[1] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1)); - step2[3] = WRAPLOW(dct_const_round_shift(temp2)); - step2[4] = WRAPLOW(step1[4] + step1[5]); - step2[5] = WRAPLOW(step1[4] - step1[5]); - step2[6] = WRAPLOW(-step1[6] + step1[7]); - step2[7] = WRAPLOW(step1[6] + step1[7]); - - step2[8] = step1[8]; - step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1)); - step2[14] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = -step1[10] * cospi_24_64 - 
step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); - step2[11] = step1[11]; - step2[12] = step1[12]; - - // stage 5 - step1[0] = WRAPLOW(step2[0] + step2[3]); - step1[1] = WRAPLOW(step2[1] + step2[2]); - step1[2] = WRAPLOW(step2[1] - step2[2]); - step1[3] = WRAPLOW(step2[0] - step2[3]); - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - step1[7] = step2[7]; - - step1[8] = WRAPLOW(step2[8] + step2[11]); - step1[9] = WRAPLOW(step2[9] + step2[10]); - step1[10] = WRAPLOW(step2[9] - step2[10]); - step1[11] = WRAPLOW(step2[8] - step2[11]); - step1[12] = WRAPLOW(-step2[12] + step2[15]); - step1[13] = WRAPLOW(-step2[13] + step2[14]); - step1[14] = WRAPLOW(step2[13] + step2[14]); - step1[15] = WRAPLOW(step2[12] + step2[15]); - - // stage 6 - step2[0] = WRAPLOW(step1[0] + step1[7]); - step2[1] = WRAPLOW(step1[1] + step1[6]); - step2[2] = WRAPLOW(step1[2] + step1[5]); - step2[3] = WRAPLOW(step1[3] + step1[4]); - step2[4] = WRAPLOW(step1[3] - step1[4]); - step2[5] = WRAPLOW(step1[2] - step1[5]); - step2[6] = WRAPLOW(step1[1] - step1[6]); - step2[7] = WRAPLOW(step1[0] - step1[7]); - step2[8] = step1[8]; - step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1)); - step2[12] = WRAPLOW(dct_const_round_shift(temp2)); - step2[14] = step1[14]; - step2[15] = step1[15]; - - // stage 7 - output[0] = WRAPLOW(step2[0] + step2[15]); - output[1] = WRAPLOW(step2[1] + step2[14]); - output[2] = WRAPLOW(step2[2] + step2[13]); - output[3] = WRAPLOW(step2[3] + step2[12]); - output[4] = WRAPLOW(step2[4] + step2[11]); - output[5] = WRAPLOW(step2[5] + step2[10]); - output[6] = WRAPLOW(step2[6] + step2[9]); - output[7] = WRAPLOW(step2[7] + step2[8]); - output[8] = WRAPLOW(step2[7] - step2[8]); - output[9] = WRAPLOW(step2[6] - step2[9]); - output[10] = WRAPLOW(step2[5] - step2[10]); - output[11] = WRAPLOW(step2[4] - step2[11]); - output[12] = WRAPLOW(step2[3] - step2[12]); - output[13] = WRAPLOW(step2[2] - step2[13]); - output[14] = WRAPLOW(step2[1] - step2[14]); - output[15] = WRAPLOW(step2[0] - step2[15]); -} - -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, - int stride) { - tran_low_t out[16 * 16]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[16], temp_out[16]; - - // First transform rows - for (i = 0; i < 16; ++i) { - idct16_c(input, outptr); - input += 16; - outptr += 16; - } - - // Then transform columns - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j * 16 + i]; - idct16_c(temp_in, temp_out); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 6)); - } - } -} - -void iadst16_c(const tran_low_t *input, tran_low_t *output) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; - tran_high_t s9, s10, s11, s12, s13, s14, s15; - - tran_high_t x0 = input[15]; - tran_high_t x1 = input[0]; - tran_high_t x2 = input[13]; - tran_high_t x3 = input[2]; - tran_high_t x4 = 
input[11]; - tran_high_t x5 = input[4]; - tran_high_t x6 = input[9]; - tran_high_t x7 = input[6]; - tran_high_t x8 = input[7]; - tran_high_t x9 = input[8]; - tran_high_t x10 = input[5]; - tran_high_t x11 = input[10]; - tran_high_t x12 = input[3]; - tran_high_t x13 = input[12]; - tran_high_t x14 = input[1]; - tran_high_t x15 = input[14]; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 - | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { - output[0] = output[1] = output[2] = output[3] = output[4] - = output[5] = output[6] = output[7] = output[8] - = output[9] = output[10] = output[11] = output[12] - = output[13] = output[14] = output[15] = 0; - return; - } - - // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; - s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; - s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; - s5 = x4 * cospi_23_64 - x5 * cospi_9_64; - s6 = x6 * cospi_13_64 + x7 * cospi_19_64; - s7 = x6 * cospi_19_64 - x7 * cospi_13_64; - s8 = x8 * cospi_17_64 + x9 * cospi_15_64; - s9 = x8 * cospi_15_64 - x9 * cospi_17_64; - s10 = x10 * cospi_21_64 + x11 * cospi_11_64; - s11 = x10 * cospi_11_64 - x11 * cospi_21_64; - s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; - s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - - x0 = WRAPLOW(dct_const_round_shift(s0 + s8)); - x1 = WRAPLOW(dct_const_round_shift(s1 + s9)); - x2 = WRAPLOW(dct_const_round_shift(s2 + s10)); - x3 = WRAPLOW(dct_const_round_shift(s3 + s11)); - x4 = WRAPLOW(dct_const_round_shift(s4 + s12)); - x5 = WRAPLOW(dct_const_round_shift(s5 + s13)); - x6 = WRAPLOW(dct_const_round_shift(s6 + s14)); - x7 = WRAPLOW(dct_const_round_shift(s7 + s15)); - x8 = WRAPLOW(dct_const_round_shift(s0 - s8)); - x9 = WRAPLOW(dct_const_round_shift(s1 - s9)); - x10 = WRAPLOW(dct_const_round_shift(s2 - s10)); - x11 = WRAPLOW(dct_const_round_shift(s3 - s11)); - x12 = WRAPLOW(dct_const_round_shift(s4 - s12)); - x13 = WRAPLOW(dct_const_round_shift(s5 - s13)); - x14 = WRAPLOW(dct_const_round_shift(s6 - s14)); - x15 = WRAPLOW(dct_const_round_shift(s7 - s15)); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4; - s5 = x5; - s6 = x6; - s7 = x7; - s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - - x0 = WRAPLOW(s0 + s4); - x1 = WRAPLOW(s1 + s5); - x2 = WRAPLOW(s2 + s6); - x3 = WRAPLOW(s3 + s7); - x4 = WRAPLOW(s0 - s4); - x5 = WRAPLOW(s1 - s5); - x6 = WRAPLOW(s2 - s6); - x7 = WRAPLOW(s3 - s7); - x8 = WRAPLOW(dct_const_round_shift(s8 + s12)); - x9 = WRAPLOW(dct_const_round_shift(s9 + s13)); - x10 = WRAPLOW(dct_const_round_shift(s10 + s14)); - x11 = WRAPLOW(dct_const_round_shift(s11 + s15)); - x12 = WRAPLOW(dct_const_round_shift(s8 - s12)); - x13 = WRAPLOW(dct_const_round_shift(s9 - s13)); - x14 = WRAPLOW(dct_const_round_shift(s10 - s14)); - x15 = WRAPLOW(dct_const_round_shift(s11 - s15)); - - // stage 3 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; - s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; - s8 = x8; - s9 = x9; - s10 = x10; - s11 = x11; - s12 = x12 * 
cospi_8_64 + x13 * cospi_24_64; - s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - - x0 = WRAPLOW(s0 + s2); - x1 = WRAPLOW(s1 + s3); - x2 = WRAPLOW(s0 - s2); - x3 = WRAPLOW(s1 - s3); - x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); - x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); - x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); - x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); - x8 = WRAPLOW(s8 + s10); - x9 = WRAPLOW(s9 + s11); - x10 = WRAPLOW(s8 - s10); - x11 = WRAPLOW(s9 - s11); - x12 = WRAPLOW(dct_const_round_shift(s12 + s14)); - x13 = WRAPLOW(dct_const_round_shift(s13 + s15)); - x14 = WRAPLOW(dct_const_round_shift(s12 - s14)); - x15 = WRAPLOW(dct_const_round_shift(s13 - s15)); - - // stage 4 - s2 = (- cospi_16_64) * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (- x6 + x7); - s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (- x10 + x11); - s14 = (- cospi_16_64) * (x14 + x15); - s15 = cospi_16_64 * (x14 - x15); - - x2 = WRAPLOW(dct_const_round_shift(s2)); - x3 = WRAPLOW(dct_const_round_shift(s3)); - x6 = WRAPLOW(dct_const_round_shift(s6)); - x7 = WRAPLOW(dct_const_round_shift(s7)); - x10 = WRAPLOW(dct_const_round_shift(s10)); - x11 = WRAPLOW(dct_const_round_shift(s11)); - x14 = WRAPLOW(dct_const_round_shift(s14)); - x15 = WRAPLOW(dct_const_round_shift(s15)); - - output[0] = WRAPLOW(x0); - output[1] = WRAPLOW(-x8); - output[2] = WRAPLOW(x12); - output[3] = WRAPLOW(-x4); - output[4] = WRAPLOW(x6); - output[5] = WRAPLOW(x14); - output[6] = WRAPLOW(x10); - output[7] = WRAPLOW(x2); - output[8] = WRAPLOW(x3); - output[9] = WRAPLOW(x11); - output[10] = WRAPLOW(x15); - output[11] = WRAPLOW(x7); - output[12] = WRAPLOW(x5); - output[13] = WRAPLOW(-x13); - output[14] = WRAPLOW(x9); - output[15] = WRAPLOW(-x1); -} - -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, - int stride) { - tran_low_t out[16 * 16] = { 0 }; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[16], temp_out[16]; - - // First transform rows. Since all non-zero dct coefficients are in - // upper-left 4x4 area, we only need to calculate first 4 rows here. 
- for (i = 0; i < 4; ++i) { - idct16_c(input, outptr); - input += 16; - outptr += 16; - } - - // Then transform columns - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j*16 + i]; - idct16_c(temp_in, temp_out); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 6)); - } - } -} - -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - int i, j; - tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); - a1 = ROUND_POWER_OF_TWO(out, 6); - for (j = 0; j < 16; ++j) { - for (i = 0; i < 16; ++i) - dest[i] = clip_pixel_add(dest[i], a1); - dest += stride; - } -} - -void idct32_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[32], step2[32]; - tran_high_t temp1, temp2; - - // stage 1 - step1[0] = input[0]; - step1[1] = input[16]; - step1[2] = input[8]; - step1[3] = input[24]; - step1[4] = input[4]; - step1[5] = input[20]; - step1[6] = input[12]; - step1[7] = input[28]; - step1[8] = input[2]; - step1[9] = input[18]; - step1[10] = input[10]; - step1[11] = input[26]; - step1[12] = input[6]; - step1[13] = input[22]; - step1[14] = input[14]; - step1[15] = input[30]; - - temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; - temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; - step1[16] = WRAPLOW(dct_const_round_shift(temp1)); - step1[31] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; - temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; - step1[17] = WRAPLOW(dct_const_round_shift(temp1)); - step1[30] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; - temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1)); - step1[29] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; - temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; - step1[19] = WRAPLOW(dct_const_round_shift(temp1)); - step1[28] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; - temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1)); - step1[27] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; - temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1)); - step1[26] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; - temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1)); - step1[25] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; - temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; - step1[23] = WRAPLOW(dct_const_round_shift(temp1)); - step1[24] = WRAPLOW(dct_const_round_shift(temp2)); - - // stage 2 - step2[0] = step1[0]; - step2[1] = step1[1]; - step2[2] = step1[2]; - step2[3] = step1[3]; - step2[4] = step1[4]; - step2[5] = step1[5]; - step2[6] = step1[6]; - step2[7] = step1[7]; - - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(dct_const_round_shift(temp1)); - step2[15] = WRAPLOW(dct_const_round_shift(temp2)); 
- - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1)); - step2[14] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1)); - step2[12] = WRAPLOW(dct_const_round_shift(temp2)); - - step2[16] = WRAPLOW(step1[16] + step1[17]); - step2[17] = WRAPLOW(step1[16] - step1[17]); - step2[18] = WRAPLOW(-step1[18] + step1[19]); - step2[19] = WRAPLOW(step1[18] + step1[19]); - step2[20] = WRAPLOW(step1[20] + step1[21]); - step2[21] = WRAPLOW(step1[20] - step1[21]); - step2[22] = WRAPLOW(-step1[22] + step1[23]); - step2[23] = WRAPLOW(step1[22] + step1[23]); - step2[24] = WRAPLOW(step1[24] + step1[25]); - step2[25] = WRAPLOW(step1[24] - step1[25]); - step2[26] = WRAPLOW(-step1[26] + step1[27]); - step2[27] = WRAPLOW(step1[26] + step1[27]); - step2[28] = WRAPLOW(step1[28] + step1[29]); - step2[29] = WRAPLOW(step1[28] - step1[29]); - step2[30] = WRAPLOW(-step1[30] + step1[31]); - step2[31] = WRAPLOW(step1[30] + step1[31]); - - // stage 3 - step1[0] = step2[0]; - step1[1] = step2[1]; - step1[2] = step2[2]; - step1[3] = step2[3]; - - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1)); - step1[7] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - - step1[8] = WRAPLOW(step2[8] + step2[9]); - step1[9] = WRAPLOW(step2[8] - step2[9]); - step1[10] = WRAPLOW(-step2[10] + step2[11]); - step1[11] = WRAPLOW(step2[10] + step2[11]); - step1[12] = WRAPLOW(step2[12] + step2[13]); - step1[13] = WRAPLOW(step2[12] - step2[13]); - step1[14] = WRAPLOW(-step2[14] + step2[15]); - step1[15] = WRAPLOW(step2[14] + step2[15]); - - step1[16] = step2[16]; - step1[31] = step2[31]; - temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; - temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; - step1[17] = WRAPLOW(dct_const_round_shift(temp1)); - step1[30] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; - temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1)); - step1[29] = WRAPLOW(dct_const_round_shift(temp2)); - step1[19] = step2[19]; - step1[20] = step2[20]; - temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; - temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1)); - step1[26] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; - temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1)); - step1[25] = WRAPLOW(dct_const_round_shift(temp2)); - step1[23] = step2[23]; - step1[24] = step2[24]; - step1[27] = step2[27]; - step1[28] = step2[28]; - - // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) 
* cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1)); - step2[1] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1)); - step2[3] = WRAPLOW(dct_const_round_shift(temp2)); - step2[4] = WRAPLOW(step1[4] + step1[5]); - step2[5] = WRAPLOW(step1[4] - step1[5]); - step2[6] = WRAPLOW(-step1[6] + step1[7]); - step2[7] = WRAPLOW(step1[6] + step1[7]); - - step2[8] = step1[8]; - step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1)); - step2[14] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); - step2[11] = step1[11]; - step2[12] = step1[12]; - - step2[16] = WRAPLOW(step1[16] + step1[19]); - step2[17] = WRAPLOW(step1[17] + step1[18]); - step2[18] = WRAPLOW(step1[17] - step1[18]); - step2[19] = WRAPLOW(step1[16] - step1[19]); - step2[20] = WRAPLOW(-step1[20] + step1[23]); - step2[21] = WRAPLOW(-step1[21] + step1[22]); - step2[22] = WRAPLOW(step1[21] + step1[22]); - step2[23] = WRAPLOW(step1[20] + step1[23]); - - step2[24] = WRAPLOW(step1[24] + step1[27]); - step2[25] = WRAPLOW(step1[25] + step1[26]); - step2[26] = WRAPLOW(step1[25] - step1[26]); - step2[27] = WRAPLOW(step1[24] - step1[27]); - step2[28] = WRAPLOW(-step1[28] + step1[31]); - step2[29] = WRAPLOW(-step1[29] + step1[30]); - step2[30] = WRAPLOW(step1[29] + step1[30]); - step2[31] = WRAPLOW(step1[28] + step1[31]); - - // stage 5 - step1[0] = WRAPLOW(step2[0] + step2[3]); - step1[1] = WRAPLOW(step2[1] + step2[2]); - step1[2] = WRAPLOW(step2[1] - step2[2]); - step1[3] = WRAPLOW(step2[0] - step2[3]); - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - step1[7] = step2[7]; - - step1[8] = WRAPLOW(step2[8] + step2[11]); - step1[9] = WRAPLOW(step2[9] + step2[10]); - step1[10] = WRAPLOW(step2[9] - step2[10]); - step1[11] = WRAPLOW(step2[8] - step2[11]); - step1[12] = WRAPLOW(-step2[12] + step2[15]); - step1[13] = WRAPLOW(-step2[13] + step2[14]); - step1[14] = WRAPLOW(step2[13] + step2[14]); - step1[15] = WRAPLOW(step2[12] + step2[15]); - - step1[16] = step2[16]; - step1[17] = step2[17]; - temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; - temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1)); - step1[29] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; - temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; - step1[19] = WRAPLOW(dct_const_round_shift(temp1)); - step1[28] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; - temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1)); - step1[27] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; - temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1)); - step1[26] = 
WRAPLOW(dct_const_round_shift(temp2)); - step1[22] = step2[22]; - step1[23] = step2[23]; - step1[24] = step2[24]; - step1[25] = step2[25]; - step1[30] = step2[30]; - step1[31] = step2[31]; - - // stage 6 - step2[0] = WRAPLOW(step1[0] + step1[7]); - step2[1] = WRAPLOW(step1[1] + step1[6]); - step2[2] = WRAPLOW(step1[2] + step1[5]); - step2[3] = WRAPLOW(step1[3] + step1[4]); - step2[4] = WRAPLOW(step1[3] - step1[4]); - step2[5] = WRAPLOW(step1[2] - step1[5]); - step2[6] = WRAPLOW(step1[1] - step1[6]); - step2[7] = WRAPLOW(step1[0] - step1[7]); - step2[8] = step1[8]; - step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1)); - step2[12] = WRAPLOW(dct_const_round_shift(temp2)); - step2[14] = step1[14]; - step2[15] = step1[15]; - - step2[16] = WRAPLOW(step1[16] + step1[23]); - step2[17] = WRAPLOW(step1[17] + step1[22]); - step2[18] = WRAPLOW(step1[18] + step1[21]); - step2[19] = WRAPLOW(step1[19] + step1[20]); - step2[20] = WRAPLOW(step1[19] - step1[20]); - step2[21] = WRAPLOW(step1[18] - step1[21]); - step2[22] = WRAPLOW(step1[17] - step1[22]); - step2[23] = WRAPLOW(step1[16] - step1[23]); - - step2[24] = WRAPLOW(-step1[24] + step1[31]); - step2[25] = WRAPLOW(-step1[25] + step1[30]); - step2[26] = WRAPLOW(-step1[26] + step1[29]); - step2[27] = WRAPLOW(-step1[27] + step1[28]); - step2[28] = WRAPLOW(step1[27] + step1[28]); - step2[29] = WRAPLOW(step1[26] + step1[29]); - step2[30] = WRAPLOW(step1[25] + step1[30]); - step2[31] = WRAPLOW(step1[24] + step1[31]); - - // stage 7 - step1[0] = WRAPLOW(step2[0] + step2[15]); - step1[1] = WRAPLOW(step2[1] + step2[14]); - step1[2] = WRAPLOW(step2[2] + step2[13]); - step1[3] = WRAPLOW(step2[3] + step2[12]); - step1[4] = WRAPLOW(step2[4] + step2[11]); - step1[5] = WRAPLOW(step2[5] + step2[10]); - step1[6] = WRAPLOW(step2[6] + step2[9]); - step1[7] = WRAPLOW(step2[7] + step2[8]); - step1[8] = WRAPLOW(step2[7] - step2[8]); - step1[9] = WRAPLOW(step2[6] - step2[9]); - step1[10] = WRAPLOW(step2[5] - step2[10]); - step1[11] = WRAPLOW(step2[4] - step2[11]); - step1[12] = WRAPLOW(step2[3] - step2[12]); - step1[13] = WRAPLOW(step2[2] - step2[13]); - step1[14] = WRAPLOW(step2[1] - step2[14]); - step1[15] = WRAPLOW(step2[0] - step2[15]); - - step1[16] = step2[16]; - step1[17] = step2[17]; - step1[18] = step2[18]; - step1[19] = step2[19]; - temp1 = (-step2[20] + step2[27]) * cospi_16_64; - temp2 = (step2[20] + step2[27]) * cospi_16_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1)); - step1[27] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = (-step2[21] + step2[26]) * cospi_16_64; - temp2 = (step2[21] + step2[26]) * cospi_16_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1)); - step1[26] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = (-step2[22] + step2[25]) * cospi_16_64; - temp2 = (step2[22] + step2[25]) * cospi_16_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1)); - step1[25] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = (-step2[23] + step2[24]) * cospi_16_64; - temp2 = (step2[23] + step2[24]) * cospi_16_64; - step1[23] = WRAPLOW(dct_const_round_shift(temp1)); - step1[24] = WRAPLOW(dct_const_round_shift(temp2)); - step1[28] = step2[28]; - step1[29] = step2[29]; - step1[30] = step2[30]; - step1[31] = step2[31]; - - // final 
stage - output[0] = WRAPLOW(step1[0] + step1[31]); - output[1] = WRAPLOW(step1[1] + step1[30]); - output[2] = WRAPLOW(step1[2] + step1[29]); - output[3] = WRAPLOW(step1[3] + step1[28]); - output[4] = WRAPLOW(step1[4] + step1[27]); - output[5] = WRAPLOW(step1[5] + step1[26]); - output[6] = WRAPLOW(step1[6] + step1[25]); - output[7] = WRAPLOW(step1[7] + step1[24]); - output[8] = WRAPLOW(step1[8] + step1[23]); - output[9] = WRAPLOW(step1[9] + step1[22]); - output[10] = WRAPLOW(step1[10] + step1[21]); - output[11] = WRAPLOW(step1[11] + step1[20]); - output[12] = WRAPLOW(step1[12] + step1[19]); - output[13] = WRAPLOW(step1[13] + step1[18]); - output[14] = WRAPLOW(step1[14] + step1[17]); - output[15] = WRAPLOW(step1[15] + step1[16]); - output[16] = WRAPLOW(step1[15] - step1[16]); - output[17] = WRAPLOW(step1[14] - step1[17]); - output[18] = WRAPLOW(step1[13] - step1[18]); - output[19] = WRAPLOW(step1[12] - step1[19]); - output[20] = WRAPLOW(step1[11] - step1[20]); - output[21] = WRAPLOW(step1[10] - step1[21]); - output[22] = WRAPLOW(step1[9] - step1[22]); - output[23] = WRAPLOW(step1[8] - step1[23]); - output[24] = WRAPLOW(step1[7] - step1[24]); - output[25] = WRAPLOW(step1[6] - step1[25]); - output[26] = WRAPLOW(step1[5] - step1[26]); - output[27] = WRAPLOW(step1[4] - step1[27]); - output[28] = WRAPLOW(step1[3] - step1[28]); - output[29] = WRAPLOW(step1[2] - step1[29]); - output[30] = WRAPLOW(step1[1] - step1[30]); - output[31] = WRAPLOW(step1[0] - step1[31]); -} - -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, - int stride) { - tran_low_t out[32 * 32]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[32], temp_out[32]; - - // Rows - for (i = 0; i < 32; ++i) { - int16_t zero_coeff[16]; - for (j = 0; j < 16; ++j) - zero_coeff[j] = input[2 * j] | input[2 * j + 1]; - for (j = 0; j < 8; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 4; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 2; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - - if (zero_coeff[0] | zero_coeff[1]) - idct32_c(input, outptr); - else - memset(outptr, 0, sizeof(tran_low_t) * 32); - input += 32; - outptr += 32; - } - - // Columns - for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) - temp_in[j] = out[j * 32 + i]; - idct32_c(temp_in, temp_out); - for (j = 0; j < 32; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 6)); - } - } -} - -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, - int stride) { - tran_low_t out[32 * 32] = {0}; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[32], temp_out[32]; - - // Rows - // only upper-left 16x16 has non-zero coeff - for (i = 0; i < 16; ++i) { - idct32_c(input, outptr); - input += 32; - outptr += 32; - } - - // Columns - for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) - temp_in[j] = out[j * 32 + i]; - idct32_c(temp_in, temp_out); - for (j = 0; j < 32; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 6)); - } - } -} - -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, - int stride) { - tran_low_t out[32 * 32] = {0}; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[32], temp_out[32]; - - // Rows - // only upper-left 8x8 has non-zero coeff - for (i = 0; i < 8; ++i) { - idct32_c(input, outptr); - input += 32; - outptr += 32; - } - - // Columns - for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; 
++j) - temp_in[j] = out[j * 32 + i]; - idct32_c(temp_in, temp_out); - for (j = 0; j < 32; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 6)); - } - } -} - -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - int i, j; - tran_high_t a1; - - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); - a1 = ROUND_POWER_OF_TWO(out, 6); - - for (j = 0; j < 32; ++j) { - for (i = 0; i < 32; ++i) - dest[i] = clip_pixel_add(dest[i], a1); - dest += stride; - } -} - -#if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, - 0.5 shifts per pixel. */ - int i; - tran_low_t output[16]; - tran_high_t a1, b1, c1, d1, e1; - const tran_low_t *ip = input; - tran_low_t *op = output; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - for (i = 0; i < 4; i++) { - a1 = ip[0] >> UNIT_QUANT_SHIFT; - c1 = ip[1] >> UNIT_QUANT_SHIFT; - d1 = ip[2] >> UNIT_QUANT_SHIFT; - b1 = ip[3] >> UNIT_QUANT_SHIFT; - a1 += c1; - d1 -= b1; - e1 = (a1 - d1) >> 1; - b1 = e1 - b1; - c1 = e1 - c1; - a1 -= b1; - d1 += c1; - op[0] = HIGHBD_WRAPLOW(a1, bd); - op[1] = HIGHBD_WRAPLOW(b1, bd); - op[2] = HIGHBD_WRAPLOW(c1, bd); - op[3] = HIGHBD_WRAPLOW(d1, bd); - ip += 4; - op += 4; - } - - ip = output; - for (i = 0; i < 4; i++) { - a1 = ip[4 * 0]; - c1 = ip[4 * 1]; - d1 = ip[4 * 2]; - b1 = ip[4 * 3]; - a1 += c1; - d1 -= b1; - e1 = (a1 - d1) >> 1; - b1 = e1 - b1; - c1 = e1 - c1; - a1 -= b1; - d1 += c1; - dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], - HIGHBD_WRAPLOW(a1, bd), bd); - dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], - HIGHBD_WRAPLOW(b1, bd), bd); - dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], - HIGHBD_WRAPLOW(c1, bd), bd); - dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], - HIGHBD_WRAPLOW(d1, bd), bd); - - ip++; - dest++; - } -} - -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, - int dest_stride, int bd) { - int i; - tran_high_t a1, e1; - tran_low_t tmp[4]; - const tran_low_t *ip = in; - tran_low_t *op = tmp; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - (void) bd; - - a1 = ip[0] >> UNIT_QUANT_SHIFT; - e1 = a1 >> 1; - a1 -= e1; - op[0] = HIGHBD_WRAPLOW(a1, bd); - op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd); - - ip = tmp; - for (i = 0; i < 4; i++) { - e1 = ip[0] >> 1; - a1 = ip[0] - e1; - dest[dest_stride * 0] = highbd_clip_pixel_add( - dest[dest_stride * 0], a1, bd); - dest[dest_stride * 1] = highbd_clip_pixel_add( - dest[dest_stride * 1], e1, bd); - dest[dest_stride * 2] = highbd_clip_pixel_add( - dest[dest_stride * 2], e1, bd); - dest[dest_stride * 3] = highbd_clip_pixel_add( - dest[dest_stride * 3], e1, bd); - ip++; - dest++; - } -} - -void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_low_t step[4]; - tran_high_t temp1, temp2; - (void) bd; - // stage 1 - temp1 = (input[0] + input[2]) * cospi_16_64; - temp2 = (input[0] - input[2]) * cospi_16_64; - step[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step[3] = 
HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - // stage 2 - output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd); - output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd); - output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd); - output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd); -} - -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[4], temp_out[4]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // Rows - for (i = 0; i < 4; ++i) { - vpx_highbd_idct4_c(input, outptr, bd); - input += 4; - outptr += 4; - } - - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = out[j * 4 + i]; - vpx_highbd_idct4_c(temp_in, temp_out, bd); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); - } - } -} - -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, - int dest_stride, int bd) { - int i; - tran_high_t a1; - tran_low_t out = HIGHBD_WRAPLOW( - highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); - a1 = ROUND_POWER_OF_TWO(out, 4); - - for (i = 0; i < 4; i++) { - dest[0] = highbd_clip_pixel_add(dest[0], a1, bd); - dest[1] = highbd_clip_pixel_add(dest[1], a1, bd); - dest[2] = highbd_clip_pixel_add(dest[2], a1, bd); - dest[3] = highbd_clip_pixel_add(dest[3], a1, bd); - dest += dest_stride; - } -} - -void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_low_t step1[8], step2[8]; - tran_high_t temp1, temp2; - // stage 1 - step1[0] = input[0]; - step1[2] = input[4]; - step1[1] = input[2]; - step1[3] = input[6]; - temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; - temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; - temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - // stage 2 & stage 3 - even half - vpx_highbd_idct4_c(step1, step1, bd); - - // stage 2 - odd half - step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); - step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); - step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); - step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); - - // stage 3 - odd half - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step1[7] = step2[7]; - - // stage 4 - output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); - output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); - output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); - output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); - output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); - output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); - output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); - output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); -} - -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - 
tran_low_t out[8 * 8]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[8], temp_out[8]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // First transform rows. - for (i = 0; i < 8; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - - // Then transform columns. - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } -} - -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - int i, j; - tran_high_t a1; - tran_low_t out = HIGHBD_WRAPLOW( - highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); - a1 = ROUND_POWER_OF_TWO(out, 5); - for (j = 0; j < 8; ++j) { - for (i = 0; i < 8; ++i) - dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); - dest += stride; - } -} - -void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - - tran_low_t x0 = input[0]; - tran_low_t x1 = input[1]; - tran_low_t x2 = input[2]; - tran_low_t x3 = input[3]; - (void) bd; - - if (!(x0 | x1 | x2 | x3)) { - memset(output, 0, 4 * sizeof(*output)); - return; - } - - s0 = sinpi_1_9 * x0; - s1 = sinpi_2_9 * x0; - s2 = sinpi_3_9 * x1; - s3 = sinpi_4_9 * x2; - s4 = sinpi_1_9 * x2; - s5 = sinpi_2_9 * x3; - s6 = sinpi_4_9 * x3; - s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd); - - s0 = s0 + s3 + s5; - s1 = s1 - s4 - s6; - s3 = s2; - s2 = sinpi_3_9 * s7; - - // 1-D transform scaling factor is sqrt(2). - // The overall dynamic range is 14b (input) + 14b (multiplication scaling) - // + 1b (addition) = 29b. - // Hence the output bit depth is 15b. 
- output[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s3), bd); - output[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s3), bd); - output[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); - output[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3), bd); -} - -void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - - tran_low_t x0 = input[7]; - tran_low_t x1 = input[0]; - tran_low_t x2 = input[5]; - tran_low_t x3 = input[2]; - tran_low_t x4 = input[3]; - tran_low_t x5 = input[4]; - tran_low_t x6 = input[1]; - tran_low_t x7 = input[6]; - (void) bd; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { - memset(output, 0, 8 * sizeof(*output)); - return; - } - - // stage 1 - s0 = cospi_2_64 * x0 + cospi_30_64 * x1; - s1 = cospi_30_64 * x0 - cospi_2_64 * x1; - s2 = cospi_10_64 * x2 + cospi_22_64 * x3; - s3 = cospi_22_64 * x2 - cospi_10_64 * x3; - s4 = cospi_18_64 * x4 + cospi_14_64 * x5; - s5 = cospi_14_64 * x4 - cospi_18_64 * x5; - s6 = cospi_26_64 * x6 + cospi_6_64 * x7; - s7 = cospi_6_64 * x6 - cospi_26_64 * x7; - - x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd); - x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd); - x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s6), bd); - x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s7), bd); - x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s4), bd); - x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s5), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s7), bd); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = cospi_8_64 * x4 + cospi_24_64 * x5; - s5 = cospi_24_64 * x4 - cospi_8_64 * x5; - s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; - s7 = cospi_8_64 * x6 + cospi_24_64 * x7; - - x0 = HIGHBD_WRAPLOW(s0 + s2, bd); - x1 = HIGHBD_WRAPLOW(s1 + s3, bd); - x2 = HIGHBD_WRAPLOW(s0 - s2, bd); - x3 = HIGHBD_WRAPLOW(s1 - s3, bd); - x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd); - x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd); - - // stage 3 - s2 = cospi_16_64 * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (x6 - x7); - - x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); - x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd); - - output[0] = HIGHBD_WRAPLOW(x0, bd); - output[1] = HIGHBD_WRAPLOW(-x4, bd); - output[2] = HIGHBD_WRAPLOW(x6, bd); - output[3] = HIGHBD_WRAPLOW(-x2, bd); - output[4] = HIGHBD_WRAPLOW(x3, bd); - output[5] = HIGHBD_WRAPLOW(-x7, bd); - output[6] = HIGHBD_WRAPLOW(x5, bd); - output[7] = HIGHBD_WRAPLOW(-x1, bd); -} - -void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[8], temp_out[8]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // First transform rows. - // Only first 4 row has non-zero coefs. - for (i = 0; i < 4; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - // Then transform columns. 
- for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } -} - -void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_low_t step1[16], step2[16]; - tran_high_t temp1, temp2; - (void) bd; - - // stage 1 - step1[0] = input[0/2]; - step1[1] = input[16/2]; - step1[2] = input[8/2]; - step1[3] = input[24/2]; - step1[4] = input[4/2]; - step1[5] = input[20/2]; - step1[6] = input[12/2]; - step1[7] = input[28/2]; - step1[8] = input[2/2]; - step1[9] = input[18/2]; - step1[10] = input[10/2]; - step1[11] = input[26/2]; - step1[12] = input[6/2]; - step1[13] = input[22/2]; - step1[14] = input[14/2]; - step1[15] = input[30/2]; - - // stage 2 - step2[0] = step1[0]; - step2[1] = step1[1]; - step2[2] = step1[2]; - step2[3] = step1[3]; - step2[4] = step1[4]; - step2[5] = step1[5]; - step2[6] = step1[6]; - step2[7] = step1[7]; - - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - // stage 3 - step1[0] = step2[0]; - step1[1] = step2[1]; - step1[2] = step2[2]; - step1[3] = step2[3]; - - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); - step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); - step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd); - step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd); - step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd); - step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd); - step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd); - step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd); - - // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * 
cospi_24_64; - step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); - step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); - step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); - step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); - - step2[8] = step1[8]; - step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step2[11] = step1[11]; - step2[12] = step1[12]; - - // stage 5 - step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd); - step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd); - step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); - step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step1[7] = step2[7]; - - step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); - step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd); - step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd); - step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd); - step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd); - step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd); - step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd); - step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd); - - // stage 6 - step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); - step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); - step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); - step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); - step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); - step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); - step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); - step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); - step2[8] = step1[8]; - step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step2[14] = step1[14]; - step2[15] = step1[15]; - - // stage 7 - output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd); - output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd); - output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd); - output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd); - output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd); - output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd); - output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd); - output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd); - output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd); - output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd); 
- output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd); - output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd); - output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd); - output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd); - output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd); - output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd); -} - -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[16], temp_out[16]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // First transform rows. - for (i = 0; i < 16; ++i) { - vpx_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; - } - - // Then transform columns. - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j * 16 + i]; - vpx_highbd_idct16_c(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } -} - -void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; - tran_high_t s9, s10, s11, s12, s13, s14, s15; - - tran_low_t x0 = input[15]; - tran_low_t x1 = input[0]; - tran_low_t x2 = input[13]; - tran_low_t x3 = input[2]; - tran_low_t x4 = input[11]; - tran_low_t x5 = input[4]; - tran_low_t x6 = input[9]; - tran_low_t x7 = input[6]; - tran_low_t x8 = input[7]; - tran_low_t x9 = input[8]; - tran_low_t x10 = input[5]; - tran_low_t x11 = input[10]; - tran_low_t x12 = input[3]; - tran_low_t x13 = input[12]; - tran_low_t x14 = input[1]; - tran_low_t x15 = input[14]; - (void) bd; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 - | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { - memset(output, 0, 16 * sizeof(*output)); - return; - } - - // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; - s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; - s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; - s5 = x4 * cospi_23_64 - x5 * cospi_9_64; - s6 = x6 * cospi_13_64 + x7 * cospi_19_64; - s7 = x6 * cospi_19_64 - x7 * cospi_13_64; - s8 = x8 * cospi_17_64 + x9 * cospi_15_64; - s9 = x8 * cospi_15_64 - x9 * cospi_17_64; - s10 = x10 * cospi_21_64 + x11 * cospi_11_64; - s11 = x10 * cospi_11_64 - x11 * cospi_21_64; - s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; - s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - - x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd); - x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd); - x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s10), bd); - x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s11), bd); - x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s12), bd); - x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd); - x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd); - x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd); - x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd); - x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd); - x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd); - x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s13), bd); - x14 = 
HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 - s14), bd); - x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 - s15), bd); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4; - s5 = x5; - s6 = x6; - s7 = x7; - s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - - x0 = HIGHBD_WRAPLOW(s0 + s4, bd); - x1 = HIGHBD_WRAPLOW(s1 + s5, bd); - x2 = HIGHBD_WRAPLOW(s2 + s6, bd); - x3 = HIGHBD_WRAPLOW(s3 + s7, bd); - x4 = HIGHBD_WRAPLOW(s0 - s4, bd); - x5 = HIGHBD_WRAPLOW(s1 - s5, bd); - x6 = HIGHBD_WRAPLOW(s2 - s6, bd); - x7 = HIGHBD_WRAPLOW(s3 - s7, bd); - x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 + s12), bd); - x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 + s13), bd); - x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 + s14), bd); - x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 + s15), bd); - x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 - s12), bd); - x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 - s13), bd); - x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 - s14), bd); - x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 - s15), bd); - - // stage 3 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; - s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; - s8 = x8; - s9 = x9; - s10 = x10; - s11 = x11; - s12 = x12 * cospi_8_64 + x13 * cospi_24_64; - s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - - x0 = HIGHBD_WRAPLOW(s0 + s2, bd); - x1 = HIGHBD_WRAPLOW(s1 + s3, bd); - x2 = HIGHBD_WRAPLOW(s0 - s2, bd); - x3 = HIGHBD_WRAPLOW(s1 - s3, bd); - x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd); - x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd); - x8 = HIGHBD_WRAPLOW(s8 + s10, bd); - x9 = HIGHBD_WRAPLOW(s9 + s11, bd); - x10 = HIGHBD_WRAPLOW(s8 - s10, bd); - x11 = HIGHBD_WRAPLOW(s9 - s11, bd); - x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 + s14), bd); - x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 + s15), bd); - x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 - s14), bd); - x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd); - - // stage 4 - s2 = (- cospi_16_64) * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (-x6 + x7); - s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (-x10 + x11); - s14 = (- cospi_16_64) * (x14 + x15); - s15 = cospi_16_64 * (x14 - x15); - - x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); - x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd); - x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10), bd); - x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11), bd); - x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s14), bd); - x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s15), bd); - - output[0] = 
HIGHBD_WRAPLOW(x0, bd); - output[1] = HIGHBD_WRAPLOW(-x8, bd); - output[2] = HIGHBD_WRAPLOW(x12, bd); - output[3] = HIGHBD_WRAPLOW(-x4, bd); - output[4] = HIGHBD_WRAPLOW(x6, bd); - output[5] = HIGHBD_WRAPLOW(x14, bd); - output[6] = HIGHBD_WRAPLOW(x10, bd); - output[7] = HIGHBD_WRAPLOW(x2, bd); - output[8] = HIGHBD_WRAPLOW(x3, bd); - output[9] = HIGHBD_WRAPLOW(x11, bd); - output[10] = HIGHBD_WRAPLOW(x15, bd); - output[11] = HIGHBD_WRAPLOW(x7, bd); - output[12] = HIGHBD_WRAPLOW(x5, bd); - output[13] = HIGHBD_WRAPLOW(-x13, bd); - output[14] = HIGHBD_WRAPLOW(x9, bd); - output[15] = HIGHBD_WRAPLOW(-x1, bd); -} - -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16] = { 0 }; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[16], temp_out[16]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // First transform rows. Since all non-zero dct coefficients are in - // upper-left 4x4 area, we only need to calculate first 4 rows here. - for (i = 0; i < 4; ++i) { - vpx_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; - } - - // Then transform columns. - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j*16 + i]; - vpx_highbd_idct16_c(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } -} - -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - int i, j; - tran_high_t a1; - tran_low_t out = HIGHBD_WRAPLOW( - highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); - a1 = ROUND_POWER_OF_TWO(out, 6); - for (j = 0; j < 16; ++j) { - for (i = 0; i < 16; ++i) - dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); - dest += stride; - } -} - -static void highbd_idct32_c(const tran_low_t *input, - tran_low_t *output, int bd) { - tran_low_t step1[32], step2[32]; - tran_high_t temp1, temp2; - (void) bd; - - // stage 1 - step1[0] = input[0]; - step1[1] = input[16]; - step1[2] = input[8]; - step1[3] = input[24]; - step1[4] = input[4]; - step1[5] = input[20]; - step1[6] = input[12]; - step1[7] = input[28]; - step1[8] = input[2]; - step1[9] = input[18]; - step1[10] = input[10]; - step1[11] = input[26]; - step1[12] = input[6]; - step1[13] = input[22]; - step1[14] = input[14]; - step1[15] = input[30]; - - temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; - temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; - step1[16] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[31] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; - temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; - step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; - temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; - step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; - temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; - step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[28] = 
HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; - temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; - step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; - temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; - step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; - temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; - step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; - temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; - step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - // stage 2 - step2[0] = step1[0]; - step2[1] = step1[1]; - step2[2] = step1[2]; - step2[3] = step1[3]; - step2[4] = step1[4]; - step2[5] = step1[5]; - step2[6] = step1[6]; - step2[7] = step1[7]; - - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd); - step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd); - step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd); - step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd); - step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd); - step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd); - step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd); - step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd); - step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd); - step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd); - step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd); - step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd); - step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd); - step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd); - step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd); - step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd); - - // stage 3 - step1[0] = step2[0]; - step1[1] = step2[1]; - step1[2] = step2[2]; - step1[3] = step2[3]; - - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = 
HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); - step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); - step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd); - step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd); - step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd); - step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd); - step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd); - step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd); - - step1[16] = step2[16]; - step1[31] = step2[31]; - temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; - temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; - step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; - temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; - step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step1[19] = step2[19]; - step1[20] = step2[20]; - temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; - temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; - step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; - temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; - step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step1[23] = step2[23]; - step1[24] = step2[24]; - step1[27] = step2[27]; - step1[28] = step2[28]; - - // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); - step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); - step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); - step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); - - step2[8] = step1[8]; - step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step2[11] = step1[11]; - step2[12] = step1[12]; - - step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd); - step2[17] = HIGHBD_WRAPLOW(step1[17] + 
step1[18], bd); - step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd); - step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd); - step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd); - step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd); - step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd); - step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd); - - step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd); - step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd); - step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd); - step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd); - step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd); - step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd); - step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd); - step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd); - - // stage 5 - step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd); - step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd); - step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); - step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step1[7] = step2[7]; - - step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); - step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd); - step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd); - step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd); - step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd); - step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd); - step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd); - step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd); - - step1[16] = step2[16]; - step1[17] = step2[17]; - temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; - temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; - step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; - temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; - step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; - temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; - step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; - temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; - step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step1[22] = step2[22]; - step1[23] = step2[23]; - step1[24] = step2[24]; - step1[25] = step2[25]; - step1[30] = step2[30]; - step1[31] = step2[31]; - - // stage 6 - step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); - step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); - step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); - step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); - step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); - step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); - step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); - step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); - step2[8] = step1[8]; - step2[9] = step1[9]; - temp1 = 
(-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step2[14] = step1[14]; - step2[15] = step1[15]; - - step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd); - step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd); - step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd); - step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd); - step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd); - step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd); - step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd); - step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd); - - step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd); - step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd); - step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd); - step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd); - step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd); - step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd); - step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd); - step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd); - - // stage 7 - step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd); - step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd); - step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd); - step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd); - step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd); - step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd); - step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd); - step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd); - step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd); - step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd); - step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd); - step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd); - step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd); - step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd); - step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd); - step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd); - - step1[16] = step2[16]; - step1[17] = step2[17]; - step1[18] = step2[18]; - step1[19] = step2[19]; - temp1 = (-step2[20] + step2[27]) * cospi_16_64; - temp2 = (step2[20] + step2[27]) * cospi_16_64; - step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = (-step2[21] + step2[26]) * cospi_16_64; - temp2 = (step2[21] + step2[26]) * cospi_16_64; - step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = (-step2[22] + step2[25]) * cospi_16_64; - temp2 = (step2[22] + step2[25]) * cospi_16_64; - step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = (-step2[23] + step2[24]) * cospi_16_64; - temp2 = (step2[23] + step2[24]) * cospi_16_64; - step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step1[28] = step2[28]; - step1[29] = step2[29]; - step1[30] = step2[30]; - step1[31] = step2[31]; - - // final stage - output[0] = 
HIGHBD_WRAPLOW(step1[0] + step1[31], bd); - output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd); - output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd); - output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd); - output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd); - output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd); - output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd); - output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd); - output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd); - output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd); - output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd); - output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd); - output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd); - output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd); - output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd); - output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd); - output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd); - output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd); - output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd); - output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd); - output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd); - output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd); - output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd); - output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd); - output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd); - output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd); - output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd); - output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd); - output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd); - output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd); - output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd); - output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd); -} - -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[32 * 32]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[32], temp_out[32]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // Rows - for (i = 0; i < 32; ++i) { - tran_low_t zero_coeff[16]; - for (j = 0; j < 16; ++j) - zero_coeff[j] = input[2 * j] | input[2 * j + 1]; - for (j = 0; j < 8; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 4; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 2; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - - if (zero_coeff[0] | zero_coeff[1]) - highbd_idct32_c(input, outptr, bd); - else - memset(outptr, 0, sizeof(tran_low_t) * 32); - input += 32; - outptr += 32; - } - - // Columns - for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) - temp_in[j] = out[j * 32 + i]; - highbd_idct32_c(temp_in, temp_out, bd); - for (j = 0; j < 32; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } -} - -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[32 * 32] = {0}; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[32], temp_out[32]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // Rows - // Only upper-left 8x8 has non-zero coeff. 
- for (i = 0; i < 8; ++i) { - highbd_idct32_c(input, outptr, bd); - input += 32; - outptr += 32; - } - // Columns - for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) - temp_in[j] = out[j * 32 + i]; - highbd_idct32_c(temp_in, temp_out, bd); - for (j = 0; j < 32; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } -} - -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - int i, j; - int a1; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - tran_low_t out = HIGHBD_WRAPLOW( - highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); - out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); - a1 = ROUND_POWER_OF_TWO(out, 6); - - for (j = 0; j < 32; ++j) { - for (i = 0; i < 32; ++i) - dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); - dest += stride; - } -} -#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/thirdparty/libvpx/vpx_dsp/inv_txfm.h b/thirdparty/libvpx/vpx_dsp/inv_txfm.h deleted file mode 100644 index 9cfe1be3a7..0000000000 --- a/thirdparty/libvpx/vpx_dsp/inv_txfm.h +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VPX_DSP_INV_TXFM_H_ -#define VPX_DSP_INV_TXFM_H_ - -#include <assert.h> - -#include "./vpx_config.h" -#include "vpx_dsp/txfm_common.h" -#include "vpx_ports/mem.h" - -#ifdef __cplusplus -extern "C" { -#endif - -static INLINE tran_high_t check_range(tran_high_t input) { -#if CONFIG_COEFFICIENT_RANGE_CHECKING - // For valid VP9 input streams, intermediate stage coefficients should always - // stay within the range of a signed 16 bit integer. Coefficients can go out - // of this range for invalid/corrupt VP9 streams. However, strictly checking - // this range for every intermediate coefficient can burdensome for a decoder, - // therefore the following assertion is only enabled when configured with - // --enable-coefficient-range-checking. 
- assert(INT16_MIN <= input); - assert(input <= INT16_MAX); -#endif // CONFIG_COEFFICIENT_RANGE_CHECKING - return input; -} - -static INLINE tran_high_t dct_const_round_shift(tran_high_t input) { - tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - return (tran_high_t)rv; -} - -#if CONFIG_VP9_HIGHBITDEPTH -static INLINE tran_high_t highbd_check_range(tran_high_t input, - int bd) { -#if CONFIG_COEFFICIENT_RANGE_CHECKING - // For valid highbitdepth VP9 streams, intermediate stage coefficients will - // stay within the ranges: - // - 8 bit: signed 16 bit integer - // - 10 bit: signed 18 bit integer - // - 12 bit: signed 20 bit integer - const int32_t int_max = (1 << (7 + bd)) - 1; - const int32_t int_min = -int_max - 1; - assert(int_min <= input); - assert(input <= int_max); - (void) int_min; -#endif // CONFIG_COEFFICIENT_RANGE_CHECKING - (void) bd; - return input; -} - -static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) { - tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - return (tran_high_t)rv; -} -#endif // CONFIG_VP9_HIGHBITDEPTH - -#if CONFIG_EMULATE_HARDWARE -// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a -// non-normative method to handle overflows. A stream that causes -// overflows in the inverse transform is considered invalid in VP9, -// and a hardware implementer is free to choose any reasonable -// method to handle overflows. However to aid in hardware -// verification they can use a specific implementation of the -// WRAPLOW() macro below that is identical to their intended -// hardware implementation (and also use configure options to trigger -// the C-implementation of the transform). -// -// The particular WRAPLOW implementation below performs strict -// overflow wrapping to match common hardware implementations. 
-// bd of 8 uses trans_low with 16bits, need to remove 16bits -// bd of 10 uses trans_low with 18bits, need to remove 14bits -// bd of 12 uses trans_low with 20bits, need to remove 12bits -// bd of x uses trans_low with 8+x bits, need to remove 24-x bits - -#define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16) -#if CONFIG_VP9_HIGHBITDEPTH -#define HIGHBD_WRAPLOW(x, bd) \ - ((((int32_t)highbd_check_range((x), bd)) << (24 - bd)) >> (24 - bd)) -#endif // CONFIG_VP9_HIGHBITDEPTH - -#else // CONFIG_EMULATE_HARDWARE - -#define WRAPLOW(x) ((int32_t)check_range(x)) -#if CONFIG_VP9_HIGHBITDEPTH -#define HIGHBD_WRAPLOW(x, bd) \ - ((int32_t)highbd_check_range((x), bd)) -#endif // CONFIG_VP9_HIGHBITDEPTH -#endif // CONFIG_EMULATE_HARDWARE - -void idct4_c(const tran_low_t *input, tran_low_t *output); -void idct8_c(const tran_low_t *input, tran_low_t *output); -void idct16_c(const tran_low_t *input, tran_low_t *output); -void idct32_c(const tran_low_t *input, tran_low_t *output); -void iadst4_c(const tran_low_t *input, tran_low_t *output); -void iadst8_c(const tran_low_t *input, tran_low_t *output); -void iadst16_c(const tran_low_t *input, tran_low_t *output); - -#if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd); -void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd); -void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd); - -void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd); -void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd); -void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd); - -static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans, - int bd) { - trans = HIGHBD_WRAPLOW(trans, bd); - return clip_pixel_highbd(dest + (int)trans, bd); -} -#endif - -static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) { - trans = WRAPLOW(trans); - return clip_pixel(dest + (int)trans); -} -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VPX_DSP_INV_TXFM_H_ diff --git a/thirdparty/libvpx/vpx_dsp/loopfilter.c b/thirdparty/libvpx/vpx_dsp/loopfilter.c deleted file mode 100644 index 645a1ab95e..0000000000 --- a/thirdparty/libvpx/vpx_dsp/loopfilter.c +++ /dev/null @@ -1,767 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <stdlib.h> - -#include "./vpx_config.h" -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/vpx_dsp_common.h" -#include "vpx_ports/mem.h" - -static INLINE int8_t signed_char_clamp(int t) { - return (int8_t)clamp(t, -128, 127); -} - -#if CONFIG_VP9_HIGHBITDEPTH -static INLINE int16_t signed_char_clamp_high(int t, int bd) { - switch (bd) { - case 10: - return (int16_t)clamp(t, -128*4, 128*4-1); - case 12: - return (int16_t)clamp(t, -128*16, 128*16-1); - case 8: - default: - return (int16_t)clamp(t, -128, 128-1); - } -} -#endif - -// should we apply any filter at all: 11111111 yes, 00000000 no -static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, - uint8_t p3, uint8_t p2, - uint8_t p1, uint8_t p0, - uint8_t q0, uint8_t q1, - uint8_t q2, uint8_t q3) { - int8_t mask = 0; - mask |= (abs(p3 - p2) > limit) * -1; - mask |= (abs(p2 - p1) > limit) * -1; - mask |= (abs(p1 - p0) > limit) * -1; - mask |= (abs(q1 - q0) > limit) * -1; - mask |= (abs(q2 - q1) > limit) * -1; - mask |= (abs(q3 - q2) > limit) * -1; - mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - return ~mask; -} - -static INLINE int8_t flat_mask4(uint8_t thresh, - uint8_t p3, uint8_t p2, - uint8_t p1, uint8_t p0, - uint8_t q0, uint8_t q1, - uint8_t q2, uint8_t q3) { - int8_t mask = 0; - mask |= (abs(p1 - p0) > thresh) * -1; - mask |= (abs(q1 - q0) > thresh) * -1; - mask |= (abs(p2 - p0) > thresh) * -1; - mask |= (abs(q2 - q0) > thresh) * -1; - mask |= (abs(p3 - p0) > thresh) * -1; - mask |= (abs(q3 - q0) > thresh) * -1; - return ~mask; -} - -static INLINE int8_t flat_mask5(uint8_t thresh, - uint8_t p4, uint8_t p3, - uint8_t p2, uint8_t p1, - uint8_t p0, uint8_t q0, - uint8_t q1, uint8_t q2, - uint8_t q3, uint8_t q4) { - int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3); - mask |= (abs(p4 - p0) > thresh) * -1; - mask |= (abs(q4 - q0) > thresh) * -1; - return ~mask; -} - -// is there high edge variance internal edge: 11111111 yes, 00000000 no -static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, - uint8_t q0, uint8_t q1) { - int8_t hev = 0; - hev |= (abs(p1 - p0) > thresh) * -1; - hev |= (abs(q1 - q0) > thresh) * -1; - return hev; -} - -static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, - uint8_t *op0, uint8_t *oq0, uint8_t *oq1) { - int8_t filter1, filter2; - - const int8_t ps1 = (int8_t) *op1 ^ 0x80; - const int8_t ps0 = (int8_t) *op0 ^ 0x80; - const int8_t qs0 = (int8_t) *oq0 ^ 0x80; - const int8_t qs1 = (int8_t) *oq1 ^ 0x80; - const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); - - // add outer taps if we have high edge variance - int8_t filter = signed_char_clamp(ps1 - qs1) & hev; - - // inner taps - filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; - - // save bottom 3 bits so that we round one side +4 and the other +3 - // if it equals 4 we'll set to adjust by -1 to account for the fact - // we'd round 3 the other way - filter1 = signed_char_clamp(filter + 4) >> 3; - filter2 = signed_char_clamp(filter + 3) >> 3; - - *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80; - *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80; - - // outer tap adjustments - filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; - - *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80; - *op1 = signed_char_clamp(ps1 + filter) ^ 0x80; -} - -void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh) { - int i; - - // loop filter designed to work using chars so that we can make maximum use - // of 8 bit 
simd instructions. - for (i = 0; i < 8; ++i) { - const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; - const int8_t mask = filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3); - filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p); - ++s; - } -} - -void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0); - vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1); -} - -void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - int i; - - // loop filter designed to work using chars so that we can make maximum use - // of 8 bit simd instructions. - for (i = 0; i < 8; ++i) { - const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; - const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; - const int8_t mask = filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3); - filter4(mask, *thresh, s - 2, s - 1, s, s + 1); - s += pitch; - } -} - -void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - vpx_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0); - vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); -} - -static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, - uint8_t *op3, uint8_t *op2, - uint8_t *op1, uint8_t *op0, - uint8_t *oq0, uint8_t *oq1, - uint8_t *oq2, uint8_t *oq3) { - if (flat && mask) { - const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; - const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; - - // 7-tap filter [1, 1, 1, 2, 1, 1, 1] - *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3); - *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3); - *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3); - *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3); - *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); - *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); - } else { - filter4(mask, thresh, op1, op0, oq0, oq1); - } -} - -void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - int i; - - // loop filter designed to work using chars so that we can make maximum use - // of 8 bit simd instructions. 
- for (i = 0; i < 8; ++i) { - const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; - - const int8_t mask = filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, - s, s + 1 * p, s + 2 * p, s + 3 * p); - ++s; - } -} - -void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0); - vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1); -} - -void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - int i; - - for (i = 0; i < 8; ++i) { - const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; - const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; - const int8_t mask = filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, - s, s + 1, s + 2, s + 3); - s += pitch; - } -} - -void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - vpx_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0); - vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); -} - -static INLINE void filter16(int8_t mask, uint8_t thresh, - uint8_t flat, uint8_t flat2, - uint8_t *op7, uint8_t *op6, - uint8_t *op5, uint8_t *op4, - uint8_t *op3, uint8_t *op2, - uint8_t *op1, uint8_t *op0, - uint8_t *oq0, uint8_t *oq1, - uint8_t *oq2, uint8_t *oq3, - uint8_t *oq4, uint8_t *oq5, - uint8_t *oq6, uint8_t *oq7) { - if (flat2 && flat && mask) { - const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, - p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; - - const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, - q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7; - - // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] - *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + - q0, 4); - *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + - q0 + q1, 4); - *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + - q0 + q1 + q2, 4); - *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + - q0 + q1 + q2 + q3, 4); - *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + - q0 + q1 + q2 + q3 + q4, 4); - *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 + - q0 + q1 + q2 + q3 + q4 + q5, 4); - *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + - q0 + q1 + q2 + q3 + q4 + q5 + q6, 4); - *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + - q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); - *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + - q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4); - *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + - q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4); - *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + - q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4); - *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + - q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 
4); - *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + - q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4); - *oq6 = ROUND_POWER_OF_TWO(p0 + - q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); - } else { - filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); - } -} - -static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, int count) { - int i; - - // loop filter designed to work using chars so that we can make maximum use - // of 8 bit simd instructions. - for (i = 0; i < 8 * count; ++i) { - const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; - const int8_t mask = filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t flat2 = flat_mask5(1, - s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, - q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]); - - filter16(mask, *thresh, flat, flat2, - s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p, - s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, - s, s + 1 * p, s + 2 * p, s + 3 * p, - s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p); - ++s; - } -} - -void vpx_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1); -} - -void vpx_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2); -} - -static void mb_lpf_vertical_edge_w(uint8_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { - int i; - - for (i = 0; i < count; ++i) { - const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; - const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; - const int8_t mask = filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, - q0, s[4], s[5], s[6], s[7]); - - filter16(mask, *thresh, flat, flat2, - s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, - s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7); - s += p; - } -} - -void vpx_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8); -} - -void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16); -} - -#if CONFIG_VP9_HIGHBITDEPTH -// Should we apply any filter at all: 11111111 yes, 00000000 no ? 
-static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, - uint16_t p3, uint16_t p2, - uint16_t p1, uint16_t p0, - uint16_t q0, uint16_t q1, - uint16_t q2, uint16_t q3, int bd) { - int8_t mask = 0; - int16_t limit16 = (uint16_t)limit << (bd - 8); - int16_t blimit16 = (uint16_t)blimit << (bd - 8); - mask |= (abs(p3 - p2) > limit16) * -1; - mask |= (abs(p2 - p1) > limit16) * -1; - mask |= (abs(p1 - p0) > limit16) * -1; - mask |= (abs(q1 - q0) > limit16) * -1; - mask |= (abs(q2 - q1) > limit16) * -1; - mask |= (abs(q3 - q2) > limit16) * -1; - mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; - return ~mask; -} - -static INLINE int8_t highbd_flat_mask4(uint8_t thresh, - uint16_t p3, uint16_t p2, - uint16_t p1, uint16_t p0, - uint16_t q0, uint16_t q1, - uint16_t q2, uint16_t q3, int bd) { - int8_t mask = 0; - int16_t thresh16 = (uint16_t)thresh << (bd - 8); - mask |= (abs(p1 - p0) > thresh16) * -1; - mask |= (abs(q1 - q0) > thresh16) * -1; - mask |= (abs(p2 - p0) > thresh16) * -1; - mask |= (abs(q2 - q0) > thresh16) * -1; - mask |= (abs(p3 - p0) > thresh16) * -1; - mask |= (abs(q3 - q0) > thresh16) * -1; - return ~mask; -} - -static INLINE int8_t highbd_flat_mask5(uint8_t thresh, - uint16_t p4, uint16_t p3, - uint16_t p2, uint16_t p1, - uint16_t p0, uint16_t q0, - uint16_t q1, uint16_t q2, - uint16_t q3, uint16_t q4, int bd) { - int8_t mask = ~highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd); - int16_t thresh16 = (uint16_t)thresh << (bd - 8); - mask |= (abs(p4 - p0) > thresh16) * -1; - mask |= (abs(q4 - q0) > thresh16) * -1; - return ~mask; -} - -// Is there high edge variance internal edge: -// 11111111_11111111 yes, 00000000_00000000 no ? -static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0, - uint16_t q0, uint16_t q1, int bd) { - int16_t hev = 0; - int16_t thresh16 = (uint16_t)thresh << (bd - 8); - hev |= (abs(p1 - p0) > thresh16) * -1; - hev |= (abs(q1 - q0) > thresh16) * -1; - return hev; -} - -static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1, - uint16_t *op0, uint16_t *oq0, uint16_t *oq1, - int bd) { - int16_t filter1, filter2; - // ^0x80 equivalent to subtracting 0x80 from the values to turn them - // into -128 to +127 instead of 0 to 255. - int shift = bd - 8; - const int16_t ps1 = (int16_t)*op1 - (0x80 << shift); - const int16_t ps0 = (int16_t)*op0 - (0x80 << shift); - const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift); - const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift); - const uint16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd); - - // Add outer taps if we have high edge variance. - int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev; - - // Inner taps. - filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask; - - // Save bottom 3 bits so that we round one side +4 and the other +3 - // if it equals 4 we'll set to adjust by -1 to account for the fact - // we'd round 3 the other way. - filter1 = signed_char_clamp_high(filter + 4, bd) >> 3; - filter2 = signed_char_clamp_high(filter + 3, bd) >> 3; - - *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift); - *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift); - - // Outer tap adjustments. 
- filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; - - *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift); - *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift); -} - -void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, int bd) { - int i; - - // loop filter designed to work using chars so that we can make maximum use - // of 8 bit simd instructions. - for (i = 0; i < 8; ++i) { - const uint16_t p3 = s[-4 * p]; - const uint16_t p2 = s[-3 * p]; - const uint16_t p1 = s[-2 * p]; - const uint16_t p0 = s[-p]; - const uint16_t q0 = s[0 * p]; - const uint16_t q1 = s[1 * p]; - const uint16_t q2 = s[2 * p]; - const uint16_t q3 = s[3 * p]; - const int8_t mask = highbd_filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3, bd); - highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd); - ++s; - } -} - -void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1, - int bd) { - vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd); - vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd); -} - -void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { - int i; - - // loop filter designed to work using chars so that we can make maximum use - // of 8 bit simd instructions. - for (i = 0; i < 8; ++i) { - const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; - const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; - const int8_t mask = highbd_filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3, bd); - highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd); - s += pitch; - } -} - -void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1, - int bd) { - vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd); - vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, - thresh1, bd); -} - -static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat, - uint16_t *op3, uint16_t *op2, - uint16_t *op1, uint16_t *op0, - uint16_t *oq0, uint16_t *oq1, - uint16_t *oq2, uint16_t *oq3, int bd) { - if (flat && mask) { - const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; - const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; - - // 7-tap filter [1, 1, 1, 2, 1, 1, 1] - *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3); - *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3); - *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3); - *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3); - *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); - *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); - } else { - highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd); - } -} - -void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { - int i; - - // loop filter designed to work using chars so that we can make maximum use - // of 8 bit simd instructions. 
- for (i = 0; i < 8; ++i) { - const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; - - const int8_t mask = highbd_filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3, bd); - const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, - bd); - highbd_filter8(mask, *thresh, flat, - s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, - s, s + 1 * p, s + 2 * p, s + 3 * p, bd); - ++s; - } -} - -void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1, - int bd) { - vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd); - vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd); -} - -void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { - int i; - - for (i = 0; i < 8; ++i) { - const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; - const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; - const int8_t mask = highbd_filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3, bd); - const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, - bd); - highbd_filter8(mask, *thresh, flat, - s - 4, s - 3, s - 2, s - 1, - s, s + 1, s + 2, s + 3, - bd); - s += pitch; - } -} - -void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1, - int bd) { - vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd); - vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, - thresh1, bd); -} - -static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, - uint8_t flat, uint8_t flat2, - uint16_t *op7, uint16_t *op6, - uint16_t *op5, uint16_t *op4, - uint16_t *op3, uint16_t *op2, - uint16_t *op1, uint16_t *op0, - uint16_t *oq0, uint16_t *oq1, - uint16_t *oq2, uint16_t *oq3, - uint16_t *oq4, uint16_t *oq5, - uint16_t *oq6, uint16_t *oq7, int bd) { - if (flat2 && flat && mask) { - const uint16_t p7 = *op7; - const uint16_t p6 = *op6; - const uint16_t p5 = *op5; - const uint16_t p4 = *op4; - const uint16_t p3 = *op3; - const uint16_t p2 = *op2; - const uint16_t p1 = *op1; - const uint16_t p0 = *op0; - const uint16_t q0 = *oq0; - const uint16_t q1 = *oq1; - const uint16_t q2 = *oq2; - const uint16_t q3 = *oq3; - const uint16_t q4 = *oq4; - const uint16_t q5 = *oq5; - const uint16_t q6 = *oq6; - const uint16_t q7 = *oq7; - - // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] - *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + - q0, 4); - *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + - q0 + q1, 4); - *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + - q0 + q1 + q2, 4); - *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + - q0 + q1 + q2 + q3, 4); - *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + - q0 + q1 + q2 + q3 + q4, 4); - *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 + - q0 + q1 + q2 + q3 + q4 + q5, 4); - *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + - q0 + q1 + q2 + q3 + q4 + q5 + q6, 4); - *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + - 
q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); - *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + - q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4); - *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + - q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4); - *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + - q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4); - *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + - q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4); - *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + - q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4); - *oq6 = ROUND_POWER_OF_TWO(p0 + - q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); - } else { - highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3, - bd); - } -} - -static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count, int bd) { - int i; - - // loop filter designed to work using chars so that we can make maximum use - // of 8 bit simd instructions. - for (i = 0; i < 8 * count; ++i) { - const uint16_t p3 = s[-4 * p]; - const uint16_t p2 = s[-3 * p]; - const uint16_t p1 = s[-2 * p]; - const uint16_t p0 = s[-p]; - const uint16_t q0 = s[0 * p]; - const uint16_t q1 = s[1 * p]; - const uint16_t q2 = s[2 * p]; - const uint16_t q3 = s[3 * p]; - const int8_t mask = highbd_filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3, bd); - const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, - bd); - const int8_t flat2 = highbd_flat_mask5( - 1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, - q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd); - - highbd_filter16(mask, *thresh, flat, flat2, - s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p, - s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, - s, s + 1 * p, s + 2 * p, s + 3 * p, - s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p, - bd); - ++s; - } -} - -void vpx_highbd_lpf_horizontal_edge_8_c(uint16_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, int bd) { - highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd); -} - -void vpx_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, int bd) { - highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd); -} - -static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count, int bd) { - int i; - - for (i = 0; i < count; ++i) { - const uint16_t p3 = s[-4]; - const uint16_t p2 = s[-3]; - const uint16_t p1 = s[-2]; - const uint16_t p0 = s[-1]; - const uint16_t q0 = s[0]; - const uint16_t q1 = s[1]; - const uint16_t q2 = s[2]; - const uint16_t q3 = s[3]; - const int8_t mask = highbd_filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3, bd); - const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, - bd); - const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, - q0, s[4], s[5], s[6], s[7], bd); - - highbd_filter16(mask, *thresh, flat, flat2, - s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, - s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7, - bd); - s += p; - } -} - -void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { - highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd); -} - -void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p, - const uint8_t *blimit, - const uint8_t 
*limit, - const uint8_t *thresh, - int bd) { - highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd); -} -#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/thirdparty/libvpx/vpx_dsp/prob.c b/thirdparty/libvpx/vpx_dsp/prob.c deleted file mode 100644 index 639d24dd2f..0000000000 --- a/thirdparty/libvpx/vpx_dsp/prob.c +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./prob.h" - -const uint8_t vpx_norm[256] = { - 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -static unsigned int tree_merge_probs_impl(unsigned int i, - const vpx_tree_index *tree, - const vpx_prob *pre_probs, - const unsigned int *counts, - vpx_prob *probs) { - const int l = tree[i]; - const unsigned int left_count = (l <= 0) - ? counts[-l] - : tree_merge_probs_impl(l, tree, pre_probs, counts, probs); - const int r = tree[i + 1]; - const unsigned int right_count = (r <= 0) - ? counts[-r] - : tree_merge_probs_impl(r, tree, pre_probs, counts, probs); - const unsigned int ct[2] = { left_count, right_count }; - probs[i >> 1] = mode_mv_merge_probs(pre_probs[i >> 1], ct); - return left_count + right_count; -} - -void vpx_tree_merge_probs(const vpx_tree_index *tree, const vpx_prob *pre_probs, - const unsigned int *counts, vpx_prob *probs) { - tree_merge_probs_impl(0, tree, pre_probs, counts, probs); -} diff --git a/thirdparty/libvpx/vpx_dsp/prob.h b/thirdparty/libvpx/vpx_dsp/prob.h deleted file mode 100644 index c3cb103ffb..0000000000 --- a/thirdparty/libvpx/vpx_dsp/prob.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef VPX_DSP_PROB_H_ -#define VPX_DSP_PROB_H_ - -#include "./vpx_config.h" -#include "./vpx_dsp_common.h" - -#include "vpx_ports/mem.h" - -#ifdef __cplusplus -extern "C" { -#endif - -typedef uint8_t vpx_prob; - -#define MAX_PROB 255 - -#define vpx_prob_half ((vpx_prob) 128) - -typedef int8_t vpx_tree_index; - -#define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2) - -#define vpx_complement(x) (255 - x) - -#define MODE_MV_COUNT_SAT 20 - -/* We build coding trees compactly in arrays. - Each node of the tree is a pair of vpx_tree_indices. - Array index often references a corresponding probability table. - Index <= 0 means done encoding/decoding and value = -Index, - Index > 0 means need another bit, specification at index. - Nonnegative indices are always even; processing begins at node 0. */ - -typedef const vpx_tree_index vpx_tree[]; - -static INLINE vpx_prob clip_prob(int p) { - return (p > 255) ? 255 : (p < 1) ? 1 : p; -} - -static INLINE vpx_prob get_prob(int num, int den) { - return (den == 0) ? 128u : clip_prob(((int64_t)num * 256 + (den >> 1)) / den); -} - -static INLINE vpx_prob get_binary_prob(int n0, int n1) { - return get_prob(n0, n0 + n1); -} - -/* This function assumes prob1 and prob2 are already within [1,255] range. */ -static INLINE vpx_prob weighted_prob(int prob1, int prob2, int factor) { - return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8); -} - -static INLINE vpx_prob merge_probs(vpx_prob pre_prob, - const unsigned int ct[2], - unsigned int count_sat, - unsigned int max_update_factor) { - const vpx_prob prob = get_binary_prob(ct[0], ct[1]); - const unsigned int count = VPXMIN(ct[0] + ct[1], count_sat); - const unsigned int factor = max_update_factor * count / count_sat; - return weighted_prob(pre_prob, prob, factor); -} - -// MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT; -static const int count_to_update_factor[MODE_MV_COUNT_SAT + 1] = { - 0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64, - 70, 76, 83, 89, 96, 102, 108, 115, 121, 128 -}; - -static INLINE vpx_prob mode_mv_merge_probs(vpx_prob pre_prob, - const unsigned int ct[2]) { - const unsigned int den = ct[0] + ct[1]; - if (den == 0) { - return pre_prob; - } else { - const unsigned int count = VPXMIN(den, MODE_MV_COUNT_SAT); - const unsigned int factor = count_to_update_factor[count]; - const vpx_prob prob = - clip_prob(((int64_t)(ct[0]) * 256 + (den >> 1)) / den); - return weighted_prob(pre_prob, prob, factor); - } -} - -void vpx_tree_merge_probs(const vpx_tree_index *tree, const vpx_prob *pre_probs, - const unsigned int *counts, vpx_prob *probs); - - -DECLARE_ALIGNED(16, extern const uint8_t, vpx_norm[256]); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VPX_DSP_PROB_H_ diff --git a/thirdparty/libvpx/vpx_dsp/txfm_common.h b/thirdparty/libvpx/vpx_dsp/txfm_common.h deleted file mode 100644 index 442e6a57b5..0000000000 --- a/thirdparty/libvpx/vpx_dsp/txfm_common.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef VPX_DSP_TXFM_COMMON_H_ -#define VPX_DSP_TXFM_COMMON_H_ - -#include "vpx_dsp/vpx_dsp_common.h" - -// Constants and Macros used by all idct/dct functions -#define DCT_CONST_BITS 14 -#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) - -#define UNIT_QUANT_SHIFT 2 -#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT) - -// Constants: -// for (int i = 1; i< 32; ++i) -// printf("static const int cospi_%d_64 = %.0f;\n", i, -// round(16384 * cos(i*M_PI/64))); -// Note: sin(k*Pi/64) = cos((32-k)*Pi/64) -static const tran_high_t cospi_1_64 = 16364; -static const tran_high_t cospi_2_64 = 16305; -static const tran_high_t cospi_3_64 = 16207; -static const tran_high_t cospi_4_64 = 16069; -static const tran_high_t cospi_5_64 = 15893; -static const tran_high_t cospi_6_64 = 15679; -static const tran_high_t cospi_7_64 = 15426; -static const tran_high_t cospi_8_64 = 15137; -static const tran_high_t cospi_9_64 = 14811; -static const tran_high_t cospi_10_64 = 14449; -static const tran_high_t cospi_11_64 = 14053; -static const tran_high_t cospi_12_64 = 13623; -static const tran_high_t cospi_13_64 = 13160; -static const tran_high_t cospi_14_64 = 12665; -static const tran_high_t cospi_15_64 = 12140; -static const tran_high_t cospi_16_64 = 11585; -static const tran_high_t cospi_17_64 = 11003; -static const tran_high_t cospi_18_64 = 10394; -static const tran_high_t cospi_19_64 = 9760; -static const tran_high_t cospi_20_64 = 9102; -static const tran_high_t cospi_21_64 = 8423; -static const tran_high_t cospi_22_64 = 7723; -static const tran_high_t cospi_23_64 = 7005; -static const tran_high_t cospi_24_64 = 6270; -static const tran_high_t cospi_25_64 = 5520; -static const tran_high_t cospi_26_64 = 4756; -static const tran_high_t cospi_27_64 = 3981; -static const tran_high_t cospi_28_64 = 3196; -static const tran_high_t cospi_29_64 = 2404; -static const tran_high_t cospi_30_64 = 1606; -static const tran_high_t cospi_31_64 = 804; - -// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3 -static const tran_high_t sinpi_1_9 = 5283; -static const tran_high_t sinpi_2_9 = 9929; -static const tran_high_t sinpi_3_9 = 13377; -static const tran_high_t sinpi_4_9 = 15212; - -#endif // VPX_DSP_TXFM_COMMON_H_ diff --git a/thirdparty/libvpx/vpx_dsp/vpx_convolve.c b/thirdparty/libvpx/vpx_dsp/vpx_convolve.c deleted file mode 100644 index 2d1c927cbe..0000000000 --- a/thirdparty/libvpx/vpx_dsp/vpx_convolve.c +++ /dev/null @@ -1,612 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <assert.h> -#include <string.h> - -#include "./vpx_config.h" -#include "./vpx_dsp_rtcd.h" -#include "vpx/vpx_integer.h" -#include "vpx_dsp/vpx_convolve.h" -#include "vpx_dsp/vpx_dsp_common.h" -#include "vpx_dsp/vpx_filter.h" -#include "vpx_ports/mem.h" - -static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, - int x0_q4, int x_step_q4, int w, int h) { - int x, y; - src -= SUBPEL_TAPS / 2 - 1; - for (y = 0; y < h; ++y) { - int x_q4 = x0_q4; - for (x = 0; x < w; ++x) { - const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_x[k] * x_filter[k]; - dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - x_q4 += x_step_q4; - } - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, - int x0_q4, int x_step_q4, int w, int h) { - int x, y; - src -= SUBPEL_TAPS / 2 - 1; - for (y = 0; y < h; ++y) { - int x_q4 = x0_q4; - for (x = 0; x < w; ++x) { - const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_x[k] * x_filter[k]; - dst[x] = ROUND_POWER_OF_TWO(dst[x] + - clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); - x_q4 += x_step_q4; - } - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h) { - int x, y; - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - - for (x = 0; x < w; ++x) { - int y_q4 = y0_q4; - for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_y[k * src_stride] * y_filter[k]; - dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - y_q4 += y_step_q4; - } - ++src; - ++dst; - } -} - -static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h) { - int x, y; - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - - for (x = 0; x < w; ++x) { - int y_q4 = y0_q4; - for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_y[k * src_stride] * y_filter[k]; - dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + - clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); - y_q4 += y_step_q4; - } - ++src; - ++dst; - } -} - -static void convolve(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, - int x0_q4, int x_step_q4, - const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, - int w, int h) { - // Note: Fixed size intermediate buffer, temp, places limits on parameters. - // 2d filtering proceeds in 2 steps: - // (1) Interpolate horizontally into an intermediate buffer, temp. - // (2) Interpolate temp vertically to derive the sub-pixel result. 
- // Deriving the maximum number of rows in the temp buffer (135): - // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). - // --Largest block size is 64x64 pixels. - // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the - // original frame (in 1/16th pixel units). - // --Must round-up because block may be located at sub-pixel position. - // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. - // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. - uint8_t temp[135 * 64]; - int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - - assert(w <= 64); - assert(h <= 64); - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); - - convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, - x_filters, x0_q4, x_step_q4, w, intermediate_height); - convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, - y_filters, y0_q4, y_step_q4, w, h); -} - -static const InterpKernel *get_filter_base(const int16_t *filter) { - // NOTE: This assumes that the filter table is 256-byte aligned. - // TODO(agrange) Modify to make independent of table alignment. - return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); -} - -static int get_filter_offset(const int16_t *f, const InterpKernel *base) { - return (int)((const InterpKernel *)(intptr_t)f - base); -} - -void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; - (void)y_step_q4; - - convolve_horiz(src, src_stride, dst, dst_stride, filters_x, - x0_q4, x_step_q4, w, h); -} - -void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; - (void)y_step_q4; - - convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, - x0_q4, x_step_q4, w, h); -} - -void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; - (void)x_step_q4; - - convolve_vert(src, src_stride, dst, dst_stride, filters_y, - y0_q4, y_step_q4, w, h); -} - -void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; - (void)x_step_q4; - - convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, - y0_q4, y_step_q4, w, h); -} - -void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - 
const int x0_q4 = get_filter_offset(filter_x, filters_x); - - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - convolve(src, src_stride, dst, dst_stride, - filters_x, x0_q4, x_step_q4, - filters_y, y0_q4, y_step_q4, w, h); -} - -void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - /* Fixed size intermediate buffer places limits on parameters. */ - DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); - assert(w <= 64); - assert(h <= 64); - - vpx_convolve8_c(src, src_stride, temp, 64, - filter_x, x_step_q4, filter_y, y_step_q4, w, h); - vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h); -} - -void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h) { - int r; - - (void)filter_x; (void)filter_x_stride; - (void)filter_y; (void)filter_y_stride; - - for (r = h; r > 0; --r) { - memcpy(dst, src, w); - src += src_stride; - dst += dst_stride; - } -} - -void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h) { - int x, y; - - (void)filter_x; (void)filter_x_stride; - (void)filter_y; (void)filter_y_stride; - - for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) - dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); - - src += src_stride; - dst += dst_stride; - } -} - -void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); -} - -void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); -} - -void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); -} - -void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); -} - -void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); -} - -void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - 
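get_filter_base() and get_filter_offset() above rely on each 16-entry kernel table being 256-byte aligned (16 kernels x 8 taps x 2 bytes = 256 bytes), so masking off the low 8 address bits of any row pointer recovers the table base, and the pointer difference gives the phase. A hedged illustration with a dummy aligned table (the GCC/Clang alignment attribute and the table itself are stand-ins; the real tables live elsewhere in libvpx):

#include <stdint.h>
#include <stdio.h>

#define SUBPEL_TAPS 8
#define SUBPEL_SHIFTS 16
typedef int16_t InterpKernel[SUBPEL_TAPS];

/* Dummy 256-byte-aligned table: 16 kernels x 8 taps x sizeof(int16_t). */
static InterpKernel table[SUBPEL_SHIFTS] __attribute__((aligned(256)));

static const InterpKernel *filter_base(const int16_t *f) {
  /* Clear the low 8 address bits to land back on the table start. */
  return (const InterpKernel *)(((intptr_t)f) & ~((intptr_t)0xFF));
}

static int filter_offset(const int16_t *f, const InterpKernel *base) {
  /* Pointer difference in whole kernels (16 bytes each) is the phase. */
  return (int)((const InterpKernel *)(intptr_t)f - base);
}

int main(void) {
  const int16_t *row7 = table[7];        /* callers pass one kernel row */
  const InterpKernel *base = filter_base(row7);
  printf("offset = %d\n", filter_offset(row7, base));   /* prints 7 */
  return (base == (const InterpKernel *)table) ? 0 : 1;
}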
vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); -} - -#if CONFIG_VP9_HIGHBITDEPTH -static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const InterpKernel *x_filters, - int x0_q4, int x_step_q4, - int w, int h, int bd) { - int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - src -= SUBPEL_TAPS / 2 - 1; - for (y = 0; y < h; ++y) { - int x_q4 = x0_q4; - for (x = 0; x < w; ++x) { - const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_x[k] * x_filter[k]; - dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); - x_q4 += x_step_q4; - } - src += src_stride; - dst += dst_stride; - } -} - -static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const InterpKernel *x_filters, - int x0_q4, int x_step_q4, - int w, int h, int bd) { - int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - src -= SUBPEL_TAPS / 2 - 1; - for (y = 0; y < h; ++y) { - int x_q4 = x0_q4; - for (x = 0; x < w; ++x) { - const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_x[k] * x_filter[k]; - dst[x] = ROUND_POWER_OF_TWO(dst[x] + - clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1); - x_q4 += x_step_q4; - } - src += src_stride; - dst += dst_stride; - } -} - -static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h, - int bd) { - int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - for (x = 0; x < w; ++x) { - int y_q4 = y0_q4; - for (y = 0; y < h; ++y) { - const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_y[k * src_stride] * y_filter[k]; - dst[y * dst_stride] = clip_pixel_highbd( - ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); - y_q4 += y_step_q4; - } - ++src; - ++dst; - } -} - -static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h, - int bd) { - int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - for (x = 0; x < w; ++x) { - int y_q4 = y0_q4; - for (y = 0; y < h; ++y) { - const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_y[k * src_stride] * y_filter[k]; - dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + - clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1); - y_q4 += y_step_q4; - } - ++src; - ++dst; - } -} - -static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, - int x0_q4, int x_step_q4, - const 
InterpKernel *const y_filters, - int y0_q4, int y_step_q4, - int w, int h, int bd) { - // Note: Fixed size intermediate buffer, temp, places limits on parameters. - // 2d filtering proceeds in 2 steps: - // (1) Interpolate horizontally into an intermediate buffer, temp. - // (2) Interpolate temp vertically to derive the sub-pixel result. - // Deriving the maximum number of rows in the temp buffer (135): - // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). - // --Largest block size is 64x64 pixels. - // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the - // original frame (in 1/16th pixel units). - // --Must round-up because block may be located at sub-pixel position. - // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. - // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. - uint16_t temp[64 * 135]; - int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - - assert(w <= 64); - assert(h <= 64); - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); - - highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, CONVERT_TO_BYTEPTR(temp), 64, - x_filters, x0_q4, x_step_q4, w, - intermediate_height, bd); - highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1), - 64, dst, dst_stride, y_filters, y0_q4, y_step_q4, - w, h, bd); -} - - -void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - (void)filter_y; - (void)y_step_q4; - - highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, - x0_q4, x_step_q4, w, h, bd); -} - -void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - (void)filter_y; - (void)y_step_q4; - - highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, - x0_q4, x_step_q4, w, h, bd); -} - -void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int bd) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - (void)filter_x; - (void)x_step_q4; - - highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, - y0_q4, y_step_q4, w, h, bd); -} - -void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int bd) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - (void)filter_x; - (void)x_step_q4; - - highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, - y0_q4, y_step_q4, w, h, bd); -} - -void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int 
y_step_q4, - int w, int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - highbd_convolve(src, src_stride, dst, dst_stride, - filters_x, x0_q4, x_step_q4, - filters_y, y0_q4, y_step_q4, w, h, bd); -} - -void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int bd) { - // Fixed size intermediate buffer places limits on parameters. - DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]); - assert(w <= 64); - assert(h <= 64); - - vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64, - filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); - vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride, - NULL, 0, NULL, 0, w, h, bd); -} - -void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h, int bd) { - int r; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; - (void)bd; - - for (r = h; r > 0; --r) { - memcpy(dst, src, w * sizeof(uint16_t)); - src += src_stride; - dst += dst_stride; - } -} - -void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h, int bd) { - int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; - (void)bd; - - for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) { - dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); - } - src += src_stride; - dst += dst_stride; - } -} -#endif diff --git a/thirdparty/libvpx/vpx_dsp/vpx_convolve.h b/thirdparty/libvpx/vpx_dsp/vpx_convolve.h deleted file mode 100644 index 9ed3f1750f..0000000000 --- a/thirdparty/libvpx/vpx_dsp/vpx_convolve.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
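The *_avg_* variants above blend the filtered result into the destination with ROUND_POWER_OF_TWO(dst + src, 1), a rounded average. The macro itself is not shown in this diff; the definition below is the usual rounded-right-shift form and is an assumption rather than a quote from the tree:

#include <stdint.h>
#include <stddef.h>

/* Assumed definition: add half of the divisor, then shift (round to nearest). */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* Per-pixel blend used by the *_avg_* paths: (dst + src + 1) >> 1. */
static void avg_blend(const uint8_t *src, ptrdiff_t src_stride,
                      uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x)
      dst[x] = (uint8_t)ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
    src += src_stride;
    dst += dst_stride;
  }
}

For example, ROUND_POWER_OF_TWO(3 + 4, 1) is 4, matching the (a + b + 1) >> 1 behaviour of the SIMD pavg instructions used in the assembly later in this diff.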
- */ -#ifndef VPX_DSP_VPX_CONVOLVE_H_ -#define VPX_DSP_VPX_CONVOLVE_H_ - -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" - -#ifdef __cplusplus -extern "C" { -#endif - -typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h); - -#if CONFIG_VP9_HIGHBITDEPTH -typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int bd); -#endif - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VPX_DSP_VPX_CONVOLVE_H_ diff --git a/thirdparty/libvpx/vpx_dsp/vpx_dsp_common.h b/thirdparty/libvpx/vpx_dsp/vpx_dsp_common.h deleted file mode 100644 index a1d0a51ef5..0000000000 --- a/thirdparty/libvpx/vpx_dsp/vpx_dsp_common.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VPX_DSP_VPX_DSP_COMMON_H_ -#define VPX_DSP_VPX_DSP_COMMON_H_ - -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" -#include "vpx_ports/mem.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#define VPXMIN(x, y) (((x) < (y)) ? (x) : (y)) -#define VPXMAX(x, y) (((x) > (y)) ? (x) : (y)) - -#if CONFIG_VP9_HIGHBITDEPTH -// Note: -// tran_low_t is the datatype used for final transform coefficients. -// tran_high_t is the datatype used for intermediate transform stages. -typedef int64_t tran_high_t; -typedef int32_t tran_low_t; -#else -// Note: -// tran_low_t is the datatype used for final transform coefficients. -// tran_high_t is the datatype used for intermediate transform stages. -typedef int32_t tran_high_t; -typedef int16_t tran_low_t; -#endif // CONFIG_VP9_HIGHBITDEPTH - -static INLINE uint8_t clip_pixel(int val) { - return (val > 255) ? 255 : (val < 0) ? 0 : val; -} - -static INLINE int clamp(int value, int low, int high) { - return value < low ? low : (value > high ? high : value); -} - -static INLINE double fclamp(double value, double low, double high) { - return value < low ? low : (value > high ? high : value); -} - -#if CONFIG_VP9_HIGHBITDEPTH -static INLINE uint16_t clip_pixel_highbd(int val, int bd) { - switch (bd) { - case 8: - default: - return (uint16_t)clamp(val, 0, 255); - case 10: - return (uint16_t)clamp(val, 0, 1023); - case 12: - return (uint16_t)clamp(val, 0, 4095); - } -} -#endif // CONFIG_VP9_HIGHBITDEPTH - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VPX_DSP_VPX_DSP_COMMON_H_ diff --git a/thirdparty/libvpx/vpx_dsp/vpx_dsp_rtcd.c b/thirdparty/libvpx/vpx_dsp/vpx_dsp_rtcd.c deleted file mode 100644 index 5fe27b614b..0000000000 --- a/thirdparty/libvpx/vpx_dsp/vpx_dsp_rtcd.c +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#include "./vpx_config.h" -#define RTCD_C -#include "./vpx_dsp_rtcd.h" -#include "vpx_ports/vpx_once.h" - -void vpx_dsp_rtcd() { - once(setup_rtcd_internal); -} diff --git a/thirdparty/libvpx/vpx_dsp/vpx_filter.h b/thirdparty/libvpx/vpx_dsp/vpx_filter.h deleted file mode 100644 index 2617febf3b..0000000000 --- a/thirdparty/libvpx/vpx_dsp/vpx_filter.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VPX_DSP_VPX_FILTER_H_ -#define VPX_DSP_VPX_FILTER_H_ - -#include "vpx/vpx_integer.h" - - -#ifdef __cplusplus -extern "C" { -#endif - -#define FILTER_BITS 7 - -#define SUBPEL_BITS 4 -#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1) -#define SUBPEL_SHIFTS (1 << SUBPEL_BITS) -#define SUBPEL_TAPS 8 - -typedef int16_t InterpKernel[SUBPEL_TAPS]; - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VPX_DSP_VPX_FILTER_H_ diff --git a/thirdparty/libvpx/vpx_dsp/x86/convolve.h b/thirdparty/libvpx/vpx_dsp/x86/convolve.h deleted file mode 100644 index 7e43eb7c72..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/convolve.h +++ /dev/null @@ -1,274 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
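vpx_filter.h above pins down the fixed-point geometry used by every convolve loop in this diff: positions are tracked in q4 (1/16-pel) units, x_q4 >> SUBPEL_BITS selects the integer source pixel, x_q4 & SUBPEL_MASK selects one of the 16 filter phases, and the taps of each kernel sum to 1 << FILTER_BITS. A scalar sketch of one output pixel, mirroring the convolve_horiz loops earlier in this diff (illustrative only; the identity kernel used in main is made up, and clipping to the pixel range is omitted):

#include <stdint.h>

#define FILTER_BITS 7
#define SUBPEL_BITS 4
#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
#define SUBPEL_TAPS 8

typedef int16_t InterpKernel[SUBPEL_TAPS];

/* One output sample at fixed-point position x_q4; the caller advances
 * x_q4 by x_step_q4 per output pixel (16 = no scaling, 32 = x1/2). */
static int filter_one(const uint8_t *src, int x_q4,
                      const InterpKernel *kernels) {
  const uint8_t *src_x = &src[x_q4 >> SUBPEL_BITS];   /* integer position */
  const int16_t *k = kernels[x_q4 & SUBPEL_MASK];     /* phase 0..15      */
  int sum = 0;
  for (int tap = 0; tap < SUBPEL_TAPS; ++tap) sum += src_x[tap] * k[tap];
  /* Taps sum to 1 << FILTER_BITS, so shift back with rounding. */
  return (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
}

int main(void) {
  /* Phase 0 here is an identity filter: a single unit tap at the centre. */
  static const InterpKernel kernels[SUBPEL_SHIFTS] = {
    { 0, 0, 0, 1 << FILTER_BITS, 0, 0, 0, 0 }
  };
  static const uint8_t src[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };
  /* x_q4 = 0: integer position 0, phase 0; the result is the centre sample. */
  return filter_one(src, 0, kernels) == 40 ? 0 : 1;
}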
- */ -#ifndef VPX_DSP_X86_CONVOLVE_H_ -#define VPX_DSP_X86_CONVOLVE_H_ - -#include <assert.h> - -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" -#include "vpx_ports/mem.h" - -typedef void filter8_1dfunction ( - const uint8_t *src_ptr, - ptrdiff_t src_pitch, - uint8_t *output_ptr, - ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter -); - -#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void vpx_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h) { \ - assert(filter[3] != 128); \ - assert(step_q4 == 16); \ - if (filter[0] | filter[1] | filter[2]) { \ - while (w >= 16) { \ - vpx_filter_block1d16_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - if (w == 8) { \ - vpx_filter_block1d8_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - } else if (w == 4) { \ - vpx_filter_block1d4_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - } \ - } else { \ - while (w >= 16) { \ - vpx_filter_block1d16_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - if (w == 8) { \ - vpx_filter_block1d8_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - } else if (w == 4) { \ - vpx_filter_block1d4_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - } \ - } \ -} - -#define FUN_CONV_2D(avg, opt) \ -void vpx_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h) { \ - assert(filter_x[3] != 128); \ - assert(filter_y[3] != 128); \ - assert(w <= 64); \ - assert(h <= 64); \ - assert(x_step_q4 == 16); \ - assert(y_step_q4 == 16); \ - if (filter_x[0] | filter_x[1] | filter_x[2]) { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \ - vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h + 7); \ - vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h); \ - } else { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \ - vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h + 1); \ - vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h); \ - } \ -} - -#if CONFIG_VP9_HIGHBITDEPTH - -typedef void highbd_filter8_1dfunction ( - const uint16_t *src_ptr, - const ptrdiff_t src_pitch, - uint16_t *output_ptr, - ptrdiff_t out_pitch, - unsigned int output_height, - const int16_t *filter, - int bd -); - -#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void vpx_highbd_convolve8_##name##_##opt(const uint8_t *src8, \ - ptrdiff_t src_stride, \ - uint8_t *dst8, \ - ptrdiff_t dst_stride, \ - const int16_t *filter_x, \ - int x_step_q4, \ - const int16_t *filter_y, \ - int y_step_q4, \ - int w, int h, int bd) { \ - if (step_q4 == 16 && filter[3] != 128) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - if 
(filter[0] | filter[1] | filter[2]) { \ - while (w >= 16) { \ - vpx_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } else { \ - while (w >= 16) { \ - vpx_highbd_filter_block1d16_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } \ - } \ - if (w) { \ - vpx_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h, bd); \ - } \ -} - -#define HIGH_FUN_CONV_2D(avg, opt) \ -void vpx_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h, int bd) { \ - assert(w <= 64); \ - assert(h <= 64); \ - if (x_step_q4 == 16 && y_step_q4 == 16) { \ - if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ - vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ - CONVERT_TO_BYTEPTR(fdata2), 64, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h + 7, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \ - 64, dst, dst_stride, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h, bd); \ - } else { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ - vpx_highbd_convolve8_horiz_##opt(src, src_stride, \ - CONVERT_TO_BYTEPTR(fdata2), 64, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h + 1, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \ - dst, dst_stride, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h, bd); \ - } \ - } else { \ - vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, \ - h, bd); \ - } \ -} -#endif // CONFIG_VP9_HIGHBITDEPTH - -#endif // VPX_DSP_X86_CONVOLVE_H_ diff --git a/thirdparty/libvpx/vpx_dsp/x86/intrapred_sse2.asm b/thirdparty/libvpx/vpx_dsp/x86/intrapred_sse2.asm deleted file mode 100644 index cd6a6ae982..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/intrapred_sse2.asm +++ /dev/null @@ -1,860 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
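FUN_CONV_2D above composes two optimized 1-D passes: filter horizontally into a 64-wide scratch buffer over h + 7 rows (3 above and 4 below the block, the support of the 8-tap vertical filter), then filter that scratch vertically starting at row 3. A plain-C outline of the same control flow; horiz_pass and vert_pass are placeholder prototypes standing in for the SIMD kernels, so this is a sketch of the call structure rather than a linkable program:

#include <stdint.h>
#include <stddef.h>

/* Placeholders for the optimized 1-D kernels the macro stitches together. */
void horiz_pass(const uint8_t *src, ptrdiff_t src_stride,
                uint8_t *dst, ptrdiff_t dst_stride, int w, int h);
void vert_pass(const uint8_t *src, ptrdiff_t src_stride,
               uint8_t *dst, ptrdiff_t dst_stride, int w, int h);

void convolve8_2d_outline(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
  /* 64 columns x (64 + 7) rows: the largest block plus the vertical
   * filter's 3-above / 4-below support, as in the macro's fdata2[64 * 71]. */
  uint8_t fdata2[64 * 71];
  /* Start 3 rows above the block so the vertical pass has full support. */
  horiz_pass(src - 3 * src_stride, src_stride, fdata2, 64, w, h + 7);
  /* The vertical pass reads from row 3 of the scratch, the block's own rows. */
  vert_pass(fdata2 + 3 * 64, 64, dst, dst_stride, w, h);
}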
-; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pb_1: times 16 db 1 -pw_4: times 8 dw 4 -pw_8: times 8 dw 8 -pw_16: times 8 dw 16 -pw_32: times 8 dw 32 -dc_128: times 16 db 128 -pw2_4: times 8 dw 2 -pw2_8: times 8 dw 4 -pw2_16: times 8 dw 8 -pw2_32: times 8 dw 16 - -SECTION .text - -; ------------------------------------------ -; input: x, y, z, result -; -; trick from pascal -; (x+2y+z+2)>>2 can be calculated as: -; result = avg(x,z) -; result -= xor(x,z) & 1 -; result = avg(result,y) -; ------------------------------------------ -%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 - pavgb %4, %1, %3 - pxor %3, %1 - pand %3, [GLOBAL(pb_1)] - psubb %4, %3 - pavgb %4, %2 -%endmacro - -INIT_XMM sse2 -cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset - GET_GOT goffsetq - - movq m0, [aboveq] - DEFINE_ARGS dst, stride, temp - psrldq m1, m0, 1 - psrldq m2, m0, 2 - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 - - ; store 4 lines - movd [dstq ], m3 - psrlq m3, 8 - movd [dstq+strideq ], m3 - lea dstq, [dstq+strideq*2] - psrlq m3, 8 - movd [dstq ], m3 - psrlq m3, 8 - movd [dstq+strideq ], m3 - psrlq m0, 56 - movd tempq, m0 - mov [dstq+strideq+3], tempb - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset - GET_GOT goffsetq - - movu m1, [aboveq] - pslldq m0, m1, 1 - psrldq m2, m1, 1 - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 - punpckhbw m0, m0 ; 7 7 - punpcklwd m0, m0 ; 7 7 7 7 - punpckldq m0, m0 ; 7 7 7 7 7 7 7 7 - punpcklqdq m3, m0 ; -1 0 1 2 3 4 5 6 7 7 7 7 7 7 7 7 - - ; store 4 lines - psrldq m3, 1 - movq [dstq ], m3 - psrldq m3, 1 - movq [dstq+strideq ], m3 - psrldq m3, 1 - movq [dstq+strideq*2], m3 - psrldq m3, 1 - movq [dstq+stride3q ], m3 - lea dstq, [dstq+strideq*4] - - ; store next 4 lines - psrldq m3, 1 - movq [dstq ], m3 - psrldq m3, 1 - movq [dstq+strideq ], m3 - psrldq m3, 1 - movq [dstq+strideq*2], m3 - psrldq m3, 1 - movq [dstq+stride3q ], m3 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal d207_predictor_4x4, 4, 4, 5, dst, stride, unused, left, goffset - GET_GOT goffsetq - - movd m0, [leftq] ; abcd [byte] - punpcklbw m4, m0, m0 ; aabb ccdd - punpcklwd m4, m4 ; aaaa bbbb cccc dddd - psrldq m4, 12 ; dddd - punpckldq m0, m4 ; abcd dddd - psrldq m1, m0, 1 ; bcdd - psrldq m2, m0, 2 ; cddd - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; a2bc b2cd c3d d - pavgb m1, m0 ; ab, bc, cd, d [byte] - - punpcklbw m1, m3 ; ab, a2bc, bc, b2cd, cd, c3d, d, d - movd [dstq ], m1 - psrlq m1, 16 ; bc, b2cd, cd, c3d, d, d - movd [dstq+strideq], m1 - - lea dstq, [dstq+strideq*2] - psrlq m1, 16 ; cd, c3d, d, d - movd [dstq ], m1 - movd [dstq+strideq], m4 ; d, d, d, d - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - movd m2, [leftq] - movd m0, [aboveq] - pxor m1, m1 - punpckldq m0, m2 - psadbw m0, m1 - paddw m0, [GLOBAL(pw_4)] - psraw m0, 3 - pshuflw m0, m0, 0x0 - packuswb m0, m0 - movd [dstq ], m0 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - movd [dstq ], m0 - movd [dstq+strideq], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset - movifnidn leftq, leftmp - GET_GOT goffsetq - - pxor m1, m1 - movd m0, [leftq] - psadbw m0, m1 - paddw m0, [GLOBAL(pw2_4)] - psraw m0, 2 - pshuflw m0, m0, 0x0 - packuswb m0, m0 - movd [dstq ], m0 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - movd [dstq ], m0 - movd 
[dstq+strideq], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - movd m0, [aboveq] - psadbw m0, m1 - paddw m0, [GLOBAL(pw2_4)] - psraw m0, 2 - pshuflw m0, m0, 0x0 - packuswb m0, m0 - movd [dstq ], m0 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - movd [dstq ], m0 - movd [dstq+strideq], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - movq m0, [aboveq] - movq m2, [leftq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - paddw m0, [GLOBAL(pw_8)] - psraw m0, 4 - punpcklbw m0, m0 - pshuflw m0, m0, 0x0 - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - movq m0, [aboveq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - psadbw m0, m1 - paddw m0, [GLOBAL(pw2_8)] - psraw m0, 3 - punpcklbw m0, m0 - pshuflw m0, m0, 0x0 - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset - movifnidn leftq, leftmp - GET_GOT goffsetq - - pxor m1, m1 - movq m0, [leftq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - psadbw m0, m1 - paddw m0, [GLOBAL(pw2_8)] - psraw m0, 3 - punpcklbw m0, m0 - pshuflw m0, m0, 0x0 - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset - GET_GOT goffsetq - - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - movd m0, [GLOBAL(dc_128)] - movd [dstq ], m0 - movd [dstq+strideq ], m0 - movd [dstq+strideq*2], m0 - movd [dstq+stride3q ], m0 - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset - GET_GOT goffsetq - - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - movq m0, [GLOBAL(dc_128)] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m2, [leftq] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw_16)] - psraw m0, 5 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - 
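The X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 macro above leans on a rounding identity: pavgb computes (a + b + 1) >> 1, which rounds up, so subtracting (x ^ z) & 1 from avg(x, z) recovers the floor (x + z) >> 1, and a second pavgb with y then lands exactly on (x + 2y + z + 2) >> 2. A scalar C check of that identity:

#include <assert.h>
#include <stdint.h>

/* Scalar model of SSE2's pavgb: rounded-up byte average. */
static uint8_t pavgb(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}

/* Scalar model of the X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 macro. */
static uint8_t avg3(uint8_t x, uint8_t y, uint8_t z) {
  uint8_t t = pavgb(x, z);            /* (x + z + 1) >> 1, rounds up     */
  t = (uint8_t)(t - ((x ^ z) & 1));   /* undo the rounding: (x + z) >> 1 */
  return pavgb(t, y);                 /* (((x + z) >> 1) + y + 1) >> 1   */
}

int main(void) {
  /* Exhaustively confirm avg3 == (x + 2*y + z + 2) >> 2 for all byte triples. */
  for (int x = 0; x < 256; ++x)
    for (int y = 0; y < 256; ++y)
      for (int z = 0; z < 256; ++z)
        assert(avg3((uint8_t)x, (uint8_t)y, (uint8_t)z) ==
               (uint8_t)((x + 2 * y + z + 2) >> 2));
  return 0;
}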
jnz .loop - - RESTORE_GOT - REP_RET - - -INIT_XMM sse2 -cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - psadbw m0, m1 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw2_16)] - psraw m0, 4 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [leftq] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - psadbw m0, m1 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw2_16)] - psraw m0, 4 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - mova m0, [GLOBAL(dc_128)] -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - RESTORE_GOT - RET - - -INIT_XMM sse2 -cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m2, [aboveq+16] - mova m3, [leftq] - mova m4, [leftq+16] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - psadbw m0, m1 - psadbw m2, m1 - psadbw m3, m1 - psadbw m4, m1 - paddw m0, m2 - paddw m0, m3 - paddw m0, m4 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw_32)] - psraw m0, 6 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m0 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m2, [aboveq+16] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw2_32)] - psraw m0, 5 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m0 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [leftq] - mova m2, [leftq+16] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - movhlps m2, m0 - paddw m0, m2 - paddw m0, 
[GLOBAL(pw2_32)] - psraw m0, 5 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m0 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - mova m0, [GLOBAL(dc_128)] -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m0 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above - movd m0, [aboveq] - movd [dstq ], m0 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - movd [dstq ], m0 - movd [dstq+strideq], m0 - RET - -INIT_XMM sse2 -cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above - movq m0, [aboveq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - RET - -INIT_XMM sse2 -cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above - mova m0, [aboveq] - DEFINE_ARGS dst, stride, stride3, nlines4 - lea stride3q, [strideq*3] - mov nlines4d, 4 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec nlines4d - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above - mova m0, [aboveq] - mova m1, [aboveq+16] - DEFINE_ARGS dst, stride, stride3, nlines4 - lea stride3q, [strideq*3] - mov nlines4d, 8 -.loop: - mova [dstq ], m0 - mova [dstq +16], m1 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m1 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m1 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m1 - lea dstq, [dstq+strideq*4] - dec nlines4d - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left - movifnidn leftq, leftmp - movd m0, [leftq] - punpcklbw m0, m0 - punpcklbw m0, m0 - pshufd m1, m0, 0x1 - movd [dstq ], m0 - movd [dstq+strideq], m1 - pshufd m2, m0, 0x2 - lea dstq, [dstq+strideq*2] - pshufd m3, m0, 0x3 - movd [dstq ], m2 - movd [dstq+strideq], m3 - RET - -INIT_XMM sse2 -cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left - movifnidn leftq, leftmp - mov lineq, -2 - DEFINE_ARGS dst, stride, line, left, stride3 - lea stride3q, [strideq*3] - movq m0, [leftq ] - punpcklbw m0, m0 ; l1 l1 l2 l2 ... 
l8 l8 -.loop: - pshuflw m1, m0, 0x0 ; l1 l1 l1 l1 l1 l1 l1 l1 - pshuflw m2, m0, 0x55 ; l2 l2 l2 l2 l2 l2 l2 l2 - movq [dstq ], m1 - movq [dstq+strideq], m2 - pshuflw m1, m0, 0xaa - pshuflw m2, m0, 0xff - movq [dstq+strideq*2], m1 - movq [dstq+stride3q ], m2 - pshufd m0, m0, 0xe ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8 - inc lineq - lea dstq, [dstq+strideq*4] - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left - movifnidn leftq, leftmp - mov lineq, -4 - DEFINE_ARGS dst, stride, line, left, stride3 - lea stride3q, [strideq*3] -.loop: - movd m0, [leftq] - punpcklbw m0, m0 - punpcklbw m0, m0 ; l1 to l4 each repeated 4 times - pshufd m1, m0, 0x0 ; l1 repeated 16 times - pshufd m2, m0, 0x55 ; l2 repeated 16 times - mova [dstq ], m1 - mova [dstq+strideq ], m2 - pshufd m1, m0, 0xaa - pshufd m2, m0, 0xff - mova [dstq+strideq*2], m1 - mova [dstq+stride3q ], m2 - inc lineq - lea leftq, [leftq+4 ] - lea dstq, [dstq+strideq*4] - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left - movifnidn leftq, leftmp - mov lineq, -8 - DEFINE_ARGS dst, stride, line, left, stride3 - lea stride3q, [strideq*3] -.loop: - movd m0, [leftq] - punpcklbw m0, m0 - punpcklbw m0, m0 ; l1 to l4 each repeated 4 times - pshufd m1, m0, 0x0 ; l1 repeated 16 times - pshufd m2, m0, 0x55 ; l2 repeated 16 times - mova [dstq ], m1 - mova [dstq+16 ], m1 - mova [dstq+strideq ], m2 - mova [dstq+strideq+16 ], m2 - pshufd m1, m0, 0xaa - pshufd m2, m0, 0xff - mova [dstq+strideq*2 ], m1 - mova [dstq+strideq*2+16], m1 - mova [dstq+stride3q ], m2 - mova [dstq+stride3q+16 ], m2 - inc lineq - lea leftq, [leftq+4 ] - lea dstq, [dstq+strideq*4] - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left - pxor m1, m1 - movq m0, [aboveq-1]; [63:0] tl t1 t2 t3 t4 x x x - punpcklbw m0, m1 - pshuflw m2, m0, 0x0 ; [63:0] tl tl tl tl [word] - psrldq m0, 2 - psubw m0, m2 ; [63:0] t1-tl t2-tl t3-tl t4-tl [word] - movd m2, [leftq] - punpcklbw m2, m1 - pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word] - pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word] - paddw m4, m0 - paddw m3, m0 - packuswb m4, m4 - packuswb m3, m3 - movd [dstq ], m4 - movd [dstq+strideq], m3 - lea dstq, [dstq+strideq*2] - pshuflw m4, m2, 0xaa - pshuflw m3, m2, 0xff - paddw m4, m0 - paddw m3, m0 - packuswb m4, m4 - packuswb m3, m3 - movd [dstq ], m4 - movd [dstq+strideq], m3 - RET - -INIT_XMM sse2 -cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left - pxor m1, m1 - movd m2, [aboveq-1] - movq m0, [aboveq] - punpcklbw m2, m1 - punpcklbw m0, m1 ; t1 t2 t3 t4 t5 t6 t7 t8 [word] - pshuflw m2, m2, 0x0 ; [63:0] tl tl tl tl [word] - DEFINE_ARGS dst, stride, line, left - mov lineq, -4 - punpcklqdq m2, m2 ; tl tl tl tl tl tl tl tl [word] - psubw m0, m2 ; t1-tl t2-tl ... t8-tl [word] - movq m2, [leftq] - punpcklbw m2, m1 ; l1 l2 l3 l4 l5 l6 l7 l8 [word] -.loop - pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word] - pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word] - punpcklqdq m4, m4 ; l1 l1 l1 l1 l1 l1 l1 l1 [word] - punpcklqdq m3, m3 ; l2 l2 l2 l2 l2 l2 l2 l2 [word] - paddw m4, m0 - paddw m3, m0 - packuswb m4, m3 - movq [dstq ], m4 - movhps [dstq+strideq], m4 - lea dstq, [dstq+strideq*2] - psrldq m2, 4 - inc lineq - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal tm_predictor_16x16, 4, 5, 8, dst, stride, above, left - pxor m1, m1 - mova m2, [aboveq-16]; - mova m0, [aboveq] ; t1 t2 ... 
t16 [byte] - punpckhbw m2, m1 ; [127:112] tl [word] - punpckhbw m4, m0, m1 - punpcklbw m0, m1 ; m0:m4 t1 t2 ... t16 [word] - DEFINE_ARGS dst, stride, line, left, stride8 - mov lineq, -8 - pshufhw m2, m2, 0xff - mova m3, [leftq] ; l1 l2 ... l16 [byte] - punpckhqdq m2, m2 ; tl repeated 8 times [word] - psubw m0, m2 - psubw m4, m2 ; m0:m4 t1-tl t2-tl ... t16-tl [word] - punpckhbw m5, m3, m1 - punpcklbw m3, m1 ; m3:m5 l1 l2 ... l16 [word] - lea stride8q, [strideq*8] -.loop: - pshuflw m6, m3, 0x0 - pshuflw m7, m5, 0x0 - punpcklqdq m6, m6 ; l1 repeated 8 times [word] - punpcklqdq m7, m7 ; l8 repeated 8 times [word] - paddw m1, m6, m0 - paddw m6, m4 ; m1:m6 ti-tl+l1 [i=1,15] [word] - psrldq m5, 2 - packuswb m1, m6 - mova [dstq ], m1 - paddw m1, m7, m0 - paddw m7, m4 ; m1:m7 ti-tl+l8 [i=1,15] [word] - psrldq m3, 2 - packuswb m1, m7 - mova [dstq+stride8q], m1 - inc lineq - lea dstq, [dstq+strideq] - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left - pxor m1, m1 - movd m2, [aboveq-1] - mova m0, [aboveq] - mova m4, [aboveq+16] - punpcklbw m2, m1 - punpckhbw m3, m0, m1 - punpckhbw m5, m4, m1 - punpcklbw m0, m1 - punpcklbw m4, m1 - pshuflw m2, m2, 0x0 - DEFINE_ARGS dst, stride, line, left - mov lineq, -16 - punpcklqdq m2, m2 - add leftq, 32 - psubw m0, m2 - psubw m3, m2 - psubw m4, m2 - psubw m5, m2 -.loop: - movd m2, [leftq+lineq*2] - pxor m1, m1 - punpcklbw m2, m1 - pshuflw m7, m2, 0x55 - pshuflw m2, m2, 0x0 - punpcklqdq m2, m2 - punpcklqdq m7, m7 - paddw m6, m2, m3 - paddw m1, m2, m0 - packuswb m1, m6 - mova [dstq ], m1 - paddw m6, m2, m5 - paddw m1, m2, m4 - packuswb m1, m6 - mova [dstq+16 ], m1 - paddw m6, m7, m3 - paddw m1, m7, m0 - packuswb m1, m6 - mova [dstq+strideq ], m1 - paddw m6, m7, m5 - paddw m1, m7, m4 - packuswb m1, m6 - mova [dstq+strideq+16], m1 - lea dstq, [dstq+strideq*2] - inc lineq - jnz .loop - REP_RET diff --git a/thirdparty/libvpx/vpx_dsp/x86/intrapred_ssse3.asm b/thirdparty/libvpx/vpx_dsp/x86/intrapred_ssse3.asm deleted file mode 100644 index 5e0139fa8d..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/intrapred_ssse3.asm +++ /dev/null @@ -1,871 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
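For orientation, the assembly above and below implements the VP9 intra predictors; a compact scalar reference for the four simplest ones follows (my own sketch under the usual definitions, not the libvpx C code): V copies the row above, H replicates each left pixel across its row, DC fills with the rounded mean of the above and left borders, and TM extrapolates left[r] + above[c] minus the top-left pixel, clipped to the byte range.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

static uint8_t clip_pixel(int v) {
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

/* V: every row is a copy of the row above the block. */
void v_pred(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above) {
  for (int r = 0; r < bs; ++r) memcpy(dst + r * stride, above, (size_t)bs);
}

/* H: every row is the corresponding left pixel, replicated. */
void h_pred(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *left) {
  for (int r = 0; r < bs; ++r) memset(dst + r * stride, left[r], (size_t)bs);
}

/* DC: rounded mean of the 2*bs border pixels; for bs = 16 this is
 * (sum + 16) >> 5, matching the pw_16 / psraw 5 pair in dc_predictor_16x16. */
void dc_pred(uint8_t *dst, ptrdiff_t stride, int bs,
             const uint8_t *above, const uint8_t *left) {
  int sum = 0;
  for (int i = 0; i < bs; ++i) sum += above[i] + left[i];
  const uint8_t dc = (uint8_t)((sum + bs) / (2 * bs));
  for (int r = 0; r < bs; ++r) memset(dst + r * stride, dc, (size_t)bs);
}

/* TM ("TrueMotion"): dst[r][c] = clip(left[r] + above[c] - above[-1]), where
 * above[-1] is the top-left neighbour loaded via [aboveq-1] in tm_predictor. */
void tm_pred(uint8_t *dst, ptrdiff_t stride, int bs,
             const uint8_t *above, const uint8_t *left) {
  const int tl = above[-1];
  for (int r = 0; r < bs; ++r)
    for (int c = 0; c < bs; ++c)
      dst[r * stride + c] = clip_pixel(left[r] + above[c] - tl);
}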
-; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA - -pb_1: times 16 db 1 -sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 -sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 -sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 -sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 -sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15 -sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0 -sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 -sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 - -SECTION .text - -INIT_XMM ssse3 -cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset - GET_GOT goffsetq - - mova m0, [aboveq] - DEFINE_ARGS dst, stride, stride3, dst8, line - lea stride3q, [strideq*3] - lea dst8q, [dstq+strideq*8] - mova m1, [GLOBAL(sh_b123456789abcdeff)] - pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] - pavgb m3, m2, m0 - pxor m2, m0 - pshufb m0, m1 - pand m2, [GLOBAL(pb_1)] - psubb m3, m2 - pavgb m0, m3 - - ; first 4 lines and first half of 3rd 4 lines - mov lined, 2 -.loop: - mova [dstq ], m0 - movhps [dst8q ], m0 - pshufb m0, m1 - mova [dstq +strideq ], m0 - movhps [dst8q+strideq ], m0 - pshufb m0, m1 - mova [dstq +strideq*2 ], m0 - movhps [dst8q+strideq*2 ], m0 - pshufb m0, m1 - mova [dstq +stride3q ], m0 - movhps [dst8q+stride3q ], m0 - pshufb m0, m1 - lea dstq, [dstq +strideq*4] - lea dst8q, [dst8q+strideq*4] - dec lined - jnz .loop - - ; bottom-right 8x8 block - movhps [dstq +8], m0 - movhps [dstq+strideq +8], m0 - movhps [dstq+strideq*2+8], m0 - movhps [dstq+stride3q +8], m0 - lea dstq, [dstq+strideq*4] - movhps [dstq +8], m0 - movhps [dstq+strideq +8], m0 - movhps [dstq+strideq*2+8], m0 - movhps [dstq+stride3q +8], m0 - - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset - GET_GOT goffsetq - - mova m0, [aboveq] - mova m4, [aboveq+16] - DEFINE_ARGS dst, stride, stride3, dst16, line - lea stride3q, [strideq*3] - lea dst16q, [dstq +strideq*8] - lea dst16q, [dst16q+strideq*8] - mova m1, [GLOBAL(sh_b123456789abcdeff)] - pshufb m2, m4, [GLOBAL(sh_b23456789abcdefff)] - pavgb m3, m2, m4 - pxor m2, m4 - palignr m5, m4, m0, 1 - palignr m6, m4, m0, 2 - pshufb m4, m1 - pand m2, [GLOBAL(pb_1)] - psubb m3, m2 - pavgb m4, m3 - pavgb m3, m0, m6 - pxor m0, m6 - pand m0, [GLOBAL(pb_1)] - psubb m3, m0 - pavgb m5, m3 - - ; write 4x4 lines (and the first half of the second 4x4 lines) - mov lined, 4 -.loop: - mova [dstq ], m5 - mova [dstq +16], m4 - mova [dst16q ], m4 - palignr m3, m4, m5, 1 - pshufb m4, m1 - mova [dstq +strideq ], m3 - mova [dstq +strideq +16], m4 - mova [dst16q+strideq ], m4 - palignr m5, m4, m3, 1 - pshufb m4, m1 - mova [dstq +strideq*2 ], m5 - mova [dstq +strideq*2+16], m4 - mova [dst16q+strideq*2 ], m4 - palignr m3, m4, m5, 1 - pshufb m4, m1 - mova [dstq +stride3q ], m3 - mova [dstq +stride3q +16], m4 - mova [dst16q+stride3q ], m4 - palignr m5, m4, m3, 1 - pshufb m4, m1 - lea dstq, [dstq +strideq*4] - lea dst16q, 
[dst16q+strideq*4] - dec lined - jnz .loop - - ; write second half of second 4x4 lines - mova [dstq +16], m4 - mova [dstq +strideq +16], m4 - mova [dstq +strideq*2+16], m4 - mova [dstq +stride3q +16], m4 - lea dstq, [dstq +strideq*4] - mova [dstq +16], m4 - mova [dstq +strideq +16], m4 - mova [dstq +strideq*2+16], m4 - mova [dstq +stride3q +16], m4 - lea dstq, [dstq +strideq*4] - mova [dstq +16], m4 - mova [dstq +strideq +16], m4 - mova [dstq +strideq*2+16], m4 - mova [dstq +stride3q +16], m4 - lea dstq, [dstq +strideq*4] - mova [dstq +16], m4 - mova [dstq +strideq +16], m4 - mova [dstq +strideq*2+16], m4 - mova [dstq +stride3q +16], m4 - - RESTORE_GOT - RET - -; ------------------------------------------ -; input: x, y, z, result -; -; trick from pascal -; (x+2y+z+2)>>2 can be calculated as: -; result = avg(x,z) -; result -= xor(x,z) & 1 -; result = avg(result,y) -; ------------------------------------------ -%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 - pavgb %4, %1, %3 - pxor %3, %1 - pand %3, [GLOBAL(pb_1)] - psubb %4, %3 - pavgb %4, %2 -%endmacro - -INIT_XMM ssse3 -cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset - GET_GOT goffsetq - - movq m3, [aboveq] - pshufb m1, m3, [GLOBAL(sh_b23456777)] - pshufb m2, m3, [GLOBAL(sh_b12345677)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4 - pavgb m3, m2 - - ; store 4 lines - movd [dstq ], m3 - movd [dstq+strideq], m4 - lea dstq, [dstq+strideq*2] - psrldq m3, 1 - psrldq m4, 1 - movd [dstq ], m3 - movd [dstq+strideq], m4 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset - GET_GOT goffsetq - - movq m3, [aboveq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - pshufb m1, m3, [GLOBAL(sh_b2345677777777777)] - pshufb m0, m3, [GLOBAL(sh_b0123456777777777)] - pshufb m2, m3, [GLOBAL(sh_b1234567777777777)] - pshufb m3, [GLOBAL(sh_b0123456777777777)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m4 - pavgb m3, m2 - - ; store 4 lines - movq [dstq ], m3 - movq [dstq+strideq], m4 - psrldq m3, 1 - psrldq m4, 1 - movq [dstq+strideq*2], m3 - movq [dstq+stride3q ], m4 - lea dstq, [dstq+strideq*4] - psrldq m3, 1 - psrldq m4, 1 - - ; store 4 lines - movq [dstq ], m3 - movq [dstq+strideq], m4 - psrldq m3, 1 - psrldq m4, 1 - movq [dstq+strideq*2], m3 - movq [dstq+stride3q ], m4 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset - GET_GOT goffsetq - - mova m0, [aboveq] - DEFINE_ARGS dst, stride, stride3, line - lea stride3q, [strideq*3] - mova m1, [GLOBAL(sh_b123456789abcdeff)] - pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] - pshufb m3, m0, m1 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4 - pavgb m0, m3 - - mov lined, 4 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m4 - pshufb m0, m1 - pshufb m4, m1 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m4 - pshufb m0, m1 - pshufb m4, m1 - lea dstq, [dstq+strideq*4] - dec lined - jnz .loop - RESTORE_GOT - REP_RET - -INIT_XMM ssse3 -cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset - GET_GOT goffsetq - - mova m0, [aboveq] - mova m7, [aboveq+16] - DEFINE_ARGS dst, stride, stride3, line - mova m1, [GLOBAL(sh_b123456789abcdeff)] - lea stride3q, [strideq*3] - pshufb m2, m7, [GLOBAL(sh_b23456789abcdefff)] - pshufb m3, m7, m1 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4 - palignr m6, m7, m0, 1 - palignr m5, m7, m0, 2 - pavgb m7, m3 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2 - pavgb m0, m6 - - mov lined, 8 -.loop: - mova [dstq ], m0 - mova [dstq 
+16], m7 - mova [dstq+strideq ], m2 - mova [dstq+strideq +16], m4 - palignr m3, m7, m0, 1 - palignr m5, m4, m2, 1 - pshufb m7, m1 - pshufb m4, m1 - - mova [dstq+strideq*2 ], m3 - mova [dstq+strideq*2+16], m7 - mova [dstq+stride3q ], m5 - mova [dstq+stride3q +16], m4 - palignr m0, m7, m3, 1 - palignr m2, m4, m5, 1 - pshufb m7, m1 - pshufb m4, m1 - lea dstq, [dstq+strideq*4] - dec lined - jnz .loop - RESTORE_GOT - REP_RET - -INIT_XMM ssse3 -cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset - GET_GOT goffsetq - movd m0, [leftq] ; l1, l2, l3, l4 - movd m1, [aboveq-1] ; tl, t1, t2, t3 - punpckldq m0, m1 ; l1, l2, l3, l4, tl, t1, t2, t3 - pshufb m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3 - psrldq m1, m0, 1 ; l3, l2, l1, tl, t1, t2, t3 - psrldq m2, m0, 2 ; l2, l1, tl, t1, t2, t3 - ; comments below are for a predictor like this - ; A1 B1 C1 D1 - ; A2 B2 A1 B1 - ; A3 B3 A2 B2 - ; A4 B4 A3 B3 - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg B4 B3 B2 B1 C1 D1 - pavgb m1, m0 ; 2-tap avg A4 A3 A2 A1 - - punpcklqdq m3, m1 ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 .. - - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - pshufb m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 .. - movd [dstq+stride3q ], m3 - psrldq m3, 2 ; A3 B3 A2 B2 A1 B1 C1 D1 .. - movd [dstq+strideq*2], m3 - psrldq m3, 2 ; A2 B2 A1 B1 C1 D1 .. - movd [dstq+strideq ], m3 - psrldq m3, 2 ; A1 B1 C1 D1 .. - movd [dstq ], m3 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset - GET_GOT goffsetq - movq m0, [leftq] ; [0- 7] l1-8 [byte] - movhps m0, [aboveq-1] ; [8-15] tl, t1-7 [byte] - pshufb m1, m0, [GLOBAL(sh_b76543210)] ; l8-1 [word] - pshufb m2, m0, [GLOBAL(sh_b65432108)] ; l7-1,tl [word] - pshufb m3, m0, [GLOBAL(sh_b54321089)] ; l6-1,tl,t1 [word] - pshufb m0, [GLOBAL(sh_b89abcdef)] ; tl,t1-7 [word] - psrldq m4, m0, 1 ; t1-7 [word] - psrldq m5, m0, 2 ; t2-7 [word] - ; comments below are for a predictor like this - ; A1 B1 C1 D1 E1 F1 G1 H1 - ; A2 B2 A1 B1 C1 D1 E1 F1 - ; A3 B3 A2 B2 A1 B1 C1 D1 - ; A4 B4 A3 B3 A2 B2 A1 B1 - ; A5 B5 A4 B4 A3 B3 A2 B2 - ; A6 B6 A5 B5 A4 B4 A3 B3 - ; A7 B7 A6 B6 A5 B5 A4 B4 - ; A8 B8 A7 B7 A6 B6 A5 B5 - pavgb m6, m1, m2 ; 2-tap avg A8-A1 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7 ; 3-tap avg C-H1 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0 ; 3-tap avg B8-1 - - punpcklbw m6, m0 ; A-B8, A-B7 ... 
A-B2, A-B1 - - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - - movhps [dstq+stride3q], m6 ; A-B4, A-B3, A-B2, A-B1 - palignr m0, m7, m6, 10 ; A-B3, A-B2, A-B1, C-H1 - movq [dstq+strideq*2], m0 - psrldq m0, 2 ; A-B2, A-B1, C-H1 - movq [dstq+strideq ], m0 - psrldq m0, 2 ; A-H1 - movq [dstq ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq+stride3q ], m6 ; A-B8, A-B7, A-B6, A-B5 - psrldq m6, 2 ; A-B7, A-B6, A-B5, A-B4 - movq [dstq+strideq*2], m6 - psrldq m6, 2 ; A-B6, A-B5, A-B4, A-B3 - movq [dstq+strideq ], m6 - psrldq m6, 2 ; A-B5, A-B4, A-B3, A-B2 - movq [dstq ], m6 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset - GET_GOT goffsetq - mova m0, [leftq] - movu m7, [aboveq-1] - ; comments below are for a predictor like this - ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1 - ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 - ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 - ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 - ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 - ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 - ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 - ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 - ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 - ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 - ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 - ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 - ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 - ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 - ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 - ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 - pshufb m6, m7, [GLOBAL(sh_bfedcba9876543210)] - palignr m5, m0, m6, 15 - palignr m3, m0, m6, 14 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg - pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] - pavgb m5, m0 ; A1 - Ag - - punpcklbw m0, m4, m5 ; A-B8 ... A-B1 - punpckhbw m4, m5 ; A-B9 ... 
A-Bg - - pshufb m3, m7, [GLOBAL(sh_b123456789abcdeff)] - pshufb m5, m7, [GLOBAL(sh_b23456789abcdefff)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg C1-P1 - - pshufb m6, m0, [GLOBAL(sh_bfedcba9876543210)] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - palignr m2, m1, m6, 14 - mova [dstq ], m2 - palignr m2, m1, m6, 12 - mova [dstq+strideq ], m2 - palignr m2, m1, m6, 10 - mova [dstq+strideq*2], m2 - palignr m2, m1, m6, 8 - mova [dstq+stride3q ], m2 - lea dstq, [dstq+strideq*4] - palignr m2, m1, m6, 6 - mova [dstq ], m2 - palignr m2, m1, m6, 4 - mova [dstq+strideq ], m2 - palignr m2, m1, m6, 2 - mova [dstq+strideq*2], m2 - pshufb m4, [GLOBAL(sh_bfedcba9876543210)] - mova [dstq+stride3q ], m6 - lea dstq, [dstq+strideq*4] - - palignr m2, m6, m4, 14 - mova [dstq ], m2 - palignr m2, m6, m4, 12 - mova [dstq+strideq ], m2 - palignr m2, m6, m4, 10 - mova [dstq+strideq*2], m2 - palignr m2, m6, m4, 8 - mova [dstq+stride3q ], m2 - lea dstq, [dstq+strideq*4] - palignr m2, m6, m4, 6 - mova [dstq ], m2 - palignr m2, m6, m4, 4 - mova [dstq+strideq ], m2 - palignr m2, m6, m4, 2 - mova [dstq+strideq*2], m2 - mova [dstq+stride3q ], m4 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset - GET_GOT goffsetq - mova m0, [leftq] - movu m7, [aboveq-1] - movu m1, [aboveq+15] - - pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)] - pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high] - - palignr m3, m1, m7, 1 - palignr m5, m1, m7, 2 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low] - - pshufb m7, [GLOBAL(sh_bfedcba9876543210)] - palignr m5, m0, m7, 15 - palignr m3, m0, m7, 14 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg - pavgb m5, m0 ; A1 - Ag - punpcklbw m6, m4, m5 ; A-B8 ... A-B1 - punpckhbw m4, m5 ; A-B9 ... 
A-Bg - pshufb m6, [GLOBAL(sh_bfedcba9876543210)] - pshufb m4, [GLOBAL(sh_bfedcba9876543210)] - - DEFINE_ARGS dst, stride, stride3, left, line - lea stride3q, [strideq*3] - - palignr m5, m2, m1, 14 - palignr m7, m1, m6, 14 - mova [dstq ], m7 - mova [dstq+16 ], m5 - palignr m5, m2, m1, 12 - palignr m7, m1, m6, 12 - mova [dstq+strideq ], m7 - mova [dstq+strideq+16 ], m5 - palignr m5, m2, m1, 10 - palignr m7, m1, m6, 10 - mova [dstq+strideq*2 ], m7 - mova [dstq+strideq*2+16], m5 - palignr m5, m2, m1, 8 - palignr m7, m1, m6, 8 - mova [dstq+stride3q ], m7 - mova [dstq+stride3q+16 ], m5 - lea dstq, [dstq+strideq*4] - palignr m5, m2, m1, 6 - palignr m7, m1, m6, 6 - mova [dstq ], m7 - mova [dstq+16 ], m5 - palignr m5, m2, m1, 4 - palignr m7, m1, m6, 4 - mova [dstq+strideq ], m7 - mova [dstq+strideq+16 ], m5 - palignr m5, m2, m1, 2 - palignr m7, m1, m6, 2 - mova [dstq+strideq*2 ], m7 - mova [dstq+strideq*2+16], m5 - mova [dstq+stride3q ], m6 - mova [dstq+stride3q+16 ], m1 - lea dstq, [dstq+strideq*4] - - palignr m5, m1, m6, 14 - palignr m3, m6, m4, 14 - mova [dstq ], m3 - mova [dstq+16 ], m5 - palignr m5, m1, m6, 12 - palignr m3, m6, m4, 12 - mova [dstq+strideq ], m3 - mova [dstq+strideq+16 ], m5 - palignr m5, m1, m6, 10 - palignr m3, m6, m4, 10 - mova [dstq+strideq*2 ], m3 - mova [dstq+strideq*2+16], m5 - palignr m5, m1, m6, 8 - palignr m3, m6, m4, 8 - mova [dstq+stride3q ], m3 - mova [dstq+stride3q+16 ], m5 - lea dstq, [dstq+strideq*4] - palignr m5, m1, m6, 6 - palignr m3, m6, m4, 6 - mova [dstq ], m3 - mova [dstq+16 ], m5 - palignr m5, m1, m6, 4 - palignr m3, m6, m4, 4 - mova [dstq+strideq ], m3 - mova [dstq+strideq+16 ], m5 - palignr m5, m1, m6, 2 - palignr m3, m6, m4, 2 - mova [dstq+strideq*2 ], m3 - mova [dstq+strideq*2+16], m5 - mova [dstq+stride3q ], m4 - mova [dstq+stride3q+16 ], m6 - lea dstq, [dstq+strideq*4] - - mova m7, [leftq] - mova m3, [leftq+16] - palignr m5, m3, m7, 15 - palignr m0, m3, m7, 14 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh - - pavgb m5, m3 ; Ah - - punpcklbw m3, m2, m5 ; A-B8 ... A-B1 - punpckhbw m2, m5 ; A-B9 ... 
A-Bg - pshufb m3, [GLOBAL(sh_bfedcba9876543210)] - pshufb m2, [GLOBAL(sh_bfedcba9876543210)] - - palignr m7, m6, m4, 14 - palignr m0, m4, m3, 14 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m6, m4, 12 - palignr m0, m4, m3, 12 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m6, m4, 10 - palignr m0, m4, m3, 10 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - palignr m7, m6, m4, 8 - palignr m0, m4, m3, 8 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q+16 ], m7 - lea dstq, [dstq+strideq*4] - palignr m7, m6, m4, 6 - palignr m0, m4, m3, 6 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m6, m4, 4 - palignr m0, m4, m3, 4 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m6, m4, 2 - palignr m0, m4, m3, 2 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - mova [dstq+stride3q ], m3 - mova [dstq+stride3q+16 ], m4 - lea dstq, [dstq+strideq*4] - - palignr m7, m4, m3, 14 - palignr m0, m3, m2, 14 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m4, m3, 12 - palignr m0, m3, m2, 12 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m4, m3, 10 - palignr m0, m3, m2, 10 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - palignr m7, m4, m3, 8 - palignr m0, m3, m2, 8 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q+16 ], m7 - lea dstq, [dstq+strideq*4] - palignr m7, m4, m3, 6 - palignr m0, m3, m2, 6 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m4, m3, 4 - palignr m0, m3, m2, 4 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m4, m3, 2 - palignr m0, m3, m2, 2 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - mova [dstq+stride3q ], m2 - mova [dstq+stride3q+16 ], m3 - - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset - GET_GOT goffsetq - movq m3, [leftq] ; abcdefgh [byte] - lea stride3q, [strideq*3] - - pshufb m1, m3, [GLOBAL(sh_b2345677777777777)] - pshufb m0, m3, [GLOBAL(sh_b0123456777777777)] - pshufb m2, m3, [GLOBAL(sh_b1234567777777777)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3 - pavgb m0, m2 - punpcklbw m0, m3 ; interleaved output - - movq [dstq ], m0 - psrldq m0, 2 - movq [dstq+strideq ], m0 - psrldq m0, 2 - movq [dstq+strideq*2], m0 - psrldq m0, 2 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - pshufhw m0, m0, q0000 ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh - psrldq m0, 2 - movq [dstq ], m0 - psrldq m0, 2 - movq [dstq+strideq ], m0 - psrldq m0, 2 - movq [dstq+strideq*2], m0 - psrldq m0, 2 - movq [dstq+stride3q ], m0 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset - GET_GOT goffsetq - lea stride3q, [strideq*3] - mova m0, [leftq] ; abcdefghijklmnop [byte] - pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; bcdefghijklmnopp - pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 - pavgb m1, m0 ; ab, bc, cd .. 
no, op, pp [byte] - - punpckhbw m4, m1, m3 ; interleaved input - punpcklbw m1, m3 ; interleaved output - mova [dstq ], m1 - palignr m3, m4, m1, 2 - mova [dstq+strideq ], m3 - palignr m3, m4, m1, 4 - mova [dstq+strideq*2], m3 - palignr m3, m4, m1, 6 - mova [dstq+stride3q ], m3 - lea dstq, [dstq+strideq*4] - palignr m3, m4, m1, 8 - mova [dstq ], m3 - palignr m3, m4, m1, 10 - mova [dstq+strideq ], m3 - palignr m3, m4, m1, 12 - mova [dstq+strideq*2], m3 - palignr m3, m4, m1, 14 - mova [dstq+stride3q ], m3 - DEFINE_ARGS dst, stride, stride3, line - mov lined, 2 - mova m0, [GLOBAL(sh_b23456789abcdefff)] -.loop: - lea dstq, [dstq+strideq*4] - mova [dstq ], m4 - pshufb m4, m0 - mova [dstq+strideq ], m4 - pshufb m4, m0 - mova [dstq+strideq*2], m4 - pshufb m4, m0 - mova [dstq+stride3q ], m4 - pshufb m4, m0 - dec lined - jnz .loop - RESTORE_GOT - REP_RET - -INIT_XMM ssse3 -cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset - GET_GOT goffsetq - lea stride3q, [strideq*3] - mova m1, [leftq] ; 0-15 [byte] - mova m2, [leftq+16] ; 16-31 [byte] - pshufb m0, m2, [GLOBAL(sh_b23456789abcdefff)] - pshufb m4, m2, [GLOBAL(sh_b123456789abcdeff)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3 - palignr m6, m2, m1, 1 - palignr m5, m2, m1, 2 - pavgb m2, m4 ; high 16px even lines - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0 - pavgb m1, m6 ; low 16px even lines - - punpckhbw m6, m1, m0 ; interleaved output 2 - punpcklbw m1, m0 ; interleaved output 1 - - punpckhbw m7, m2, m3 ; interleaved output 4 - punpcklbw m2, m3 ; interleaved output 3 - - ; output 1st 8 lines (and half of 2nd 8 lines) - DEFINE_ARGS dst, stride, stride3, dst8 - lea dst8q, [dstq+strideq*8] - mova [dstq ], m1 - mova [dstq +16], m6 - mova [dst8q ], m6 - palignr m0, m6, m1, 2 - palignr m4, m2, m6, 2 - mova [dstq +strideq ], m0 - mova [dstq +strideq +16], m4 - mova [dst8q+strideq ], m4 - palignr m0, m6, m1, 4 - palignr m4, m2, m6, 4 - mova [dstq +strideq*2 ], m0 - mova [dstq +strideq*2+16], m4 - mova [dst8q+strideq*2 ], m4 - palignr m0, m6, m1, 6 - palignr m4, m2, m6, 6 - mova [dstq +stride3q ], m0 - mova [dstq +stride3q +16], m4 - mova [dst8q+stride3q ], m4 - lea dstq, [dstq +strideq*4] - lea dst8q, [dst8q+strideq*4] - palignr m0, m6, m1, 8 - palignr m4, m2, m6, 8 - mova [dstq ], m0 - mova [dstq +16], m4 - mova [dst8q ], m4 - palignr m0, m6, m1, 10 - palignr m4, m2, m6, 10 - mova [dstq +strideq ], m0 - mova [dstq +strideq +16], m4 - mova [dst8q+strideq ], m4 - palignr m0, m6, m1, 12 - palignr m4, m2, m6, 12 - mova [dstq +strideq*2 ], m0 - mova [dstq +strideq*2+16], m4 - mova [dst8q+strideq*2 ], m4 - palignr m0, m6, m1, 14 - palignr m4, m2, m6, 14 - mova [dstq +stride3q ], m0 - mova [dstq +stride3q +16], m4 - mova [dst8q+stride3q ], m4 - lea dstq, [dstq+strideq*4] - lea dst8q, [dst8q+strideq*4] - - ; output 2nd half of 2nd 8 lines and half of 3rd 8 lines - mova [dstq +16], m2 - mova [dst8q ], m2 - palignr m4, m7, m2, 2 - mova [dstq +strideq +16], m4 - mova [dst8q+strideq ], m4 - palignr m4, m7, m2, 4 - mova [dstq +strideq*2+16], m4 - mova [dst8q+strideq*2 ], m4 - palignr m4, m7, m2, 6 - mova [dstq +stride3q +16], m4 - mova [dst8q+stride3q ], m4 - lea dstq, [dstq+strideq*4] - lea dst8q, [dst8q+strideq*4] - palignr m4, m7, m2, 8 - mova [dstq +16], m4 - mova [dst8q ], m4 - palignr m4, m7, m2, 10 - mova [dstq +strideq +16], m4 - mova [dst8q+strideq ], m4 - palignr m4, m7, m2, 12 - mova [dstq +strideq*2+16], m4 - mova [dst8q+strideq*2 ], m4 - palignr m4, m7, m2, 14 - mova [dstq +stride3q +16], m4 - mova [dst8q+stride3q ], m4 - lea 
dstq, [dstq+strideq*4] - lea dst8q, [dst8q+strideq*4] - - ; output 2nd half of 3rd 8 lines and half of 4th 8 lines - mova m0, [GLOBAL(sh_b23456789abcdefff)] - mova [dstq +16], m7 - mova [dst8q ], m7 - pshufb m7, m0 - mova [dstq +strideq +16], m7 - mova [dst8q+strideq ], m7 - pshufb m7, m0 - mova [dstq +strideq*2+16], m7 - mova [dst8q+strideq*2 ], m7 - pshufb m7, m0 - mova [dstq +stride3q +16], m7 - mova [dst8q+stride3q ], m7 - pshufb m7, m0 - lea dstq, [dstq+strideq*4] - lea dst8q, [dst8q+strideq*4] - mova [dstq +16], m7 - mova [dst8q ], m7 - pshufb m7, m0 - mova [dstq +strideq +16], m7 - mova [dst8q+strideq ], m7 - pshufb m7, m0 - mova [dstq +strideq*2+16], m7 - mova [dst8q+strideq*2 ], m7 - pshufb m7, m0 - mova [dstq +stride3q +16], m7 - mova [dst8q+stride3q ], m7 - pshufb m7, m0 - lea dstq, [dstq+strideq*4] - - ; output last half of 4th 8 lines - mova [dstq +16], m7 - mova [dstq +strideq +16], m7 - mova [dstq +strideq*2+16], m7 - mova [dstq +stride3q +16], m7 - lea dstq, [dstq+strideq*4] - mova [dstq +16], m7 - mova [dstq +strideq +16], m7 - mova [dstq +strideq*2+16], m7 - mova [dstq +stride3q +16], m7 - - ; done! - RESTORE_GOT - RET diff --git a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.c deleted file mode 100644 index df5068c624..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.c +++ /dev/null @@ -1,4046 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/x86/inv_txfm_sse2.h" -#include "vpx_dsp/x86/txfm_common_sse2.h" - -#define RECON_AND_STORE4X4(dest, in_x) \ -{ \ - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - *(int *)(dest) = _mm_cvtsi128_si32(d0); \ -} - -void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i eight = _mm_set1_epi16(8); - const __m128i cst = _mm_setr_epi16( - (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, - (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, - (int16_t)cospi_8_64, (int16_t)cospi_24_64); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i input0, input1, input2, input3; - - // Rows - input0 = load_input_data(input); - input2 = load_input_data(input + 8); - - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_shufflelo_epi16(input0, 0xd8); - input0 = _mm_shufflehi_epi16(input0, 0xd8); - input2 = _mm_shufflelo_epi16(input2, 0xd8); - input2 = _mm_shufflehi_epi16(input2, 0xd8); - - input1 = _mm_unpackhi_epi32(input0, input0); - input0 = _mm_unpacklo_epi32(input0, input0); - input3 = _mm_unpackhi_epi32(input2, input2); - input2 = _mm_unpacklo_epi32(input2, input2); - - // Stage 1 - input0 = _mm_madd_epi16(input0, cst); - input1 = _mm_madd_epi16(input1, cst); - input2 = _mm_madd_epi16(input2, cst); - input3 = _mm_madd_epi16(input3, cst); - - input0 = _mm_add_epi32(input0, rounding); - input1 = _mm_add_epi32(input1, rounding); - input2 = _mm_add_epi32(input2, rounding); - input3 = _mm_add_epi32(input3, rounding); - - input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); - input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); - input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); - input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); - - // Stage 2 - input0 = _mm_packs_epi32(input0, input1); - input1 = _mm_packs_epi32(input2, input3); - - // Transpose - input2 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpackhi_epi16(input0, input1); - input0 = _mm_unpacklo_epi32(input2, input3); - input1 = _mm_unpackhi_epi32(input2, input3); - - // Switch column2, column 3, and then, we got: - // input2: column1, column 0; input3: column2, column 3. 
- input1 = _mm_shuffle_epi32(input1, 0x4e); - input2 = _mm_add_epi16(input0, input1); - input3 = _mm_sub_epi16(input0, input1); - - // Columns - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_unpacklo_epi32(input2, input2); - input1 = _mm_unpackhi_epi32(input2, input2); - input2 = _mm_unpackhi_epi32(input3, input3); - input3 = _mm_unpacklo_epi32(input3, input3); - - // Stage 1 - input0 = _mm_madd_epi16(input0, cst); - input1 = _mm_madd_epi16(input1, cst); - input2 = _mm_madd_epi16(input2, cst); - input3 = _mm_madd_epi16(input3, cst); - - input0 = _mm_add_epi32(input0, rounding); - input1 = _mm_add_epi32(input1, rounding); - input2 = _mm_add_epi32(input2, rounding); - input3 = _mm_add_epi32(input3, rounding); - - input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); - input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); - input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); - input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); - - // Stage 2 - input0 = _mm_packs_epi32(input0, input2); - input1 = _mm_packs_epi32(input1, input3); - - // Transpose - input2 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpackhi_epi16(input0, input1); - input0 = _mm_unpacklo_epi32(input2, input3); - input1 = _mm_unpackhi_epi32(input2, input3); - - // Switch column2, column 3, and then, we got: - // input2: column1, column 0; input3: column2, column 3. - input1 = _mm_shuffle_epi32(input1, 0x4e); - input2 = _mm_add_epi16(input0, input1); - input3 = _mm_sub_epi16(input0, input1); - - // Final round and shift - input2 = _mm_add_epi16(input2, eight); - input3 = _mm_add_epi16(input3, eight); - - input2 = _mm_srai_epi16(input2, 4); - input3 = _mm_srai_epi16(input3, 4); - - // Reconstruction and Store - { - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); - __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi32(d0, - _mm_cvtsi32_si128(*(const int *)(dest + stride))); - d2 = _mm_unpacklo_epi32( - _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2); - d0 = _mm_unpacklo_epi8(d0, zero); - d2 = _mm_unpacklo_epi8(d2, zero); - d0 = _mm_add_epi16(d0, input2); - d2 = _mm_add_epi16(d2, input3); - d0 = _mm_packus_epi16(d0, d2); - // store input0 - *(int *)dest = _mm_cvtsi128_si32(d0); - // store input1 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); - // store input2 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); - // store input3 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); - } -} - -void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 4); - - dc_value = _mm_set1_epi16(a); - - RECON_AND_STORE4X4(dest + 0 * stride, dc_value); - RECON_AND_STORE4X4(dest + 1 * stride, dc_value); - RECON_AND_STORE4X4(dest + 2 * stride, dc_value); - RECON_AND_STORE4X4(dest + 3 * stride, dc_value); -} - -static INLINE void transpose_4x4(__m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); - const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); - - res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); - res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); -} - -void idct4_sse2(__m128i *in) { - const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - 
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8]; - - transpose_4x4(in); - // stage 1 - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpackhi_epi16(in[0], in[1]); - v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - - u[0] = _mm_packs_epi32(v[0], v[1]); - u[1] = _mm_packs_epi32(v[3], v[2]); - - // stage 2 - in[0] = _mm_add_epi16(u[0], u[1]); - in[1] = _mm_sub_epi16(u[0], u[1]); - in[1] = _mm_shuffle_epi32(in[1], 0x4E); -} - -void iadst4_sse2(__m128i *in) { - const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); - const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); - const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); - const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); - const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); - const __m128i kZero = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8], in7; - - transpose_4x4(in); - in7 = _mm_srli_si128(in[1], 8); - in7 = _mm_add_epi16(in7, in[0]); - in7 = _mm_sub_epi16(in7, in[1]); - - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpackhi_epi16(in[0], in[1]); - u[2] = _mm_unpacklo_epi16(in7, kZero); - u[3] = _mm_unpackhi_epi16(in[0], kZero); - - v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 - v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 - v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 - v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 - v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 - v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 - - u[0] = _mm_add_epi32(v[0], v[1]); - u[1] = _mm_add_epi32(v[3], v[4]); - u[2] = v[2]; - u[3] = _mm_add_epi32(u[0], u[1]); - u[4] = _mm_slli_epi32(v[5], 2); - u[5] = _mm_add_epi32(u[3], v[5]); - u[6] = _mm_sub_epi32(u[5], u[4]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[1]); - in[1] = _mm_packs_epi32(u[2], u[3]); -} - -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ - const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ - const __m128i tr0_5 = _mm_unpacklo_epi16(in6, 
in7); \ - const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ - const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ - out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ - out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ - out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ - } - -#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \ - out0, out1, out2, out3) \ - { \ - const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ - const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - } - -#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - } - -// Define Macro for multiplying elements by constants and adding them together. 
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \ - cst0, cst1, cst2, cst3, res0, res1, res2, res3) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - tmp4 = _mm_madd_epi16(lo_1, cst2); \ - tmp5 = _mm_madd_epi16(hi_1, cst2); \ - tmp6 = _mm_madd_epi16(lo_1, cst3); \ - tmp7 = _mm_madd_epi16(hi_1, cst3); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp4 = _mm_add_epi32(tmp4, rounding); \ - tmp5 = _mm_add_epi32(tmp5, rounding); \ - tmp6 = _mm_add_epi32(tmp6, rounding); \ - tmp7 = _mm_add_epi32(tmp7, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - res2 = _mm_packs_epi32(tmp4, tmp5); \ - res3 = _mm_packs_epi32(tmp6, tmp7); \ - } - -#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - } - -#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) \ - { \ - /* Stage1 */ \ - { \ - const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ - const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ - const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ - const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ - \ - MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \ - stg1_1, stg1_2, stg1_3, stp1_4, \ - stp1_7, stp1_5, stp1_6) \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ - const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ - const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ - const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ - \ - MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \ - stg2_1, stg2_2, stg2_3, stp2_0, \ - stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ - tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ - tmp2 = _mm_madd_epi16(lo_56, 
stg2_0); \ - tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - } \ - \ - /* Stage4 */ \ - out0 = _mm_adds_epi16(stp1_0, stp2_7); \ - out1 = _mm_adds_epi16(stp1_1, stp1_6); \ - out2 = _mm_adds_epi16(stp1_2, stp1_5); \ - out3 = _mm_adds_epi16(stp1_3, stp2_4); \ - out4 = _mm_subs_epi16(stp1_3, stp2_4); \ - out5 = _mm_subs_epi16(stp1_2, stp1_5); \ - out6 = _mm_subs_epi16(stp1_1, stp1_6); \ - out7 = _mm_subs_epi16(stp1_0, stp2_7); \ - } - -void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - - // Load input data. 
- in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - in4 = load_input_data(input + 8 * 4); - in5 = load_input_data(input + 8 * 5); - in6 = load_input_data(input + 8 * 6); - in7 = load_input_data(input + 8 * 7); - - // 2-D - for (i = 0; i < 2; i++) { - // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - - // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - } - - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); -} - -void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 5); - - dc_value = _mm_set1_epi16(a); - - RECON_AND_STORE(dest + 0 * stride, dc_value); - RECON_AND_STORE(dest + 1 * stride, dc_value); - RECON_AND_STORE(dest + 2 * stride, dc_value); - RECON_AND_STORE(dest + 3 * stride, dc_value); - RECON_AND_STORE(dest + 4 * stride, dc_value); - RECON_AND_STORE(dest + 5 * stride, dc_value); - RECON_AND_STORE(dest + 6 * stride, dc_value); - RECON_AND_STORE(dest + 7 * stride, dc_value); -} - -void idct8_sse2(__m128i *in) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - - // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], - in0, in1, in2, in3, in4, in5, in6, in7); - - // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, - in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]); -} - -void 
iadst8_sse2(__m128i *in) { - const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); - const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__const_0 = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; - __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; - __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - - // transpose - array_transpose_8x8(in, in); - - // properly aligned for butterfly input - in0 = in[7]; - in1 = in[0]; - in2 = in[5]; - in3 = in[2]; - in4 = in[3]; - in5 = in[4]; - in6 = in[1]; - in7 = in[6]; - - // column transformation - // stage 1 - // interleave and multiply/add into 32-bit integer - s0 = _mm_unpacklo_epi16(in0, in1); - s1 = _mm_unpackhi_epi16(in0, in1); - s2 = _mm_unpacklo_epi16(in2, in3); - s3 = _mm_unpackhi_epi16(in2, in3); - s4 = _mm_unpacklo_epi16(in4, in5); - s5 = _mm_unpackhi_epi16(in4, in5); - s6 = _mm_unpacklo_epi16(in6, in7); - s7 = _mm_unpackhi_epi16(in6, in7); - - u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); - u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); - u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); - u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); - u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); - u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); - u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); - u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); - u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); - u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); - u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); - u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); - u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); - u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); - u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); - u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); - - // addition - w0 = _mm_add_epi32(u0, u8); - w1 = _mm_add_epi32(u1, u9); - w2 = _mm_add_epi32(u2, u10); - w3 = _mm_add_epi32(u3, u11); - w4 = _mm_add_epi32(u4, u12); - w5 = _mm_add_epi32(u5, u13); - w6 = _mm_add_epi32(u6, u14); - w7 = _mm_add_epi32(u7, u15); - w8 = _mm_sub_epi32(u0, u8); - w9 = _mm_sub_epi32(u1, u9); - w10 = _mm_sub_epi32(u2, u10); - w11 = _mm_sub_epi32(u3, u11); - w12 = _mm_sub_epi32(u4, u12); - w13 = _mm_sub_epi32(u5, u13); - w14 = _mm_sub_epi32(u6, u14); - w15 = _mm_sub_epi32(u7, u15); - - // shift and rounding - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 
= _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); - v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); - v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); - v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); - v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); - v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); - v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); - v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); - v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); - u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); - u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); - u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); - u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); - u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); - u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); - u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); - - // back to 16-bit and pack 8 integers into __m128i - in[0] = _mm_packs_epi32(u0, u1); - in[1] = _mm_packs_epi32(u2, u3); - in[2] = _mm_packs_epi32(u4, u5); - in[3] = _mm_packs_epi32(u6, u7); - in[4] = _mm_packs_epi32(u8, u9); - in[5] = _mm_packs_epi32(u10, u11); - in[6] = _mm_packs_epi32(u12, u13); - in[7] = _mm_packs_epi32(u14, u15); - - // stage 2 - s0 = _mm_add_epi16(in[0], in[2]); - s1 = _mm_add_epi16(in[1], in[3]); - s2 = _mm_sub_epi16(in[0], in[2]); - s3 = _mm_sub_epi16(in[1], in[3]); - u0 = _mm_unpacklo_epi16(in[4], in[5]); - u1 = _mm_unpackhi_epi16(in[4], in[5]); - u2 = _mm_unpacklo_epi16(in[6], in[7]); - u3 = _mm_unpackhi_epi16(in[6], in[7]); - - v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); - v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); - v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); - v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); - v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); - v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); - v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); - v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); - - w0 = _mm_add_epi32(v0, v4); - w1 = _mm_add_epi32(v1, v5); - w2 = _mm_add_epi32(v2, v6); - w3 = _mm_add_epi32(v3, v7); - w4 = _mm_sub_epi32(v0, v4); - w5 = _mm_sub_epi32(v1, v5); - w6 = _mm_sub_epi32(v2, v6); - w7 = _mm_sub_epi32(v3, v7); - - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - - // back to 16-bit intergers - s4 = _mm_packs_epi32(u0, u1); - s5 = _mm_packs_epi32(u2, u3); - s6 = _mm_packs_epi32(u4, u5); - s7 = _mm_packs_epi32(u6, u7); - - // stage 3 - u0 = _mm_unpacklo_epi16(s2, s3); - u1 = _mm_unpackhi_epi16(s2, s3); - u2 
= _mm_unpacklo_epi16(s6, s7); - u3 = _mm_unpackhi_epi16(s6, s7); - - v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); - v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); - v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); - v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); - v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); - v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); - v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); - v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); - - u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); - u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); - - s2 = _mm_packs_epi32(v0, v1); - s3 = _mm_packs_epi32(v2, v3); - s6 = _mm_packs_epi32(v4, v5); - s7 = _mm_packs_epi32(v6, v7); - - in[0] = s0; - in[1] = _mm_sub_epi16(k__const_0, s4); - in[2] = s6; - in[3] = _mm_sub_epi16(k__const_0, s2); - in[4] = s3; - in[5] = _mm_sub_epi16(k__const_0, s7); - in[6] = s5; - in[7] = _mm_sub_epi16(k__const_0, s1); -} - -void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - - // Rows. Load 4-row input data. 
- in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - - // 8x4 Transpose - TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); - // Stage1 - { - const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); - const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); - - tmp0 = _mm_madd_epi16(lo_17, stg1_0); - tmp2 = _mm_madd_epi16(lo_17, stg1_1); - tmp4 = _mm_madd_epi16(lo_35, stg1_2); - tmp6 = _mm_madd_epi16(lo_35, stg1_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_4 = _mm_packs_epi32(tmp0, tmp2); - stp1_5 = _mm_packs_epi32(tmp4, tmp6); - } - - // Stage2 - { - const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); - const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); - - tmp0 = _mm_madd_epi16(lo_04, stg2_0); - tmp2 = _mm_madd_epi16(lo_04, stg2_1); - tmp4 = _mm_madd_epi16(lo_26, stg2_2); - tmp6 = _mm_madd_epi16(lo_26, stg2_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp2_0 = _mm_packs_epi32(tmp0, tmp2); - stp2_2 = _mm_packs_epi32(tmp6, tmp4); - - tmp0 = _mm_adds_epi16(stp1_4, stp1_5); - tmp1 = _mm_subs_epi16(stp1_4, stp1_5); - - stp2_4 = tmp0; - stp2_5 = _mm_unpacklo_epi64(tmp1, zero); - stp2_6 = _mm_unpackhi_epi64(tmp1, zero); - } - - // Stage3 - { - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); - - tmp4 = _mm_adds_epi16(stp2_0, stp2_2); - tmp6 = _mm_subs_epi16(stp2_0, stp2_2); - - stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); - stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); - - tmp0 = _mm_madd_epi16(lo_56, stg3_0); - tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - - stp1_5 = _mm_packs_epi32(tmp0, tmp2); - } - - // Stage4 - tmp0 = _mm_adds_epi16(stp1_3, stp2_4); - tmp1 = _mm_adds_epi16(stp1_2, stp1_5); - tmp2 = _mm_subs_epi16(stp1_3, stp2_4); - tmp3 = _mm_subs_epi16(stp1_2, stp1_5); - - TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) - - IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, - in0, in1, in2, in3, in4, in5, in6, in7); - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); 
- RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); -} - -#define IDCT16 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ - const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ - const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ - const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ - const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \ - stg2_0, stg2_1, stg2_2, stg2_3, \ - stp2_8, stp2_15, stp2_9, stp2_14) \ - \ - MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \ - stg2_4, stg2_5, stg2_6, stg2_7, \ - stp2_10, stp2_13, stp2_11, stp2_12) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ - const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ - const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \ - stg3_0, stg3_1, stg3_2, stg3_3, \ - stp1_4, stp1_7, stp1_5, stp1_6) \ - \ - stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - \ - stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ - const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ - const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \ - stg4_0, stg4_1, stg4_2, stg4_3, \ - stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ - stg4_4, stg4_5, stg4_6, stg4_7, \ - stp2_9, stp2_14, stp2_10, stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - 
tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ - stg6_0, stg4_0, stg6_0, stg4_0, \ - stp2_10, stp2_13, stp2_11, stp2_12) \ - } - -#define IDCT16_10 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \ - stg2_0, stg2_1, stg2_6, stg2_7, \ - stp1_8_0, stp1_15, stp1_11, stp1_12_0) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \ - stg3_0, stg3_1, \ - stp2_4, stp2_7) \ - \ - stp1_9 = stp1_8_0; \ - stp1_10 = stp1_11; \ - \ - stp1_13 = stp1_12_0; \ - stp1_14 = stp1_15; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \ - stg4_0, stg4_1, \ - stp1_0, stp1_1) \ - stp2_5 = stp2_4; \ - stp2_6 = stp2_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ - stg4_4, stg4_5, stg4_6, stg4_7, \ - stp2_9, stp2_14, stp2_10, stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_2 = stp1_1; \ - stp1_3 = stp1_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, 
DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ - stg6_0, stg4_0, stg6_0, stg4_0, \ - stp2_10, stp2_13, stp2_11, stp2_12) \ - } - -void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[16], l[16], r[16], *curr1; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_8_0, stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - - curr1 = 
l; - for (i = 0; i < 2; i++) { - // 1-D idct - - // Load input data. - in[0] = load_input_data(input); - in[8] = load_input_data(input + 8 * 1); - in[1] = load_input_data(input + 8 * 2); - in[9] = load_input_data(input + 8 * 3); - in[2] = load_input_data(input + 8 * 4); - in[10] = load_input_data(input + 8 * 5); - in[3] = load_input_data(input + 8 * 6); - in[11] = load_input_data(input + 8 * 7); - in[4] = load_input_data(input + 8 * 8); - in[12] = load_input_data(input + 8 * 9); - in[5] = load_input_data(input + 8 * 10); - in[13] = load_input_data(input + 8 * 11); - in[6] = load_input_data(input + 8 * 12); - in[14] = load_input_data(input + 8 * 13); - in[7] = load_input_data(input + 8 * 14); - in[15] = load_input_data(input + 8 * 15); - - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - - IDCT16 - - // Stage7 - curr1[0] = _mm_add_epi16(stp2_0, stp1_15); - curr1[1] = _mm_add_epi16(stp2_1, stp1_14); - curr1[2] = _mm_add_epi16(stp2_2, stp2_13); - curr1[3] = _mm_add_epi16(stp2_3, stp2_12); - curr1[4] = _mm_add_epi16(stp2_4, stp2_11); - curr1[5] = _mm_add_epi16(stp2_5, stp2_10); - curr1[6] = _mm_add_epi16(stp2_6, stp1_9); - curr1[7] = _mm_add_epi16(stp2_7, stp1_8); - curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); - curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); - curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); - curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); - curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); - curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); - curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); - curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); - - curr1 = r; - input += 128; - } - for (i = 0; i < 2; i++) { - int j; - // 1-D idct - array_transpose_8x8(l + i * 8, in); - array_transpose_8x8(r + i * 8, in + 8); - - IDCT16 - - // 2-D - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); - - for (j = 0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a, i; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); - - dc_value = _mm_set1_epi16(a); - - for (i = 0; i < 16; ++i) { - RECON_AND_STORE(dest + 0, dc_value); - RECON_AND_STORE(dest + 8, dc_value); - dest += stride; - } -} - -static void iadst16_8col(__m128i *in) { - // perform 16x16 1-D ADST for 8 columns - __m128i s[16], x[16], u[32], v[32]; - const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); - 
const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); - const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); - const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); - - u[0] = _mm_unpacklo_epi16(in[15], in[0]); - u[1] = _mm_unpackhi_epi16(in[15], in[0]); - u[2] = _mm_unpacklo_epi16(in[13], in[2]); - u[3] = _mm_unpackhi_epi16(in[13], in[2]); - u[4] = _mm_unpacklo_epi16(in[11], in[4]); - u[5] = _mm_unpackhi_epi16(in[11], in[4]); - u[6] = _mm_unpacklo_epi16(in[9], in[6]); - u[7] = _mm_unpackhi_epi16(in[9], in[6]); - u[8] = _mm_unpacklo_epi16(in[7], in[8]); - u[9] = _mm_unpackhi_epi16(in[7], in[8]); - u[10] = _mm_unpacklo_epi16(in[5], in[10]); - u[11] = _mm_unpackhi_epi16(in[5], in[10]); - u[12] = _mm_unpacklo_epi16(in[3], in[12]); - u[13] = _mm_unpackhi_epi16(in[3], in[12]); - u[14] = _mm_unpacklo_epi16(in[1], in[14]); - u[15] = _mm_unpackhi_epi16(in[1], in[14]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); - v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); - v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); - v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); - v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); - v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); - v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); - v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); - v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); - v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); - v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); - v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); - v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); - v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); - v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); - v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); - v[16] = 
_mm_madd_epi16(u[8], k__cospi_p17_p15); - v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); - v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); - v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); - v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); - v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); - v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); - v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); - v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); - v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); - v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); - v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); - v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); - v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); - v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); - v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); - - u[0] = _mm_add_epi32(v[0], v[16]); - u[1] = _mm_add_epi32(v[1], v[17]); - u[2] = _mm_add_epi32(v[2], v[18]); - u[3] = _mm_add_epi32(v[3], v[19]); - u[4] = _mm_add_epi32(v[4], v[20]); - u[5] = _mm_add_epi32(v[5], v[21]); - u[6] = _mm_add_epi32(v[6], v[22]); - u[7] = _mm_add_epi32(v[7], v[23]); - u[8] = _mm_add_epi32(v[8], v[24]); - u[9] = _mm_add_epi32(v[9], v[25]); - u[10] = _mm_add_epi32(v[10], v[26]); - u[11] = _mm_add_epi32(v[11], v[27]); - u[12] = _mm_add_epi32(v[12], v[28]); - u[13] = _mm_add_epi32(v[13], v[29]); - u[14] = _mm_add_epi32(v[14], v[30]); - u[15] = _mm_add_epi32(v[15], v[31]); - u[16] = _mm_sub_epi32(v[0], v[16]); - u[17] = _mm_sub_epi32(v[1], v[17]); - u[18] = _mm_sub_epi32(v[2], v[18]); - u[19] = _mm_sub_epi32(v[3], v[19]); - u[20] = _mm_sub_epi32(v[4], v[20]); - u[21] = _mm_sub_epi32(v[5], v[21]); - u[22] = _mm_sub_epi32(v[6], v[22]); - u[23] = _mm_sub_epi32(v[7], v[23]); - u[24] = _mm_sub_epi32(v[8], v[24]); - u[25] = _mm_sub_epi32(v[9], v[25]); - u[26] = _mm_sub_epi32(v[10], v[26]); - u[27] = _mm_sub_epi32(v[11], v[27]); - u[28] = _mm_sub_epi32(v[12], v[28]); - u[29] = _mm_sub_epi32(v[13], v[29]); - u[30] = _mm_sub_epi32(v[14], v[30]); - u[31] = _mm_sub_epi32(v[15], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); - v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); - v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); - v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); - v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); - v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); - v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); - v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); - v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); - v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); - v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); - v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); - v[28] = 
_mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); - v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); - v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); - v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); - u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); - u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); - u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); - u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); - u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); - u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); - u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); - u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); - u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); - u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); - u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); - u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); - u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); - u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); - u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); - - s[0] = _mm_packs_epi32(u[0], u[1]); - s[1] = _mm_packs_epi32(u[2], u[3]); - s[2] = _mm_packs_epi32(u[4], u[5]); - s[3] = _mm_packs_epi32(u[6], u[7]); - s[4] = _mm_packs_epi32(u[8], u[9]); - s[5] = _mm_packs_epi32(u[10], u[11]); - s[6] = _mm_packs_epi32(u[12], u[13]); - s[7] = _mm_packs_epi32(u[14], u[15]); - s[8] = _mm_packs_epi32(u[16], u[17]); - s[9] = _mm_packs_epi32(u[18], u[19]); - s[10] = _mm_packs_epi32(u[20], u[21]); - s[11] = _mm_packs_epi32(u[22], u[23]); - s[12] = _mm_packs_epi32(u[24], u[25]); - s[13] = _mm_packs_epi32(u[26], u[27]); - s[14] = _mm_packs_epi32(u[28], u[29]); - s[15] = _mm_packs_epi32(u[30], u[31]); - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[8], s[9]); - u[1] = _mm_unpackhi_epi16(s[8], s[9]); - u[2] = _mm_unpacklo_epi16(s[10], s[11]); - u[3] = _mm_unpackhi_epi16(s[10], s[11]); - u[4] = _mm_unpacklo_epi16(s[12], s[13]); - u[5] = _mm_unpackhi_epi16(s[12], s[13]); - u[6] = _mm_unpacklo_epi16(s[14], s[15]); - u[7] = _mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); - v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); - v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); - v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); - v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); - v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); - v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); - v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); - v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); - v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); - v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); - v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); - v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); - v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); - v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); - v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); - - u[0] 
= _mm_add_epi32(v[0], v[8]); - u[1] = _mm_add_epi32(v[1], v[9]); - u[2] = _mm_add_epi32(v[2], v[10]); - u[3] = _mm_add_epi32(v[3], v[11]); - u[4] = _mm_add_epi32(v[4], v[12]); - u[5] = _mm_add_epi32(v[5], v[13]); - u[6] = _mm_add_epi32(v[6], v[14]); - u[7] = _mm_add_epi32(v[7], v[15]); - u[8] = _mm_sub_epi32(v[0], v[8]); - u[9] = _mm_sub_epi32(v[1], v[9]); - u[10] = _mm_sub_epi32(v[2], v[10]); - u[11] = _mm_sub_epi32(v[3], v[11]); - u[12] = _mm_sub_epi32(v[4], v[12]); - u[13] = _mm_sub_epi32(v[5], v[13]); - u[14] = _mm_sub_epi32(v[6], v[14]); - u[15] = _mm_sub_epi32(v[7], v[15]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - - x[0] = _mm_add_epi16(s[0], s[4]); - x[1] = _mm_add_epi16(s[1], s[5]); - x[2] = _mm_add_epi16(s[2], s[6]); - x[3] = _mm_add_epi16(s[3], s[7]); - x[4] = _mm_sub_epi16(s[0], s[4]); - x[5] = _mm_sub_epi16(s[1], s[5]); - x[6] = _mm_sub_epi16(s[2], s[6]); - x[7] = _mm_sub_epi16(s[3], s[7]); - x[8] = _mm_packs_epi32(u[0], u[1]); - x[9] = _mm_packs_epi32(u[2], u[3]); - x[10] = _mm_packs_epi32(u[4], u[5]); - x[11] = _mm_packs_epi32(u[6], u[7]); - x[12] = _mm_packs_epi32(u[8], u[9]); - x[13] = _mm_packs_epi32(u[10], u[11]); - x[14] = _mm_packs_epi32(u[12], u[13]); - x[15] = _mm_packs_epi32(u[14], u[15]); - - // stage 3 - u[0] = _mm_unpacklo_epi16(x[4], x[5]); - u[1] = _mm_unpackhi_epi16(x[4], x[5]); - u[2] = _mm_unpacklo_epi16(x[6], x[7]); - u[3] = _mm_unpackhi_epi16(x[6], x[7]); - u[4] = _mm_unpacklo_epi16(x[12], x[13]); - u[5] = _mm_unpackhi_epi16(x[12], x[13]); - u[6] = _mm_unpacklo_epi16(x[14], x[15]); - u[7] = _mm_unpackhi_epi16(x[14], x[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); - v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); - v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); - v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); - v[7] = _mm_madd_epi16(u[3], 
k__cospi_p08_p24); - v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); - v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); - v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); - v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); - v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); - v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); - v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); - v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], v[4]); - u[1] = _mm_add_epi32(v[1], v[5]); - u[2] = _mm_add_epi32(v[2], v[6]); - u[3] = _mm_add_epi32(v[3], v[7]); - u[4] = _mm_sub_epi32(v[0], v[4]); - u[5] = _mm_sub_epi32(v[1], v[5]); - u[6] = _mm_sub_epi32(v[2], v[6]); - u[7] = _mm_sub_epi32(v[3], v[7]); - u[8] = _mm_add_epi32(v[8], v[12]); - u[9] = _mm_add_epi32(v[9], v[13]); - u[10] = _mm_add_epi32(v[10], v[14]); - u[11] = _mm_add_epi32(v[11], v[15]); - u[12] = _mm_sub_epi32(v[8], v[12]); - u[13] = _mm_sub_epi32(v[9], v[13]); - u[14] = _mm_sub_epi32(v[10], v[14]); - u[15] = _mm_sub_epi32(v[11], v[15]); - - u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[0] = _mm_add_epi16(x[0], x[2]); - s[1] = _mm_add_epi16(x[1], x[3]); - s[2] = _mm_sub_epi16(x[0], x[2]); - s[3] = _mm_sub_epi16(x[1], x[3]); - s[4] = _mm_packs_epi32(v[0], v[1]); - s[5] = _mm_packs_epi32(v[2], v[3]); - s[6] = _mm_packs_epi32(v[4], v[5]); - s[7] = _mm_packs_epi32(v[6], v[7]); - s[8] = _mm_add_epi16(x[8], x[10]); - s[9] = _mm_add_epi16(x[9], x[11]); - s[10] = _mm_sub_epi16(x[8], x[10]); - s[11] = _mm_sub_epi16(x[9], x[11]); - s[12] = _mm_packs_epi32(v[8], v[9]); - s[13] = _mm_packs_epi32(v[10], v[11]); - s[14] = _mm_packs_epi32(v[12], v[13]); - s[15] = _mm_packs_epi32(v[14], v[15]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(s[2], s[3]); - u[1] = _mm_unpackhi_epi16(s[2], s[3]); - u[2] = _mm_unpacklo_epi16(s[6], s[7]); - u[3] = _mm_unpackhi_epi16(s[6], s[7]); - u[4] = _mm_unpacklo_epi16(s[10], s[11]); - u[5] = _mm_unpackhi_epi16(s[10], s[11]); - u[6] = _mm_unpacklo_epi16(s[14], 
s[15]); - u[7] = _mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); - v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); - v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); - v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); - v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); - v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); - v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); - v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - in[0] = s[0]; - in[1] = _mm_sub_epi16(kZero, s[8]); - in[2] = s[12]; - in[3] = _mm_sub_epi16(kZero, s[4]); - in[4] = _mm_packs_epi32(v[4], v[5]); - in[5] = _mm_packs_epi32(v[12], v[13]); - in[6] = _mm_packs_epi32(v[8], v[9]); - in[7] = _mm_packs_epi32(v[0], v[1]); - in[8] = _mm_packs_epi32(v[2], v[3]); - in[9] = _mm_packs_epi32(v[10], v[11]); - in[10] = _mm_packs_epi32(v[14], v[15]); - in[11] = _mm_packs_epi32(v[6], v[7]); - in[12] = s[5]; - in[13] = _mm_sub_epi16(kZero, s[13]); - in[14] = s[9]; - in[15] = _mm_sub_epi16(kZero, s[1]); -} - -static void idct16_8col(__m128i *in) { - const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i 
k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i v[16], u[16], s[16], t[16]; - - // stage 1 - s[0] = in[0]; - s[1] = in[8]; - s[2] = in[4]; - s[3] = in[12]; - s[4] = in[2]; - s[5] = in[10]; - s[6] = in[6]; - s[7] = in[14]; - s[8] = in[1]; - s[9] = in[9]; - s[10] = in[5]; - s[11] = in[13]; - s[12] = in[3]; - s[13] = in[11]; - s[14] = in[7]; - s[15] = in[15]; - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[8], s[15]); - u[1] = _mm_unpackhi_epi16(s[8], s[15]); - u[2] = _mm_unpacklo_epi16(s[9], s[14]); - u[3] = _mm_unpackhi_epi16(s[9], s[14]); - u[4] = _mm_unpacklo_epi16(s[10], s[13]); - u[5] = _mm_unpackhi_epi16(s[10], s[13]); - u[6] = _mm_unpacklo_epi16(s[11], s[12]); - u[7] = _mm_unpackhi_epi16(s[11], s[12]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); - v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); - v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); - v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); - v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); - v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); - v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); - v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); - v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); - v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); - v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); - v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); - v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); - v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); - v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); - v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], 
DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[8] = _mm_packs_epi32(u[0], u[1]); - s[15] = _mm_packs_epi32(u[2], u[3]); - s[9] = _mm_packs_epi32(u[4], u[5]); - s[14] = _mm_packs_epi32(u[6], u[7]); - s[10] = _mm_packs_epi32(u[8], u[9]); - s[13] = _mm_packs_epi32(u[10], u[11]); - s[11] = _mm_packs_epi32(u[12], u[13]); - s[12] = _mm_packs_epi32(u[14], u[15]); - - // stage 3 - t[0] = s[0]; - t[1] = s[1]; - t[2] = s[2]; - t[3] = s[3]; - u[0] = _mm_unpacklo_epi16(s[4], s[7]); - u[1] = _mm_unpackhi_epi16(s[4], s[7]); - u[2] = _mm_unpacklo_epi16(s[5], s[6]); - u[3] = _mm_unpackhi_epi16(s[5], s[6]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); - v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); - v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); - v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); - v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); - v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); - v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); - v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[4] = _mm_packs_epi32(u[0], u[1]); - t[7] = _mm_packs_epi32(u[2], u[3]); - t[5] = _mm_packs_epi32(u[4], u[5]); - t[6] = _mm_packs_epi32(u[6], u[7]); - t[8] = _mm_add_epi16(s[8], s[9]); - t[9] = _mm_sub_epi16(s[8], s[9]); - t[10] = _mm_sub_epi16(s[11], s[10]); - t[11] = _mm_add_epi16(s[10], s[11]); - t[12] = _mm_add_epi16(s[12], s[13]); - t[13] = _mm_sub_epi16(s[12], s[13]); - t[14] = _mm_sub_epi16(s[15], s[14]); - t[15] = _mm_add_epi16(s[14], s[15]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(t[0], t[1]); - u[1] = _mm_unpackhi_epi16(t[0], t[1]); - u[2] = _mm_unpacklo_epi16(t[2], t[3]); - u[3] = _mm_unpackhi_epi16(t[2], t[3]); - u[4] = _mm_unpacklo_epi16(t[9], t[14]); - u[5] = _mm_unpackhi_epi16(t[9], t[14]); - u[6] = _mm_unpacklo_epi16(t[10], t[13]); - u[7] = _mm_unpackhi_epi16(t[10], t[13]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); - v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); - v[6] 
= _mm_madd_epi16(u[2], k__cospi_p08_p24); - v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); - v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); - v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); - v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); - v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); - v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); - v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); - v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); - v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[0] = _mm_packs_epi32(u[0], u[1]); - s[1] = _mm_packs_epi32(u[2], u[3]); - s[2] = _mm_packs_epi32(u[4], u[5]); - s[3] = _mm_packs_epi32(u[6], u[7]); - s[4] = _mm_add_epi16(t[4], t[5]); - s[5] = _mm_sub_epi16(t[4], t[5]); - s[6] = _mm_sub_epi16(t[7], t[6]); - s[7] = _mm_add_epi16(t[6], t[7]); - s[8] = t[8]; - s[15] = t[15]; - s[9] = _mm_packs_epi32(u[8], u[9]); - s[14] = _mm_packs_epi32(u[10], u[11]); - s[10] = _mm_packs_epi32(u[12], u[13]); - s[13] = _mm_packs_epi32(u[14], u[15]); - s[11] = t[11]; - s[12] = t[12]; - - // stage 5 - t[0] = _mm_add_epi16(s[0], s[3]); - t[1] = _mm_add_epi16(s[1], s[2]); - t[2] = _mm_sub_epi16(s[1], s[2]); - t[3] = _mm_sub_epi16(s[0], s[3]); - t[4] = s[4]; - t[7] = s[7]; - - u[0] = _mm_unpacklo_epi16(s[5], s[6]); - u[1] = _mm_unpackhi_epi16(s[5], s[6]); - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - t[5] = _mm_packs_epi32(u[0], u[1]); - 
t[6] = _mm_packs_epi32(u[2], u[3]); - - t[8] = _mm_add_epi16(s[8], s[11]); - t[9] = _mm_add_epi16(s[9], s[10]); - t[10] = _mm_sub_epi16(s[9], s[10]); - t[11] = _mm_sub_epi16(s[8], s[11]); - t[12] = _mm_sub_epi16(s[15], s[12]); - t[13] = _mm_sub_epi16(s[14], s[13]); - t[14] = _mm_add_epi16(s[13], s[14]); - t[15] = _mm_add_epi16(s[12], s[15]); - - // stage 6 - s[0] = _mm_add_epi16(t[0], t[7]); - s[1] = _mm_add_epi16(t[1], t[6]); - s[2] = _mm_add_epi16(t[2], t[5]); - s[3] = _mm_add_epi16(t[3], t[4]); - s[4] = _mm_sub_epi16(t[3], t[4]); - s[5] = _mm_sub_epi16(t[2], t[5]); - s[6] = _mm_sub_epi16(t[1], t[6]); - s[7] = _mm_sub_epi16(t[0], t[7]); - s[8] = t[8]; - s[9] = t[9]; - - u[0] = _mm_unpacklo_epi16(t[10], t[13]); - u[1] = _mm_unpackhi_epi16(t[10], t[13]); - u[2] = _mm_unpacklo_epi16(t[11], t[12]); - u[3] = _mm_unpackhi_epi16(t[11], t[12]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - s[10] = _mm_packs_epi32(u[0], u[1]); - s[13] = _mm_packs_epi32(u[2], u[3]); - s[11] = _mm_packs_epi32(u[4], u[5]); - s[12] = _mm_packs_epi32(u[6], u[7]); - s[14] = t[14]; - s[15] = t[15]; - - // stage 7 - in[0] = _mm_add_epi16(s[0], s[15]); - in[1] = _mm_add_epi16(s[1], s[14]); - in[2] = _mm_add_epi16(s[2], s[13]); - in[3] = _mm_add_epi16(s[3], s[12]); - in[4] = _mm_add_epi16(s[4], s[11]); - in[5] = _mm_add_epi16(s[5], s[10]); - in[6] = _mm_add_epi16(s[6], s[9]); - in[7] = _mm_add_epi16(s[7], s[8]); - in[8] = _mm_sub_epi16(s[7], s[8]); - in[9] = _mm_sub_epi16(s[6], s[9]); - in[10] = _mm_sub_epi16(s[5], s[10]); - in[11] = _mm_sub_epi16(s[4], s[11]); - in[12] = _mm_sub_epi16(s[3], s[12]); - in[13] = _mm_sub_epi16(s[2], s[13]); - in[14] = _mm_sub_epi16(s[1], s[14]); - in[15] = _mm_sub_epi16(s[0], s[15]); -} - -void idct16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - idct16_8col(in0); - idct16_8col(in1); -} - -void iadst16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - iadst16_8col(in0); - iadst16_8col(in1); -} - -void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_6 = 
pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i in[16], l[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_8_0, stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - // First 1-D inverse DCT - // Load input data. - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8 * 2); - in[2] = load_input_data(input + 8 * 4); - in[3] = load_input_data(input + 8 * 6); - - TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); - - // Stage2 - { - const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); - const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); - - tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); - tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); - tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); - tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - - stp2_8 = _mm_packs_epi32(tmp0, tmp2); - stp2_11 = _mm_packs_epi32(tmp5, tmp7); - } - - // Stage3 - { - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero); - - tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); - tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - - stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); - stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); - - stp1_4 = _mm_packs_epi32(tmp0, tmp2); - } - - // Stage4 - { - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); - - tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); - tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); - tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); - tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); - tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); - tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - tmp7 = 
_mm_srai_epi32(tmp7, DCT_CONST_BITS); - - stp1_0 = _mm_packs_epi32(tmp0, tmp0); - stp1_1 = _mm_packs_epi32(tmp2, tmp2); - stp2_9 = _mm_packs_epi32(tmp1, tmp3); - stp2_10 = _mm_packs_epi32(tmp5, tmp7); - - stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); - } - - // Stage5 and Stage6 - { - tmp0 = _mm_add_epi16(stp2_8, stp2_11); - tmp1 = _mm_sub_epi16(stp2_8, stp2_11); - tmp2 = _mm_add_epi16(stp2_9, stp2_10); - tmp3 = _mm_sub_epi16(stp2_9, stp2_10); - - stp1_9 = _mm_unpacklo_epi64(tmp2, zero); - stp1_10 = _mm_unpacklo_epi64(tmp3, zero); - stp1_8 = _mm_unpacklo_epi64(tmp0, zero); - stp1_11 = _mm_unpacklo_epi64(tmp1, zero); - - stp1_13 = _mm_unpackhi_epi64(tmp3, zero); - stp1_14 = _mm_unpackhi_epi64(tmp2, zero); - stp1_12 = _mm_unpackhi_epi64(tmp1, zero); - stp1_15 = _mm_unpackhi_epi64(tmp0, zero); - } - - // Stage6 - { - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); - - tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); - tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); - tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); - tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); - tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); - tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); - - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_6 = _mm_packs_epi32(tmp3, tmp1); - - stp2_10 = _mm_packs_epi32(tmp0, zero); - stp2_13 = _mm_packs_epi32(tmp2, zero); - stp2_11 = _mm_packs_epi32(tmp4, zero); - stp2_12 = _mm_packs_epi32(tmp6, zero); - - tmp0 = _mm_add_epi16(stp1_0, stp1_4); - tmp1 = _mm_sub_epi16(stp1_0, stp1_4); - tmp2 = _mm_add_epi16(stp1_1, stp1_6); - tmp3 = _mm_sub_epi16(stp1_1, stp1_6); - - stp2_0 = _mm_unpackhi_epi64(tmp0, zero); - stp2_1 = _mm_unpacklo_epi64(tmp2, zero); - stp2_2 = _mm_unpackhi_epi64(tmp2, zero); - stp2_3 = _mm_unpacklo_epi64(tmp0, zero); - stp2_4 = _mm_unpacklo_epi64(tmp1, zero); - stp2_5 = _mm_unpackhi_epi64(tmp3, zero); - stp2_6 = _mm_unpacklo_epi64(tmp3, zero); - stp2_7 = _mm_unpackhi_epi64(tmp1, zero); - } - - // Stage7. Left 8x16 only. 
- l[0] = _mm_add_epi16(stp2_0, stp1_15); - l[1] = _mm_add_epi16(stp2_1, stp1_14); - l[2] = _mm_add_epi16(stp2_2, stp2_13); - l[3] = _mm_add_epi16(stp2_3, stp2_12); - l[4] = _mm_add_epi16(stp2_4, stp2_11); - l[5] = _mm_add_epi16(stp2_5, stp2_10); - l[6] = _mm_add_epi16(stp2_6, stp1_9); - l[7] = _mm_add_epi16(stp2_7, stp1_8); - l[8] = _mm_sub_epi16(stp2_7, stp1_8); - l[9] = _mm_sub_epi16(stp2_6, stp1_9); - l[10] = _mm_sub_epi16(stp2_5, stp2_10); - l[11] = _mm_sub_epi16(stp2_4, stp2_11); - l[12] = _mm_sub_epi16(stp2_3, stp2_12); - l[13] = _mm_sub_epi16(stp2_2, stp2_13); - l[14] = _mm_sub_epi16(stp2_1, stp1_14); - l[15] = _mm_sub_epi16(stp2_0, stp1_15); - - // Second 1-D inverse transform, performed per 8x16 block - for (i = 0; i < 2; i++) { - int j; - array_transpose_4X8(l + 8 * i, in); - - IDCT16_10 - - // Stage7 - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); - - for (j = 0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -#define LOAD_DQCOEFF(reg, input) \ - { \ - reg = load_input_data(input); \ - input += 8; \ - } \ - -#define IDCT32_34 \ -/* Stage1 */ \ -{ \ - const __m128i zero = _mm_setzero_si128();\ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ - \ - const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ - \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \ - stg1_1, stp1_16, stp1_31); \ - MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \ - stg1_7, stp1_19, stp1_28); \ - MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \ - stg1_9, stp1_20, stp1_27); \ - MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \ - stg1_15, stp1_23, stp1_24); \ -} \ -\ -/* Stage2 */ \ -{ \ - const __m128i zero = _mm_setzero_si128();\ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ - \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \ - stg2_1, stp2_8, stp2_15); \ - MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \ - stg2_7, stp2_11, stp2_12); \ - \ - stp2_16 = stp1_16; \ - stp2_19 = stp1_19; \ - \ - stp2_20 = stp1_20; \ - stp2_23 = stp1_23; \ - \ - stp2_24 = stp1_24; \ - stp2_27 = stp1_27; \ - \ - stp2_28 = stp1_28; \ - stp2_31 = stp1_31; \ -} \ -\ -/* Stage3 */ \ -{ \ - const __m128i zero = _mm_setzero_si128();\ - const __m128i lo_4_28 = 
_mm_unpacklo_epi16(in[4], zero); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ - \ - MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \ - stg3_1, stp1_4, stp1_7); \ - \ - stp1_8 = stp2_8; \ - stp1_11 = stp2_11; \ - stp1_12 = stp2_12; \ - stp1_15 = stp2_15; \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ - stp1_18, stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ - stp1_22, stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ -} \ -\ -/* Stage4 */ \ -{ \ - const __m128i zero = _mm_setzero_si128();\ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \ - stg4_1, stp2_0, stp2_1); \ - \ - stp2_4 = stp1_4; \ - stp2_5 = stp1_4; \ - stp2_6 = stp1_7; \ - stp2_7 = stp1_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ - stp2_10, stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ -} \ -\ -/* Stage5 */ \ -{ \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = 
_mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = stp2_0; \ - stp1_1 = stp2_1; \ - stp1_2 = stp2_1; \ - stp1_3 = stp2_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ - stp1_19, stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ -} \ -\ -/* Stage6 */ \ -{ \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ - stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ - stp2_13, stp2_11, stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ -} \ -\ -/* Stage7 */ \ -{ \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - 
const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ -} - - -#define IDCT32 \ -/* Stage1 */ \ -{ \ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ - const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ - const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ - \ - const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \ - const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ - const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ - const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ - const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \ - \ - const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ - const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ - stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ - stp1_17, stp1_30) \ - MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ - stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ - stp1_19, stp1_28) \ - MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ - stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ - stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ -} \ -\ -/* Stage2 */ \ -{ \ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ - const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ - const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ - \ - 
const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ - const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ - stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ - stp2_14) \ - MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ - stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ - stp2_11, stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ - stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ - stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ - \ - stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ - stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ - stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ - stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ - \ - stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ - stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ - stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ -} \ -\ -/* Stage3 */ \ -{ \ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ - const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ - const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - \ - MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ - stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ - stp1_6) \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ - stp1_18, stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ - stp1_22, stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ -} \ -\ -/* Stage4 */ \ -{ \ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ - const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ - const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const 
__m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \ - stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \ - stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ - stp2_10, stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ -} \ -\ -/* Stage5 */ \ -{ \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ - 
stp1_19, stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ -} \ -\ -/* Stage6 */ \ -{ \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ - stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ - stp2_13, stp2_11, stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ -} \ -\ -/* Stage7 */ \ -{ \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, 
hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ -} - -// Only upper-left 8x8 has non-zero coeff -void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1<<5); - - // idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[32], col[32]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, - stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, - stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, - stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, - stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - - // Load input data. Only need to load the top left 8x8 block. - in[0] = load_input_data(input); - in[1] = load_input_data(input + 32); - in[2] = load_input_data(input + 64); - in[3] = load_input_data(input + 96); - in[4] = load_input_data(input + 128); - in[5] = load_input_data(input + 160); - in[6] = load_input_data(input + 192); - in[7] = load_input_data(input + 224); - - for (i = 8; i < 32; ++i) { - in[i] = _mm_setzero_si128(); - } - - array_transpose_8x8(in, in); - // TODO(hkuang): Following transposes are unnecessary. But remove them will - // lead to performance drop on some devices. 
- array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); - - IDCT32_34 - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[0] = _mm_add_epi16(stp1_0, stp1_31); - col[1] = _mm_add_epi16(stp1_1, stp1_30); - col[2] = _mm_add_epi16(stp1_2, stp1_29); - col[3] = _mm_add_epi16(stp1_3, stp1_28); - col[4] = _mm_add_epi16(stp1_4, stp1_27); - col[5] = _mm_add_epi16(stp1_5, stp1_26); - col[6] = _mm_add_epi16(stp1_6, stp1_25); - col[7] = _mm_add_epi16(stp1_7, stp1_24); - col[8] = _mm_add_epi16(stp1_8, stp1_23); - col[9] = _mm_add_epi16(stp1_9, stp1_22); - col[10] = _mm_add_epi16(stp1_10, stp1_21); - col[11] = _mm_add_epi16(stp1_11, stp1_20); - col[12] = _mm_add_epi16(stp1_12, stp1_19); - col[13] = _mm_add_epi16(stp1_13, stp1_18); - col[14] = _mm_add_epi16(stp1_14, stp1_17); - col[15] = _mm_add_epi16(stp1_15, stp1_16); - col[16] = _mm_sub_epi16(stp1_15, stp1_16); - col[17] = _mm_sub_epi16(stp1_14, stp1_17); - col[18] = _mm_sub_epi16(stp1_13, stp1_18); - col[19] = _mm_sub_epi16(stp1_12, stp1_19); - col[20] = _mm_sub_epi16(stp1_11, stp1_20); - col[21] = _mm_sub_epi16(stp1_10, stp1_21); - col[22] = _mm_sub_epi16(stp1_9, stp1_22); - col[23] = _mm_sub_epi16(stp1_8, stp1_23); - col[24] = _mm_sub_epi16(stp1_7, stp1_24); - col[25] = _mm_sub_epi16(stp1_6, stp1_25); - col[26] = _mm_sub_epi16(stp1_5, stp1_26); - col[27] = _mm_sub_epi16(stp1_4, stp1_27); - col[28] = _mm_sub_epi16(stp1_3, stp1_28); - col[29] = _mm_sub_epi16(stp1_2, stp1_29); - col[30] = _mm_sub_epi16(stp1_1, stp1_30); - col[31] = _mm_sub_epi16(stp1_0, stp1_31); - for (i = 0; i < 4; i++) { - int j; - const __m128i zero = _mm_setzero_si128(); - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + i * 8, in); - IDCT32_34 - - // 2_D: Calculate the results and store them to destination. 
- in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); - - for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - // idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = 
pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[32], col[128], zero_idx[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, - stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, - stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, - stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, - stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i, j, i32; - - for (i = 0; i < 4; i++) { - i32 = (i << 5); - // First 1-D idct - // Load input data. 
- LOAD_DQCOEFF(in[0], input); - LOAD_DQCOEFF(in[8], input); - LOAD_DQCOEFF(in[16], input); - LOAD_DQCOEFF(in[24], input); - LOAD_DQCOEFF(in[1], input); - LOAD_DQCOEFF(in[9], input); - LOAD_DQCOEFF(in[17], input); - LOAD_DQCOEFF(in[25], input); - LOAD_DQCOEFF(in[2], input); - LOAD_DQCOEFF(in[10], input); - LOAD_DQCOEFF(in[18], input); - LOAD_DQCOEFF(in[26], input); - LOAD_DQCOEFF(in[3], input); - LOAD_DQCOEFF(in[11], input); - LOAD_DQCOEFF(in[19], input); - LOAD_DQCOEFF(in[27], input); - - LOAD_DQCOEFF(in[4], input); - LOAD_DQCOEFF(in[12], input); - LOAD_DQCOEFF(in[20], input); - LOAD_DQCOEFF(in[28], input); - LOAD_DQCOEFF(in[5], input); - LOAD_DQCOEFF(in[13], input); - LOAD_DQCOEFF(in[21], input); - LOAD_DQCOEFF(in[29], input); - LOAD_DQCOEFF(in[6], input); - LOAD_DQCOEFF(in[14], input); - LOAD_DQCOEFF(in[22], input); - LOAD_DQCOEFF(in[30], input); - LOAD_DQCOEFF(in[7], input); - LOAD_DQCOEFF(in[15], input); - LOAD_DQCOEFF(in[23], input); - LOAD_DQCOEFF(in[31], input); - - // checking if all entries are zero - zero_idx[0] = _mm_or_si128(in[0], in[1]); - zero_idx[1] = _mm_or_si128(in[2], in[3]); - zero_idx[2] = _mm_or_si128(in[4], in[5]); - zero_idx[3] = _mm_or_si128(in[6], in[7]); - zero_idx[4] = _mm_or_si128(in[8], in[9]); - zero_idx[5] = _mm_or_si128(in[10], in[11]); - zero_idx[6] = _mm_or_si128(in[12], in[13]); - zero_idx[7] = _mm_or_si128(in[14], in[15]); - zero_idx[8] = _mm_or_si128(in[16], in[17]); - zero_idx[9] = _mm_or_si128(in[18], in[19]); - zero_idx[10] = _mm_or_si128(in[20], in[21]); - zero_idx[11] = _mm_or_si128(in[22], in[23]); - zero_idx[12] = _mm_or_si128(in[24], in[25]); - zero_idx[13] = _mm_or_si128(in[26], in[27]); - zero_idx[14] = _mm_or_si128(in[28], in[29]); - zero_idx[15] = _mm_or_si128(in[30], in[31]); - - zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); - zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); - - zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); - - if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { - col[i32 + 0] = _mm_setzero_si128(); - col[i32 + 1] = _mm_setzero_si128(); - col[i32 + 2] = _mm_setzero_si128(); - col[i32 + 3] = _mm_setzero_si128(); - col[i32 + 4] = _mm_setzero_si128(); - col[i32 + 5] = _mm_setzero_si128(); - col[i32 + 6] = _mm_setzero_si128(); - col[i32 + 7] = _mm_setzero_si128(); - col[i32 + 8] = _mm_setzero_si128(); - col[i32 + 9] = _mm_setzero_si128(); - col[i32 + 10] = _mm_setzero_si128(); - col[i32 + 11] = _mm_setzero_si128(); - col[i32 + 12] = _mm_setzero_si128(); - col[i32 + 13] = _mm_setzero_si128(); - col[i32 + 14] = _mm_setzero_si128(); - col[i32 + 15] = _mm_setzero_si128(); - col[i32 + 16] = _mm_setzero_si128(); - col[i32 + 17] = _mm_setzero_si128(); - col[i32 + 18] = _mm_setzero_si128(); - col[i32 + 19] = _mm_setzero_si128(); - col[i32 + 20] = _mm_setzero_si128(); - col[i32 + 21] = _mm_setzero_si128(); - col[i32 + 22] 
= _mm_setzero_si128(); - col[i32 + 23] = _mm_setzero_si128(); - col[i32 + 24] = _mm_setzero_si128(); - col[i32 + 25] = _mm_setzero_si128(); - col[i32 + 26] = _mm_setzero_si128(); - col[i32 + 27] = _mm_setzero_si128(); - col[i32 + 28] = _mm_setzero_si128(); - col[i32 + 29] = _mm_setzero_si128(); - col[i32 + 30] = _mm_setzero_si128(); - col[i32 + 31] = _mm_setzero_si128(); - continue; - } - - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); - - IDCT32 - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); - col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); - col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); - col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); - col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); - col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); - col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); - col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); - col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); - col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); - col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); - col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); - col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); - col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); - col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); - col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); - col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); - col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); - col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); - col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); - col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); - col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); - col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); - col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); - col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); - col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); - col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); - col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); - col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); - col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); - col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); - col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); - } - for (i = 0; i < 4; i++) { - // Second 1-D idct - j = i << 3; - - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + j, in); - array_transpose_8x8(col + j + 32, in + 8); - array_transpose_8x8(col + j + 64, in + 16); - array_transpose_8x8(col + j + 96, in + 24); - - IDCT32 - - // 2_D: Calculate the results and store them to destination. 
- in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); - - for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a, j; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); - - dc_value = _mm_set1_epi16(a); - - for (j = 0; j < 32; ++j) { - RECON_AND_STORE(dest + 0 + j * stride, dc_value); - RECON_AND_STORE(dest + 8 + j * stride, dc_value); - RECON_AND_STORE(dest + 16 + j * stride, dc_value); - RECON_AND_STORE(dest + 24 + j * stride, dc_value); - } -} - -#if CONFIG_VP9_HIGHBITDEPTH -static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { - __m128i ubounded, retval; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); - ubounded = _mm_cmpgt_epi16(value, max); - retval = _mm_andnot_si128(ubounded, value); - ubounded = _mm_and_si128(ubounded, max); - retval = _mm_or_si128(retval, ubounded); - retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); - return retval; -} - -void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - __m128i inptr[4]; - __m128i sign_bits[2]; - __m128i temp_mm, min_input, max_input; - int test; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - int optimised_cols = 0; - const __m128i zero = _mm_set1_epi16(0); - const __m128i eight = _mm_set1_epi16(8); - const __m128i max = _mm_set1_epi16(12043); - const __m128i min = _mm_set1_epi16(-12043); - // Load input into __m128i - inptr[0] = _mm_loadu_si128((const __m128i *)input); - inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); - inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8)); - inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); - - // Pack to 16 bits - inptr[0] = 
_mm_packs_epi32(inptr[0], inptr[1]); - inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]); - - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (!test) { - // Do the row transform - idct4_sse2(inptr); - - // Check the min & max values - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (test) { - transpose_4x4(inptr); - sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero); - sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero); - inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); - inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); - inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); - inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); - _mm_storeu_si128((__m128i *)outptr, inptr[0]); - _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); - _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); - _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct4_c(input, outptr, bd); - input += 4; - outptr += 4; - } - } - - if (optimised_cols) { - idct4_sse2(inptr); - - // Final round and shift - inptr[0] = _mm_add_epi16(inptr[0], eight); - inptr[1] = _mm_add_epi16(inptr[1], eight); - - inptr[0] = _mm_srai_epi16(inptr[0], 4); - inptr[1] = _mm_srai_epi16(inptr[1], 4); - - // Reconstruction and Store - { - __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); - __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi64( - d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); - d2 = _mm_unpacklo_epi64( - d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); - d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); - d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); - // store input0 - _mm_storel_epi64((__m128i *)dest, d0); - // store input1 - d0 = _mm_srli_si128(d0, 8); - _mm_storel_epi64((__m128i *)(dest + stride), d0); - // store input2 - _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); - // store input3 - d2 = _mm_srli_si128(d2, 8); - _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[4], temp_out[4]; - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = out[j * 4 + i]; - vpx_highbd_idct4_c(temp_in, temp_out, bd); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); - } - } - } -} - -void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8]; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 
bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct8_sse2(inptr); - - // Find the min & max for the column transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - array_transpose_8x8(inptr, inptr); - for (i = 0; i < 8; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 8; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - } - - if (optimised_cols) { - idct8_sse2(inptr); - - // Final round & shift and Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } - } -} - -void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - // only first 4 row has non-zero coefs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], 
inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct8_sse2(inptr); - - // Find the min & max for the column transform - // N.B. Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_4X8(inptr, inptr); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - } - - if (optimised_cols) { - idct8_sse2(inptr); - - // Final round & shift and Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } - } -} - -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16]; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[32]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_set1_epi16(32); - const __m128i max = _mm_set1_epi16(3155); - const __m128i min = _mm_set1_epi16(-3155); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 16; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); - inptr[i + 16] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 32; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - 
min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct16_sse2(inptr, inptr + 16); - - // Find the min & max for the column transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 32; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - array_transpose_16x16(inptr, inptr + 16); - for (i = 0; i < 16; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); - sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); - temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 16; ++i) { - vpx_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; - } - } - - if (optimised_cols) { - idct16_sse2(inptr, inptr + 16); - - // Final round & shift and Reconstruction and Store - { - __m128i d[2]; - for (i = 0; i < 16; i++) { - inptr[i ] = _mm_add_epi16(inptr[i ], rounding); - inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8)); - inptr[i ] = _mm_srai_epi16(inptr[i ], 6); - inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[16], temp_out[16]; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j * 16 + i]; - vpx_highbd_idct16_c(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } - } -} - -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[32]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_set1_epi16(32); - const __m128i max = _mm_set1_epi16(3155); - const __m128i min = _mm_set1_epi16(-3155); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 16; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); - inptr[i] = 
_mm_packs_epi32(temp1, temp2); - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); - inptr[i + 16] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - // Since all non-zero dct coefficients are in upper-left 4x4 area, - // we only need to consider first 4 rows here. - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform (N.B. This transposes inptr) - idct16_sse2(inptr, inptr + 16); - - // Find the min & max for the column transform - // N.B. Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 16; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_8x8(inptr, inptr); - array_transpose_8x8(inptr + 8, inptr + 16); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); - sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); - temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; - } - } - - if (optimised_cols) { - idct16_sse2(inptr, inptr + 16); - - // Final round & shift and Reconstruction and Store - { - __m128i d[2]; - for (i = 0; i < 16; i++) { - inptr[i ] = _mm_add_epi16(inptr[i ], rounding); - inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8)); - inptr[i ] = _mm_srai_epi16(inptr[i ], 6); - inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[16], temp_out[16]; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j * 16 + i]; - vpx_highbd_idct16_c(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } - } -} 
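
The highbitdepth IDCT variants above all share the same guard: the 32-bit tran_low_t coefficients are packed down to 16 bits, the packed values are range-checked against a per-transform bound (12043 for the 4x4, 6201 for the 8x8, 3155 for the 16x16 paths above), and only when every value fits is the fast 16-bit SSE2 transform used; otherwise the code falls back to the scalar vpx_highbd_idct*_c row/column transforms. The sketch below restates that check in isolation; it is not part of libvpx, and the helper name rows_fit_in_16bit_range and its bound parameter are purely illustrative.

#include <emmintrin.h>

/* Illustrative sketch (not libvpx code): the overflow guard used by the
 * vpx_highbd_idct*_add_sse2 functions above. `rows` holds coefficients
 * already packed to 16 bits; `bound` is the per-transform limit (12043,
 * 6201 or 3155 in the code above). Returns 1 when every lane lies in
 * [-bound, bound], i.e. the 16-bit SSE2 path is safe, and 0 when the
 * caller should take the 32-bit C fallback instead. */
static int rows_fit_in_16bit_range(const __m128i *rows, int n, short bound) {
  const __m128i max = _mm_set1_epi16(bound);
  const __m128i min = _mm_set1_epi16((short)-bound);
  __m128i max_input = rows[0];
  __m128i min_input = rows[0];
  int i;
  for (i = 1; i < n; ++i) {
    /* Track the elementwise maximum and minimum over all packed rows. */
    max_input = _mm_max_epi16(max_input, rows[i]);
    min_input = _mm_min_epi16(min_input, rows[i]);
  }
  /* Any lane above `max` or below `min` sets bits in the combined mask. */
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  return _mm_movemask_epi8(_mm_or_si128(max_input, min_input)) == 0;
}

The library code spells the same comparison inline with temp1/test temporaries; the bounds are chosen so that the 16-bit fixed-point butterflies in idct4_sse2/idct8_sse2/idct16_sse2 cannot overflow, which is why the slower but exact C column transform is kept as the fallback whenever a coefficient exceeds the limit.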
-#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.h b/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.h deleted file mode 100644 index bd520c18e5..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.h +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VPX_DSP_X86_INV_TXFM_SSE2_H_ -#define VPX_DSP_X86_INV_TXFM_SSE2_H_ - -#include <emmintrin.h> // SSE2 -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" -#include "vpx_dsp/inv_txfm.h" -#include "vpx_dsp/x86/txfm_common_sse2.h" - -// perform 8x8 transpose -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); -} - -#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - \ - in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ - in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ - } - -static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - - out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); - out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); - out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); - out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); -} - -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - 
array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; -} - -// Function to allow 8 bit optimisations to be used when profile 0 is used with -// highbitdepth enabled -static INLINE __m128i load_input_data(const tran_low_t *data) { -#if CONFIG_VP9_HIGHBITDEPTH - return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5], - data[6], data[7]); -#else - return _mm_load_si128((const __m128i *)data); -#endif -} - -static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) { - in[0] = load_input_data(input + 0 * 16); - in[1] = load_input_data(input + 1 * 16); - in[2] = load_input_data(input + 2 * 16); - in[3] = load_input_data(input + 3 * 16); - in[4] = load_input_data(input + 4 * 16); - in[5] = load_input_data(input + 5 * 16); - in[6] = load_input_data(input + 6 * 16); - in[7] = load_input_data(input + 7 * 16); - - in[8] = load_input_data(input + 8 * 16); - in[9] = load_input_data(input + 9 * 16); - in[10] = load_input_data(input + 10 * 16); - in[11] = load_input_data(input + 11 * 16); - in[12] = load_input_data(input + 12 * 16); - in[13] = load_input_data(input + 13 * 16); - in[14] = load_input_data(input + 14 * 16); - in[15] = load_input_data(input + 15 * 16); -} - -#define RECON_AND_STORE(dest, in_x) \ - { \ - __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - _mm_storel_epi64((__m128i *)(dest), d0); \ - } - -static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1<<5); - const __m128i zero = _mm_setzero_si128(); - // Final rounding and shift - in[0] = _mm_adds_epi16(in[0], final_rounding); - in[1] = _mm_adds_epi16(in[1], final_rounding); - in[2] = _mm_adds_epi16(in[2], final_rounding); - in[3] = _mm_adds_epi16(in[3], final_rounding); - in[4] = _mm_adds_epi16(in[4], final_rounding); - in[5] = _mm_adds_epi16(in[5], final_rounding); - in[6] = _mm_adds_epi16(in[6], final_rounding); - in[7] = _mm_adds_epi16(in[7], final_rounding); - in[8] = _mm_adds_epi16(in[8], final_rounding); - in[9] = _mm_adds_epi16(in[9], final_rounding); - in[10] = _mm_adds_epi16(in[10], final_rounding); - in[11] = _mm_adds_epi16(in[11], final_rounding); - in[12] = _mm_adds_epi16(in[12], final_rounding); - in[13] = _mm_adds_epi16(in[13], final_rounding); - in[14] = _mm_adds_epi16(in[14], final_rounding); - in[15] = _mm_adds_epi16(in[15], final_rounding); - - in[0] = _mm_srai_epi16(in[0], 6); - in[1] = _mm_srai_epi16(in[1], 6); - in[2] = _mm_srai_epi16(in[2], 6); - in[3] = _mm_srai_epi16(in[3], 6); - in[4] = _mm_srai_epi16(in[4], 6); - in[5] = _mm_srai_epi16(in[5], 6); - in[6] = _mm_srai_epi16(in[6], 6); - in[7] = _mm_srai_epi16(in[7], 6); - in[8] = _mm_srai_epi16(in[8], 6); - in[9] = _mm_srai_epi16(in[9], 6); - in[10] = _mm_srai_epi16(in[10], 6); - in[11] = _mm_srai_epi16(in[11], 6); - in[12] = _mm_srai_epi16(in[12], 6); - in[13] = _mm_srai_epi16(in[13], 6); - in[14] = _mm_srai_epi16(in[14], 6); - in[15] = _mm_srai_epi16(in[15], 6); - - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - 
RECON_AND_STORE(dest + 7 * stride, in[7]); - RECON_AND_STORE(dest + 8 * stride, in[8]); - RECON_AND_STORE(dest + 9 * stride, in[9]); - RECON_AND_STORE(dest + 10 * stride, in[10]); - RECON_AND_STORE(dest + 11 * stride, in[11]); - RECON_AND_STORE(dest + 12 * stride, in[12]); - RECON_AND_STORE(dest + 13 * stride, in[13]); - RECON_AND_STORE(dest + 14 * stride, in[14]); - RECON_AND_STORE(dest + 15 * stride, in[15]); -} - -void idct4_sse2(__m128i *in); -void idct8_sse2(__m128i *in); -void idct16_sse2(__m128i *in0, __m128i *in1); -void iadst4_sse2(__m128i *in); -void iadst8_sse2(__m128i *in); -void iadst16_sse2(__m128i *in0, __m128i *in1); - -#endif // VPX_DSP_X86_INV_TXFM_SSE2_H_ diff --git a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm deleted file mode 100644 index 20baf820f6..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm +++ /dev/null @@ -1,1793 +0,0 @@ -; -; Copyright (c) 2014 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -; This file provides SSSE3 version of the inverse transformation. Part -; of the functions are originally derived from the ffmpeg project. -; Note that the current version applies to x86 64-bit only. - -SECTION_RODATA - -pw_11585x2: times 8 dw 23170 - -pw_m2404x2: times 8 dw -2404*2 -pw_m4756x2: times 8 dw -4756*2 -pw_m5520x2: times 8 dw -5520*2 -pw_m8423x2: times 8 dw -8423*2 -pw_m9102x2: times 8 dw -9102*2 -pw_m10394x2: times 8 dw -10394*2 -pw_m11003x2: times 8 dw -11003*2 - -pw_16364x2: times 8 dw 16364*2 -pw_16305x2: times 8 dw 16305*2 -pw_16207x2: times 8 dw 16207*2 -pw_16069x2: times 8 dw 16069*2 -pw_15893x2: times 8 dw 15893*2 -pw_15679x2: times 8 dw 15679*2 -pw_15426x2: times 8 dw 15426*2 -pw_15137x2: times 8 dw 15137*2 -pw_14811x2: times 8 dw 14811*2 -pw_14449x2: times 8 dw 14449*2 -pw_14053x2: times 8 dw 14053*2 -pw_13623x2: times 8 dw 13623*2 -pw_13160x2: times 8 dw 13160*2 -pw_12665x2: times 8 dw 12665*2 -pw_12140x2: times 8 dw 12140*2 -pw__9760x2: times 8 dw 9760*2 -pw__7723x2: times 8 dw 7723*2 -pw__7005x2: times 8 dw 7005*2 -pw__6270x2: times 8 dw 6270*2 -pw__3981x2: times 8 dw 3981*2 -pw__3196x2: times 8 dw 3196*2 -pw__1606x2: times 8 dw 1606*2 -pw___804x2: times 8 dw 804*2 - -pd_8192: times 4 dd 8192 -pw_32: times 8 dw 32 -pw_16: times 8 dw 16 - -%macro TRANSFORM_COEFFS 2 -pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 -pw_m%2_%1: dw -%2, %1, -%2, %1, -%2, %1, -%2, %1 -pw_m%1_m%2: dw -%1, -%2, -%1, -%2, -%1, -%2, -%1, -%2 -%endmacro - -TRANSFORM_COEFFS 6270, 15137 -TRANSFORM_COEFFS 3196, 16069 -TRANSFORM_COEFFS 13623, 9102 - -; constants for 32x32_34 -TRANSFORM_COEFFS 804, 16364 -TRANSFORM_COEFFS 15426, 5520 -TRANSFORM_COEFFS 3981, 15893 -TRANSFORM_COEFFS 16207, 2404 -TRANSFORM_COEFFS 1606, 16305 -TRANSFORM_COEFFS 15679, 4756 -TRANSFORM_COEFFS 11585, 11585 - -; constants for 32x32_1024 -TRANSFORM_COEFFS 12140, 11003 -TRANSFORM_COEFFS 7005, 14811 -TRANSFORM_COEFFS 14053, 8423 -TRANSFORM_COEFFS 9760, 13160 -TRANSFORM_COEFFS 12665, 10394 -TRANSFORM_COEFFS 7723, 14449 - -%macro PAIR_PP_COEFFS 2 -dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2 -%endmacro - -%macro PAIR_MP_COEFFS 2 -dpw_m%1_%2: dw -%1, 
-%1, -%1, -%1, %2, %2, %2, %2 -%endmacro - -%macro PAIR_MM_COEFFS 2 -dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2 -%endmacro - -PAIR_PP_COEFFS 30274, 12540 -PAIR_PP_COEFFS 6392, 32138 -PAIR_MP_COEFFS 18204, 27246 - -PAIR_PP_COEFFS 12540, 12540 -PAIR_PP_COEFFS 30274, 30274 -PAIR_PP_COEFFS 6392, 6392 -PAIR_PP_COEFFS 32138, 32138 -PAIR_MM_COEFFS 18204, 18204 -PAIR_PP_COEFFS 27246, 27246 - -SECTION .text - -%if ARCH_X86_64 -%macro SUM_SUB 3 - psubw m%3, m%1, m%2 - paddw m%1, m%2 - SWAP %2, %3 -%endmacro - -; butterfly operation -%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2 - pmaddwd m%1, m%3, %5 - pmaddwd m%2, m%3, %6 - paddd m%1, %4 - paddd m%2, %4 - psrad m%1, 14 - psrad m%2, 14 -%endmacro - -%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 - punpckhwd m%6, m%2, m%1 - MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_%3_%4] - punpcklwd m%2, m%1 - MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_%3_%4] - packssdw m%1, m%7 - packssdw m%2, m%6 -%endmacro - -%macro BUTTERFLY_4Xmm 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 - punpckhwd m%6, m%2, m%1 - MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_m%3_m%4] - punpcklwd m%2, m%1 - MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_m%3_m%4] - packssdw m%1, m%7 - packssdw m%2, m%6 -%endmacro - -; matrix transpose -%macro INTERLEAVE_2X 4 - punpckh%1 m%4, m%2, m%3 - punpckl%1 m%2, m%3 - SWAP %3, %4 -%endmacro - -%macro TRANSPOSE8X8 9 - INTERLEAVE_2X wd, %1, %2, %9 - INTERLEAVE_2X wd, %3, %4, %9 - INTERLEAVE_2X wd, %5, %6, %9 - INTERLEAVE_2X wd, %7, %8, %9 - - INTERLEAVE_2X dq, %1, %3, %9 - INTERLEAVE_2X dq, %2, %4, %9 - INTERLEAVE_2X dq, %5, %7, %9 - INTERLEAVE_2X dq, %6, %8, %9 - - INTERLEAVE_2X qdq, %1, %5, %9 - INTERLEAVE_2X qdq, %3, %7, %9 - INTERLEAVE_2X qdq, %2, %6, %9 - INTERLEAVE_2X qdq, %4, %8, %9 - - SWAP %2, %5 - SWAP %4, %7 -%endmacro - -%macro IDCT8_1D 0 - SUM_SUB 0, 4, 9 - BUTTERFLY_4X 2, 6, 6270, 15137, m8, 9, 10 - pmulhrsw m0, m12 - pmulhrsw m4, m12 - BUTTERFLY_4X 1, 7, 3196, 16069, m8, 9, 10 - BUTTERFLY_4X 5, 3, 13623, 9102, m8, 9, 10 - - SUM_SUB 1, 5, 9 - SUM_SUB 7, 3, 9 - SUM_SUB 0, 6, 9 - SUM_SUB 4, 2, 9 - SUM_SUB 3, 5, 9 - pmulhrsw m3, m12 - pmulhrsw m5, m12 - - SUM_SUB 0, 7, 9 - SUM_SUB 4, 3, 9 - SUM_SUB 2, 5, 9 - SUM_SUB 6, 1, 9 - - SWAP 3, 6 - SWAP 1, 4 -%endmacro - -; This macro handles 8 pixels per line -%macro ADD_STORE_8P_2X 5; src1, src2, tmp1, tmp2, zero - paddw m%1, m11 - paddw m%2, m11 - psraw m%1, 5 - psraw m%2, 5 - - movh m%3, [outputq] - movh m%4, [outputq + strideq] - punpcklbw m%3, m%5 - punpcklbw m%4, m%5 - paddw m%3, m%1 - paddw m%4, m%2 - packuswb m%3, m%5 - packuswb m%4, m%5 - movh [outputq], m%3 - movh [outputq + strideq], m%4 -%endmacro - -INIT_XMM ssse3 -; full inverse 8x8 2D-DCT transform -cglobal idct8x8_64_add, 3, 5, 13, input, output, stride - mova m8, [pd_8192] - mova m11, [pw_16] - mova m12, [pw_11585x2] - - lea r3, [2 * strideq] -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [inputq + 0] - packssdw m0, [inputq + 16] - mova m1, [inputq + 32] - packssdw m1, [inputq + 48] - mova m2, [inputq + 64] - packssdw m2, [inputq + 80] - mova m3, [inputq + 96] - packssdw m3, [inputq + 112] - mova m4, [inputq + 128] - packssdw m4, [inputq + 144] - mova m5, [inputq + 160] - packssdw m5, [inputq + 176] - mova m6, [inputq + 192] - packssdw m6, [inputq + 208] - mova m7, [inputq + 224] - packssdw m7, [inputq + 240] -%else - mova m0, [inputq + 0] - mova m1, [inputq + 16] - mova m2, [inputq + 32] - mova m3, [inputq + 48] - mova m4, [inputq + 64] - mova m5, [inputq + 80] - mova m6, [inputq + 96] - mova 
m7, [inputq + 112] -%endif - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - IDCT8_1D - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - IDCT8_1D - - pxor m12, m12 - ADD_STORE_8P_2X 0, 1, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 2, 3, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 4, 5, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 6, 7, 9, 10, 12 - - RET - -; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero -cglobal idct8x8_12_add, 3, 5, 13, input, output, stride - mova m8, [pd_8192] - mova m11, [pw_16] - mova m12, [pw_11585x2] - - lea r3, [2 * strideq] - -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [inputq + 0] - packssdw m0, [inputq + 16] - mova m1, [inputq + 32] - packssdw m1, [inputq + 48] - mova m2, [inputq + 64] - packssdw m2, [inputq + 80] - mova m3, [inputq + 96] - packssdw m3, [inputq + 112] -%else - mova m0, [inputq + 0] - mova m1, [inputq + 16] - mova m2, [inputq + 32] - mova m3, [inputq + 48] -%endif - - punpcklwd m0, m1 - punpcklwd m2, m3 - punpckhdq m9, m0, m2 - punpckldq m0, m2 - SWAP 2, 9 - - ; m0 -> [0], [0] - ; m1 -> [1], [1] - ; m2 -> [2], [2] - ; m3 -> [3], [3] - punpckhqdq m10, m0, m0 - punpcklqdq m0, m0 - punpckhqdq m9, m2, m2 - punpcklqdq m2, m2 - SWAP 1, 10 - SWAP 3, 9 - - pmulhrsw m0, m12 - pmulhrsw m2, [dpw_30274_12540] - pmulhrsw m1, [dpw_6392_32138] - pmulhrsw m3, [dpw_m18204_27246] - - SUM_SUB 0, 2, 9 - SUM_SUB 1, 3, 9 - - punpcklqdq m9, m3, m3 - punpckhqdq m5, m3, m9 - - SUM_SUB 3, 5, 9 - punpckhqdq m5, m3 - pmulhrsw m5, m12 - - punpckhqdq m9, m1, m5 - punpcklqdq m1, m5 - SWAP 5, 9 - - SUM_SUB 0, 5, 9 - SUM_SUB 2, 1, 9 - - punpckhqdq m3, m0, m0 - punpckhqdq m4, m1, m1 - punpckhqdq m6, m5, m5 - punpckhqdq m7, m2, m2 - - punpcklwd m0, m3 - punpcklwd m7, m2 - punpcklwd m1, m4 - punpcklwd m6, m5 - - punpckhdq m4, m0, m7 - punpckldq m0, m7 - punpckhdq m10, m1, m6 - punpckldq m5, m1, m6 - - punpckhqdq m1, m0, m5 - punpcklqdq m0, m5 - punpckhqdq m3, m4, m10 - punpcklqdq m2, m4, m10 - - - pmulhrsw m0, m12 - pmulhrsw m6, m2, [dpw_30274_30274] - pmulhrsw m4, m2, [dpw_12540_12540] - - pmulhrsw m7, m1, [dpw_32138_32138] - pmulhrsw m1, [dpw_6392_6392] - pmulhrsw m5, m3, [dpw_m18204_m18204] - pmulhrsw m3, [dpw_27246_27246] - - mova m2, m0 - SUM_SUB 0, 6, 9 - SUM_SUB 2, 4, 9 - SUM_SUB 1, 5, 9 - SUM_SUB 7, 3, 9 - - SUM_SUB 3, 5, 9 - pmulhrsw m3, m12 - pmulhrsw m5, m12 - - SUM_SUB 0, 7, 9 - SUM_SUB 2, 3, 9 - SUM_SUB 4, 5, 9 - SUM_SUB 6, 1, 9 - - SWAP 3, 6 - SWAP 1, 2 - SWAP 2, 4 - - - pxor m12, m12 - ADD_STORE_8P_2X 0, 1, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 2, 3, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 4, 5, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 6, 7, 9, 10, 12 - - RET - -%define idx0 16 * 0 -%define idx1 16 * 1 -%define idx2 16 * 2 -%define idx3 16 * 3 -%define idx4 16 * 4 -%define idx5 16 * 5 -%define idx6 16 * 6 -%define idx7 16 * 7 -%define idx8 16 * 0 -%define idx9 16 * 1 -%define idx10 16 * 2 -%define idx11 16 * 3 -%define idx12 16 * 4 -%define idx13 16 * 5 -%define idx14 16 * 6 -%define idx15 16 * 7 -%define idx16 16 * 0 -%define idx17 16 * 1 -%define idx18 16 * 2 -%define idx19 16 * 3 -%define idx20 16 * 4 -%define idx21 16 * 5 -%define idx22 16 * 6 -%define idx23 16 * 7 -%define idx24 16 * 0 -%define idx25 16 * 1 -%define idx26 16 * 2 -%define idx27 16 * 3 -%define idx28 16 * 4 -%define idx29 16 * 5 -%define idx30 16 * 6 -%define idx31 16 * 7 - -; FROM idct32x32_add_neon.asm -; -; Instead of doing the transforms stage by stage, it is done by loading -; some input values and 
doing as many stages as possible to minimize the -; storing/loading of intermediate results. To fit within registers, the -; final coefficients are cut into four blocks: -; BLOCK A: 16-19,28-31 -; BLOCK B: 20-23,24-27 -; BLOCK C: 8-11,12-15 -; BLOCK D: 0-3,4-7 -; Blocks A and C are straight calculation through the various stages. In -; block B, further calculations are performed using the results from -; block A. In block D, further calculations are performed using the results -; from block C and then the final calculations are done using results from -; block A and B which have been combined at the end of block B. -; - -%macro IDCT32X32_34 4 - ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, m1 - pmulhrsw m1, [pw___804x2] ; stp1_16 - mova [r4 + 0], m0 - pmulhrsw m11, [pw_16364x2] ; stp2_31 - mova [r4 + 16 * 2], m2 - mova m12, m7 - pmulhrsw m7, [pw_15426x2] ; stp1_28 - mova [r4 + 16 * 4], m4 - pmulhrsw m12, [pw_m5520x2] ; stp2_19 - mova [r4 + 16 * 6], m6 - - ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m2, m1 ; stp1_16 - mova m0, m11 ; stp1_31 - mova m4, m7 ; stp1_28 - mova m15, m12 ; stp1_19 - - ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30 - BUTTERFLY_4Xmm 4, 15, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18 - - ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 - SUM_SUB 0, 15, 9 ; stp2_17, stp2_18 - SUM_SUB 11, 7, 9 ; stp2_31, stp2_28 - SUM_SUB 2, 4, 9 ; stp2_30, stp2_29 - - ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 4, 15, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 - BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 - - ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m6, m5 - pmulhrsw m5, [pw__3981x2] ; stp1_20 - mova [stp + %4 + idx28], m12 - mova [stp + %4 + idx29], m15 - pmulhrsw m6, [pw_15893x2] ; stp2_27 - mova [stp + %4 + idx30], m2 - mova m2, m3 - pmulhrsw m3, [pw_m2404x2] ; stp1_23 - mova [stp + %4 + idx31], m11 - pmulhrsw m2, [pw_16207x2] ; stp2_24 - - ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m13, m5 ; stp1_20 - mova m14, m6 ; stp1_27 - mova m15, m3 ; stp1_23 - mova m11, m2 ; stp1_24 - - ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 - BUTTERFLY_4Xmm 11, 15, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 - - ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 - SUM_SUB 15, 14, 9 ; stp2_22, stp2_21 - SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 - SUM_SUB 11, 13, 9 ; stp2_25, stp2_26 - - ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 - BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 - - ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 3, 9 ; stp2_16, stp2_23 - SUM_SUB 0, 15, 9 ; stp2_17, stp2_22 - SUM_SUB 4, 14, 9 ; stp2_18, stp2_21 - SUM_SUB 7, 5, 9 ; stp2_19, stp2_20 - mova [stp + %3 + idx16], m1 - mova [stp + %3 + idx17], m0 - mova [stp + %3 + idx18], m4 - mova [stp + %3 + idx19], m7 - - mova m4, [stp + %4 + idx28] - mova m7, [stp + %4 + idx29] - mova m10, [stp + %4 + idx30] - mova m12, [stp + %4 + idx31] - 
SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 - SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 - SUM_SUB 10, 11, 9 ; stp2_30, stp2_25 - SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 - mova [stp + %4 + idx28], m4 - mova [stp + %4 + idx29], m7 - mova [stp + %4 + idx30], m10 - mova [stp + %4 + idx31], m12 - - ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 6, 5, 9 - pmulhrsw m6, m10 ; stp1_27 - pmulhrsw m5, m10 ; stp1_20 - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_26 - pmulhrsw m14, m10 ; stp1_21 - SUM_SUB 11, 15, 9 - pmulhrsw m11, m10 ; stp1_25 - pmulhrsw m15, m10 ; stp1_22 - SUM_SUB 2, 3, 9 - pmulhrsw m2, m10 ; stp1_24 - pmulhrsw m3, m10 ; stp1_23 -%else - BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27 - SWAP 6, 5 - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26 - SWAP 13, 14 - BUTTERFLY_4X 11, 15, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25 - SWAP 11, 15 - BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24 - SWAP 2, 3 -%endif - - mova [stp + %4 + idx24], m2 - mova [stp + %4 + idx25], m11 - mova [stp + %4 + idx26], m13 - mova [stp + %4 + idx27], m6 - - ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 2] - mova m6, [rsp + transposed_in + 16 * 6] - - mova m1, m0 - pmulhrsw m0, [pw__1606x2] ; stp1_8 - mova [stp + %3 + idx20], m5 - mova [stp + %3 + idx21], m14 - pmulhrsw m1, [pw_16305x2] ; stp2_15 - mova [stp + %3 + idx22], m15 - mova m7, m6 - pmulhrsw m7, [pw_m4756x2] ; stp2_11 - mova [stp + %3 + idx23], m3 - pmulhrsw m6, [pw_15679x2] ; stp1_12 - - ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m3, m0 ; stp1_8 - mova m2, m1 ; stp1_15 - - ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14 - mova m4, m7 ; stp1_11 - mova m5, m6 ; stp1_12 - BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10 - - ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 7, 9 ; stp1_8, stp1_11 - SUM_SUB 2, 4, 9 ; stp1_9, stp1_10 - SUM_SUB 1, 6, 9 ; stp1_15, stp1_12 - SUM_SUB 3, 5, 9 ; stp1_14, stp1_13 - - ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 5, 4, 9 - pmulhrsw m5, m10 ; stp1_13 - pmulhrsw m4, m10 ; stp1_10 - SUM_SUB 6, 7, 9 - pmulhrsw m6, m10 ; stp1_12 - pmulhrsw m7, m10 ; stp1_11 -%else - BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13 - SWAP 5, 4 - BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12 - SWAP 6, 7 -%endif - - ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova [stp + %2 + idx8], m0 - mova [stp + %2 + idx9], m2 - mova [stp + %2 + idx10], m4 - mova [stp + %2 + idx11], m7 - - ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, [rsp + transposed_in + 16 * 4] - mova m12, m11 - pmulhrsw m11, [pw__3196x2] ; stp1_4 - pmulhrsw m12, [pw_16069x2] ; stp1_7 - - ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in 
+ 16 * 0] - mova m10, [pw_11585x2] - pmulhrsw m0, m10 ; stp1_1 - - mova m14, m11 ; stp1_4 - mova m13, m12 ; stp1_7 - - ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_6 - pmulhrsw m14, m10 ; stp1_5 -%else - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6 - SWAP 13, 14 -%endif - mova m7, m0 ; stp1_0 = stp1_1 - mova m4, m0 ; stp1_1 - mova m2, m7 ; stp1_0 - - ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 12, 9 ; stp1_0, stp1_7 - SUM_SUB 7, 13, 9 ; stp1_1, stp1_6 - SUM_SUB 2, 14, 9 ; stp1_2, stp1_5 - SUM_SUB 4, 11, 9 ; stp1_3, stp1_4 - - ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 1, 9 ; stp1_0, stp1_15 - SUM_SUB 7, 3, 9 ; stp1_1, stp1_14 - SUM_SUB 2, 5, 9 ; stp1_2, stp1_13 - SUM_SUB 4, 6, 9 ; stp1_3, stp1_12 - - ; 0-3, 28-31 final stage - mova m15, [stp + %4 + idx30] - mova m10, [stp + %4 + idx31] - SUM_SUB 0, 10, 9 ; stp1_0, stp1_31 - SUM_SUB 7, 15, 9 ; stp1_1, stp1_30 - mova [stp + %1 + idx0], m0 - mova [stp + %1 + idx1], m7 - mova [stp + %4 + idx30], m15 - mova [stp + %4 + idx31], m10 - mova m7, [stp + %4 + idx28] - mova m0, [stp + %4 + idx29] - SUM_SUB 2, 0, 9 ; stp1_2, stp1_29 - SUM_SUB 4, 7, 9 ; stp1_3, stp1_28 - mova [stp + %1 + idx2], m2 - mova [stp + %1 + idx3], m4 - mova [stp + %4 + idx28], m7 - mova [stp + %4 + idx29], m0 - - ; 12-15, 16-19 final stage - mova m0, [stp + %3 + idx16] - mova m7, [stp + %3 + idx17] - mova m2, [stp + %3 + idx18] - mova m4, [stp + %3 + idx19] - SUM_SUB 1, 0, 9 ; stp1_15, stp1_16 - SUM_SUB 3, 7, 9 ; stp1_14, stp1_17 - SUM_SUB 5, 2, 9 ; stp1_13, stp1_18 - SUM_SUB 6, 4, 9 ; stp1_12, stp1_19 - mova [stp + %2 + idx12], m6 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m3 - mova [stp + %2 + idx15], m1 - mova [stp + %3 + idx16], m0 - mova [stp + %3 + idx17], m7 - mova [stp + %3 + idx18], m2 - mova [stp + %3 + idx19], m4 - - mova m4, [stp + %2 + idx8] - mova m5, [stp + %2 + idx9] - mova m6, [stp + %2 + idx10] - mova m7, [stp + %2 + idx11] - SUM_SUB 11, 7, 9 ; stp1_4, stp1_11 - SUM_SUB 14, 6, 9 ; stp1_5, stp1_10 - SUM_SUB 13, 5, 9 ; stp1_6, stp1_9 - SUM_SUB 12, 4, 9 ; stp1_7, stp1_8 - - ; 4-7, 24-27 final stage - mova m0, [stp + %4 + idx27] - mova m1, [stp + %4 + idx26] - mova m2, [stp + %4 + idx25] - mova m3, [stp + %4 + idx24] - SUM_SUB 11, 0, 9 ; stp1_4, stp1_27 - SUM_SUB 14, 1, 9 ; stp1_5, stp1_26 - SUM_SUB 13, 2, 9 ; stp1_6, stp1_25 - SUM_SUB 12, 3, 9 ; stp1_7, stp1_24 - mova [stp + %4 + idx27], m0 - mova [stp + %4 + idx26], m1 - mova [stp + %4 + idx25], m2 - mova [stp + %4 + idx24], m3 - mova [stp + %1 + idx4], m11 - mova [stp + %1 + idx5], m14 - mova [stp + %1 + idx6], m13 - mova [stp + %1 + idx7], m12 - - ; 8-11, 20-23 final stage - mova m0, [stp + %3 + idx20] - mova m1, [stp + %3 + idx21] - mova m2, [stp + %3 + idx22] - mova m3, [stp + %3 + idx23] - SUM_SUB 7, 0, 9 ; stp1_11, stp_20 - SUM_SUB 6, 1, 9 ; stp1_10, stp_21 - SUM_SUB 5, 2, 9 ; stp1_9, stp_22 - SUM_SUB 4, 3, 9 ; stp1_8, stp_23 - mova [stp + %2 + idx8], m4 - mova [stp + %2 + idx9], m5 - mova [stp + %2 + idx10], m6 - mova [stp + %2 + idx11], m7 - mova [stp + %3 + idx20], m0 - mova [stp + %3 + idx21], m1 - mova [stp + %3 + idx22], m2 - mova [stp + %3 + idx23], m3 -%endmacro - -%macro RECON_AND_STORE 1 - mova m11, [pw_32] - lea stp, [rsp + %1] - mov r6, 32 - pxor m8, m8 -%%recon_and_store: - mova m0, [stp + 16 * 32 * 0] - mova m1, [stp + 16 * 32 * 1] 
- mova m2, [stp + 16 * 32 * 2] - mova m3, [stp + 16 * 32 * 3] - add stp, 16 - - paddw m0, m11 - paddw m1, m11 - paddw m2, m11 - paddw m3, m11 - psraw m0, 6 - psraw m1, 6 - psraw m2, 6 - psraw m3, 6 - movh m4, [outputq + 0] - movh m5, [outputq + 8] - movh m6, [outputq + 16] - movh m7, [outputq + 24] - punpcklbw m4, m8 - punpcklbw m5, m8 - punpcklbw m6, m8 - punpcklbw m7, m8 - paddw m0, m4 - paddw m1, m5 - paddw m2, m6 - paddw m3, m7 - packuswb m0, m1 - packuswb m2, m3 - mova [outputq + 0], m0 - mova [outputq + 16], m2 - lea outputq, [outputq + strideq] - dec r6 - jnz %%recon_and_store -%endmacro - -%define i32x32_size 16*32*5 -%define pass_two_start 16*32*0 -%define transposed_in 16*32*4 -%define pass_one_start 16*32*0 -%define stp r8 - -INIT_XMM ssse3 -cglobal idct32x32_34_add, 3, 11, 16, i32x32_size, input, output, stride - mova m8, [pd_8192] - lea stp, [rsp + pass_one_start] - -idct32x32_34: - mov r3, inputq - lea r4, [rsp + transposed_in] - -idct32x32_34_transpose: -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [r3 + 0] - packssdw m0, [r3 + 16] - mova m1, [r3 + 32 * 4] - packssdw m1, [r3 + 32 * 4 + 16] - mova m2, [r3 + 32 * 8] - packssdw m2, [r3 + 32 * 8 + 16] - mova m3, [r3 + 32 * 12] - packssdw m3, [r3 + 32 * 12 + 16] - mova m4, [r3 + 32 * 16] - packssdw m4, [r3 + 32 * 16 + 16] - mova m5, [r3 + 32 * 20] - packssdw m5, [r3 + 32 * 20 + 16] - mova m6, [r3 + 32 * 24] - packssdw m6, [r3 + 32 * 24 + 16] - mova m7, [r3 + 32 * 28] - packssdw m7, [r3 + 32 * 28 + 16] -%else - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 4] - mova m2, [r3 + 16 * 8] - mova m3, [r3 + 16 * 12] - mova m4, [r3 + 16 * 16] - mova m5, [r3 + 16 * 20] - mova m6, [r3 + 16 * 24] - mova m7, [r3 + 16 * 28] -%endif - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - IDCT32X32_34 16*0, 16*32, 16*64, 16*96 - lea stp, [stp + 16 * 8] - mov r6, 4 - lea stp, [rsp + pass_one_start] - lea r9, [rsp + pass_one_start] - -idct32x32_34_2: - lea r4, [rsp + transposed_in] - mov r3, r9 - -idct32x32_34_transpose_2: - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 1] - mova m2, [r3 + 16 * 2] - mova m3, [r3 + 16 * 3] - mova m4, [r3 + 16 * 4] - mova m5, [r3 + 16 * 5] - mova m6, [r3 + 16 * 6] - mova m7, [r3 + 16 * 7] - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - IDCT32X32_34 16*0, 16*8, 16*16, 16*24 - - lea stp, [stp + 16 * 32] - add r9, 16 * 32 - dec r6 - jnz idct32x32_34_2 - - RECON_AND_STORE pass_two_start - - RET - -%macro IDCT32X32_135 4 - ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m1, [rsp + transposed_in + 16 * 1] - mova m11, m1 - pmulhrsw m1, [pw___804x2] ; stp1_16 - pmulhrsw m11, [pw_16364x2] ; stp2_31 - - mova m7, [rsp + transposed_in + 16 * 7] - mova m12, m7 - pmulhrsw m7, [pw_15426x2] ; stp1_28 - pmulhrsw m12, [pw_m5520x2] ; stp2_19 - - mova m3, [rsp + transposed_in + 16 * 9] - mova m4, m3 - pmulhrsw m3, [pw__7005x2] ; stp1_18 - pmulhrsw m4, [pw_14811x2] ; stp2_29 - - mova m0, [rsp + transposed_in + 16 * 15] - mova m2, m0 - pmulhrsw m0, [pw_12140x2] ; stp1_30 - pmulhrsw m2, [pw_m11003x2] ; stp2_17 - - ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 2, 9 ; stp2_16, stp2_17 - SUM_SUB 12, 3, 9 ; stp2_19, stp2_18 - SUM_SUB 7, 4, 9 ; stp2_28, stp2_29 - SUM_SUB 11, 0, 9 ; stp2_31, stp2_30 - - ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30 - BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18 - - ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - 
SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 - SUM_SUB 0, 3, 9 ; stp2_17, stp2_18 - SUM_SUB 11, 7, 9 ; stp2_31, stp2_28 - SUM_SUB 2, 4, 9 ; stp2_30, stp2_29 - - ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 - BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 - - mova [stp + %3 + idx16], m1 - mova [stp + %3 + idx17], m0 - mova [stp + %3 + idx18], m4 - mova [stp + %3 + idx19], m7 - mova [stp + %4 + idx28], m12 - mova [stp + %4 + idx29], m3 - mova [stp + %4 + idx30], m2 - mova [stp + %4 + idx31], m11 - - ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m2, [rsp + transposed_in + 16 * 3] - mova m3, m2 - pmulhrsw m3, [pw_m2404x2] ; stp1_23 - pmulhrsw m2, [pw_16207x2] ; stp2_24 - - mova m5, [rsp + transposed_in + 16 * 5] - mova m6, m5 - pmulhrsw m5, [pw__3981x2] ; stp1_20 - pmulhrsw m6, [pw_15893x2] ; stp2_27 - - mova m14, [rsp + transposed_in + 16 * 11] - mova m13, m14 - pmulhrsw m13, [pw_m8423x2] ; stp1_21 - pmulhrsw m14, [pw_14053x2] ; stp2_26 - - mova m0, [rsp + transposed_in + 16 * 13] - mova m1, m0 - pmulhrsw m0, [pw__9760x2] ; stp1_22 - pmulhrsw m1, [pw_13160x2] ; stp2_25 - - ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 5, 13, 9 ; stp2_20, stp2_21 - SUM_SUB 3, 0, 9 ; stp2_23, stp2_22 - SUM_SUB 2, 1, 9 ; stp2_24, stp2_25 - SUM_SUB 6, 14, 9 ; stp2_27, stp2_26 - - ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 - BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 - - ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 - SUM_SUB 0, 14, 9 ; stp2_22, stp2_21 - SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 - SUM_SUB 1, 13, 9 ; stp2_25, stp2_26 - - ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 - BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 - - ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m4, [stp + %3 + idx16] - mova m7, [stp + %3 + idx17] - mova m11, [stp + %3 + idx18] - mova m12, [stp + %3 + idx19] - SUM_SUB 4, 3, 9 ; stp2_16, stp2_23 - SUM_SUB 7, 0, 9 ; stp2_17, stp2_22 - SUM_SUB 11, 14, 9 ; stp2_18, stp2_21 - SUM_SUB 12, 5, 9 ; stp2_19, stp2_20 - mova [stp + %3 + idx16], m4 - mova [stp + %3 + idx17], m7 - mova [stp + %3 + idx18], m11 - mova [stp + %3 + idx19], m12 - - mova m4, [stp + %4 + idx28] - mova m7, [stp + %4 + idx29] - mova m11, [stp + %4 + idx30] - mova m12, [stp + %4 + idx31] - SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 - SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 - SUM_SUB 11, 1, 9 ; stp2_30, stp2_25 - SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 - mova [stp + %4 + idx28], m4 - mova [stp + %4 + idx29], m7 - mova [stp + %4 + idx30], m11 - mova [stp + %4 + idx31], m12 - - ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 6, 5, 9 - pmulhrsw m6, m10 ; stp1_27 - pmulhrsw m5, m10 ; stp1_20 - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_26 - pmulhrsw m14, m10 ; stp1_21 - SUM_SUB 1, 0, 9 - pmulhrsw m1, m10 ; stp1_25 - pmulhrsw m0, m10 ; stp1_22 - SUM_SUB 2, 3, 9 - pmulhrsw m2, m10 ; stp1_25 - pmulhrsw m3, m10 ; stp1_22 -%else - BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27 - SWAP 6, 5 - 
BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26 - SWAP 13, 14 - BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25 - SWAP 1, 0 - BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24 - SWAP 2, 3 -%endif - mova [stp + %3 + idx20], m5 - mova [stp + %3 + idx21], m14 - mova [stp + %3 + idx22], m0 - mova [stp + %3 + idx23], m3 - mova [stp + %4 + idx24], m2 - mova [stp + %4 + idx25], m1 - mova [stp + %4 + idx26], m13 - mova [stp + %4 + idx27], m6 - - ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 2] - mova m1, m0 - pmulhrsw m0, [pw__1606x2] ; stp1_8 - pmulhrsw m1, [pw_16305x2] ; stp2_15 - - mova m6, [rsp + transposed_in + 16 * 6] - mova m7, m6 - pmulhrsw m7, [pw_m4756x2] ; stp2_11 - pmulhrsw m6, [pw_15679x2] ; stp1_12 - - mova m4, [rsp + transposed_in + 16 * 10] - mova m5, m4 - pmulhrsw m4, [pw__7723x2] ; stp1_10 - pmulhrsw m5, [pw_14449x2] ; stp2_13 - - mova m2, [rsp + transposed_in + 16 * 14] - mova m3, m2 - pmulhrsw m3, [pw_m10394x2] ; stp1_9 - pmulhrsw m2, [pw_12665x2] ; stp2_14 - - ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 3, 9 ; stp1_8, stp1_9 - SUM_SUB 7, 4, 9 ; stp1_11, stp1_10 - SUM_SUB 6, 5, 9 ; stp1_12, stp1_13 - SUM_SUB 1, 2, 9 ; stp1_15, stp1_14 - - ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14 - BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10 - - ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 7, 9 ; stp1_8, stp1_11 - SUM_SUB 2, 4, 9 ; stp1_9, stp1_10 - SUM_SUB 1, 6, 9 ; stp1_15, stp1_12 - SUM_SUB 3, 5, 9 ; stp1_14, stp1_13 - - ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 5, 4, 9 - pmulhrsw m5, m10 ; stp1_13 - pmulhrsw m4, m10 ; stp1_10 - SUM_SUB 6, 7, 9 - pmulhrsw m6, m10 ; stp1_12 - pmulhrsw m7, m10 ; stp1_11 -%else - BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13 - SWAP 5, 4 - BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12 - SWAP 6, 7 -%endif - ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova [stp + %2 + idx8], m0 - mova [stp + %2 + idx9], m2 - mova [stp + %2 + idx10], m4 - mova [stp + %2 + idx11], m7 - mova [stp + %2 + idx12], m6 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m3 - mova [stp + %2 + idx15], m1 - - ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, [rsp + transposed_in + 16 * 4] - mova m12, m11 - pmulhrsw m11, [pw__3196x2] ; stp1_4 - pmulhrsw m12, [pw_16069x2] ; stp1_7 - - mova m13, [rsp + transposed_in + 16 * 12] - mova m14, m13 - pmulhrsw m13, [pw_13623x2] ; stp1_6 - pmulhrsw m14, [pw_m9102x2] ; stp1_5 - - ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 0] - mova m2, [rsp + transposed_in + 16 * 8] - pmulhrsw m0, [pw_11585x2] ; stp1_1 - mova m3, m2 - pmulhrsw m2, [pw__6270x2] ; stp1_2 - pmulhrsw m3, [pw_15137x2] ; stp1_3 - - SUM_SUB 11, 14, 9 ; stp1_4, stp1_5 - SUM_SUB 12, 13, 9 ; stp1_7, stp1_6 - - ; BLOCK D 
STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_6 - pmulhrsw m14, m10 ; stp1_5 -%else - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6 - SWAP 13, 14 -%endif - mova m1, m0 ; stp1_0 = stp1_1 - SUM_SUB 0, 3, 9 ; stp1_0, stp1_3 - SUM_SUB 1, 2, 9 ; stp1_1, stp1_2 - - ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 12, 9 ; stp1_0, stp1_7 - SUM_SUB 1, 13, 9 ; stp1_1, stp1_6 - SUM_SUB 2, 14, 9 ; stp1_2, stp1_5 - SUM_SUB 3, 11, 9 ; stp1_3, stp1_4 - - ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m4, [stp + %2 + idx12] - mova m5, [stp + %2 + idx13] - mova m6, [stp + %2 + idx14] - mova m7, [stp + %2 + idx15] - SUM_SUB 0, 7, 9 ; stp1_0, stp1_15 - SUM_SUB 1, 6, 9 ; stp1_1, stp1_14 - SUM_SUB 2, 5, 9 ; stp1_2, stp1_13 - SUM_SUB 3, 4, 9 ; stp1_3, stp1_12 - - ; 0-3, 28-31 final stage - mova m10, [stp + %4 + idx31] - mova m15, [stp + %4 + idx30] - SUM_SUB 0, 10, 9 ; stp1_0, stp1_31 - SUM_SUB 1, 15, 9 ; stp1_1, stp1_30 - mova [stp + %1 + idx0], m0 - mova [stp + %1 + idx1], m1 - mova [stp + %4 + idx31], m10 - mova [stp + %4 + idx30], m15 - mova m0, [stp + %4 + idx29] - mova m1, [stp + %4 + idx28] - SUM_SUB 2, 0, 9 ; stp1_2, stp1_29 - SUM_SUB 3, 1, 9 ; stp1_3, stp1_28 - mova [stp + %1 + idx2], m2 - mova [stp + %1 + idx3], m3 - mova [stp + %4 + idx29], m0 - mova [stp + %4 + idx28], m1 - - ; 12-15, 16-19 final stage - mova m0, [stp + %3 + idx16] - mova m1, [stp + %3 + idx17] - mova m2, [stp + %3 + idx18] - mova m3, [stp + %3 + idx19] - SUM_SUB 7, 0, 9 ; stp1_15, stp1_16 - SUM_SUB 6, 1, 9 ; stp1_14, stp1_17 - SUM_SUB 5, 2, 9 ; stp1_13, stp1_18 - SUM_SUB 4, 3, 9 ; stp1_12, stp1_19 - mova [stp + %2 + idx12], m4 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m6 - mova [stp + %2 + idx15], m7 - mova [stp + %3 + idx16], m0 - mova [stp + %3 + idx17], m1 - mova [stp + %3 + idx18], m2 - mova [stp + %3 + idx19], m3 - - mova m4, [stp + %2 + idx8] - mova m5, [stp + %2 + idx9] - mova m6, [stp + %2 + idx10] - mova m7, [stp + %2 + idx11] - SUM_SUB 11, 7, 9 ; stp1_4, stp1_11 - SUM_SUB 14, 6, 9 ; stp1_5, stp1_10 - SUM_SUB 13, 5, 9 ; stp1_6, stp1_9 - SUM_SUB 12, 4, 9 ; stp1_7, stp1_8 - - ; 4-7, 24-27 final stage - mova m3, [stp + %4 + idx24] - mova m2, [stp + %4 + idx25] - mova m1, [stp + %4 + idx26] - mova m0, [stp + %4 + idx27] - SUM_SUB 12, 3, 9 ; stp1_7, stp1_24 - SUM_SUB 13, 2, 9 ; stp1_6, stp1_25 - SUM_SUB 14, 1, 9 ; stp1_5, stp1_26 - SUM_SUB 11, 0, 9 ; stp1_4, stp1_27 - mova [stp + %4 + idx24], m3 - mova [stp + %4 + idx25], m2 - mova [stp + %4 + idx26], m1 - mova [stp + %4 + idx27], m0 - mova [stp + %1 + idx4], m11 - mova [stp + %1 + idx5], m14 - mova [stp + %1 + idx6], m13 - mova [stp + %1 + idx7], m12 - - ; 8-11, 20-23 final stage - mova m0, [stp + %3 + idx20] - mova m1, [stp + %3 + idx21] - mova m2, [stp + %3 + idx22] - mova m3, [stp + %3 + idx23] - SUM_SUB 7, 0, 9 ; stp1_11, stp_20 - SUM_SUB 6, 1, 9 ; stp1_10, stp_21 - SUM_SUB 5, 2, 9 ; stp1_9, stp_22 - SUM_SUB 4, 3, 9 ; stp1_8, stp_23 - mova [stp + %2 + idx8], m4 - mova [stp + %2 + idx9], m5 - mova [stp + %2 + idx10], m6 - mova [stp + %2 + idx11], m7 - mova [stp + %3 + idx20], m0 - mova [stp + %3 + idx21], m1 - mova [stp + %3 + idx22], m2 - mova [stp + %3 + idx23], m3 -%endmacro - -INIT_XMM ssse3 -cglobal idct32x32_135_add, 3, 11, 16, i32x32_size, input, output, stride - mova m8, [pd_8192] - mov r6, 2 - lea stp, 
[rsp + pass_one_start] - -idct32x32_135: - mov r3, inputq - lea r4, [rsp + transposed_in] - mov r7, 2 - -idct32x32_135_transpose: -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [r3 + 0] - packssdw m0, [r3 + 16] - mova m1, [r3 + 32 * 4] - packssdw m1, [r3 + 32 * 4 + 16] - mova m2, [r3 + 32 * 8] - packssdw m2, [r3 + 32 * 8 + 16] - mova m3, [r3 + 32 * 12] - packssdw m3, [r3 + 32 * 12 + 16] - mova m4, [r3 + 32 * 16] - packssdw m4, [r3 + 32 * 16 + 16] - mova m5, [r3 + 32 * 20] - packssdw m5, [r3 + 32 * 20 + 16] - mova m6, [r3 + 32 * 24] - packssdw m6, [r3 + 32 * 24 + 16] - mova m7, [r3 + 32 * 28] - packssdw m7, [r3 + 32 * 28 + 16] -%else - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 4] - mova m2, [r3 + 16 * 8] - mova m3, [r3 + 16 * 12] - mova m4, [r3 + 16 * 16] - mova m5, [r3 + 16 * 20] - mova m6, [r3 + 16 * 24] - mova m7, [r3 + 16 * 28] -%endif - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - mova [r4 + 0], m0 - mova [r4 + 16 * 1], m1 - mova [r4 + 16 * 2], m2 - mova [r4 + 16 * 3], m3 - mova [r4 + 16 * 4], m4 - mova [r4 + 16 * 5], m5 - mova [r4 + 16 * 6], m6 - mova [r4 + 16 * 7], m7 - -%if CONFIG_VP9_HIGHBITDEPTH - add r3, 32 -%else - add r3, 16 -%endif - add r4, 16 * 8 - dec r7 - jne idct32x32_135_transpose - - IDCT32X32_135 16*0, 16*32, 16*64, 16*96 - lea stp, [stp + 16 * 8] -%if CONFIG_VP9_HIGHBITDEPTH - lea inputq, [inputq + 32 * 32] -%else - lea inputq, [inputq + 16 * 32] -%endif - dec r6 - jnz idct32x32_135 - - mov r6, 4 - lea stp, [rsp + pass_one_start] - lea r9, [rsp + pass_one_start] - -idct32x32_135_2: - lea r4, [rsp + transposed_in] - mov r3, r9 - mov r7, 2 - -idct32x32_135_transpose_2: - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 1] - mova m2, [r3 + 16 * 2] - mova m3, [r3 + 16 * 3] - mova m4, [r3 + 16 * 4] - mova m5, [r3 + 16 * 5] - mova m6, [r3 + 16 * 6] - mova m7, [r3 + 16 * 7] - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - mova [r4 + 0], m0 - mova [r4 + 16 * 1], m1 - mova [r4 + 16 * 2], m2 - mova [r4 + 16 * 3], m3 - mova [r4 + 16 * 4], m4 - mova [r4 + 16 * 5], m5 - mova [r4 + 16 * 6], m6 - mova [r4 + 16 * 7], m7 - - add r3, 16 * 8 - add r4, 16 * 8 - dec r7 - jne idct32x32_135_transpose_2 - - IDCT32X32_135 16*0, 16*8, 16*16, 16*24 - - lea stp, [stp + 16 * 32] - add r9, 16 * 32 - dec r6 - jnz idct32x32_135_2 - - RECON_AND_STORE pass_two_start - - RET - -%macro IDCT32X32_1024 4 - ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m1, [rsp + transposed_in + 16 * 1] - mova m11, [rsp + transposed_in + 16 * 31] - BUTTERFLY_4X 1, 11, 804, 16364, m8, 9, 10 ; stp1_16, stp1_31 - - mova m0, [rsp + transposed_in + 16 * 15] - mova m2, [rsp + transposed_in + 16 * 17] - BUTTERFLY_4X 2, 0, 12140, 11003, m8, 9, 10 ; stp1_17, stp1_30 - - mova m7, [rsp + transposed_in + 16 * 7] - mova m12, [rsp + transposed_in + 16 * 25] - BUTTERFLY_4X 12, 7, 15426, 5520, m8, 9, 10 ; stp1_19, stp1_28 - - mova m3, [rsp + transposed_in + 16 * 9] - mova m4, [rsp + transposed_in + 16 * 23] - BUTTERFLY_4X 3, 4, 7005, 14811, m8, 9, 10 ; stp1_18, stp1_29 - - ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 2, 9 ; stp2_16, stp2_17 - SUM_SUB 12, 3, 9 ; stp2_19, stp2_18 - SUM_SUB 7, 4, 9 ; stp2_28, stp2_29 - SUM_SUB 11, 0, 9 ; stp2_31, stp2_30 - - ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30 - BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18 - - ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 - 
SUM_SUB 0, 3, 9 ; stp2_17, stp2_18 - SUM_SUB 11, 7, 9 ; stp2_31, stp2_28 - SUM_SUB 2, 4, 9 ; stp2_30, stp2_29 - - ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 - BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 - - mova [stp + %3 + idx16], m1 - mova [stp + %3 + idx17], m0 - mova [stp + %3 + idx18], m4 - mova [stp + %3 + idx19], m7 - mova [stp + %4 + idx28], m12 - mova [stp + %4 + idx29], m3 - mova [stp + %4 + idx30], m2 - mova [stp + %4 + idx31], m11 - - ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m5, [rsp + transposed_in + 16 * 5] - mova m6, [rsp + transposed_in + 16 * 27] - BUTTERFLY_4X 5, 6, 3981, 15893, m8, 9, 10 ; stp1_20, stp1_27 - - mova m13, [rsp + transposed_in + 16 * 21] - mova m14, [rsp + transposed_in + 16 * 11] - BUTTERFLY_4X 13, 14, 14053, 8423, m8, 9, 10 ; stp1_21, stp1_26 - - mova m0, [rsp + transposed_in + 16 * 13] - mova m1, [rsp + transposed_in + 16 * 19] - BUTTERFLY_4X 0, 1, 9760, 13160, m8, 9, 10 ; stp1_22, stp1_25 - - mova m2, [rsp + transposed_in + 16 * 3] - mova m3, [rsp + transposed_in + 16 * 29] - BUTTERFLY_4X 3, 2, 16207, 2404, m8, 9, 10 ; stp1_23, stp1_24 - - ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 5, 13, 9 ; stp2_20, stp2_21 - SUM_SUB 3, 0, 9 ; stp2_23, stp2_22 - SUM_SUB 2, 1, 9 ; stp2_24, stp2_25 - SUM_SUB 6, 14, 9 ; stp2_27, stp2_26 - - ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 - BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 - - ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 - SUM_SUB 0, 14, 9 ; stp2_22, stp2_21 - SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 - SUM_SUB 1, 13, 9 ; stp2_25, stp2_26 - - ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 - BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 - - ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m4, [stp + %3 + idx16] - mova m7, [stp + %3 + idx17] - mova m11, [stp + %3 + idx18] - mova m12, [stp + %3 + idx19] - SUM_SUB 4, 3, 9 ; stp2_16, stp2_23 - SUM_SUB 7, 0, 9 ; stp2_17, stp2_22 - SUM_SUB 11, 14, 9 ; stp2_18, stp2_21 - SUM_SUB 12, 5, 9 ; stp2_19, stp2_20 - mova [stp + %3 + idx16], m4 - mova [stp + %3 + idx17], m7 - mova [stp + %3 + idx18], m11 - mova [stp + %3 + idx19], m12 - - mova m4, [stp + %4 + idx28] - mova m7, [stp + %4 + idx29] - mova m11, [stp + %4 + idx30] - mova m12, [stp + %4 + idx31] - SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 - SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 - SUM_SUB 11, 1, 9 ; stp2_30, stp2_25 - SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 - mova [stp + %4 + idx28], m4 - mova [stp + %4 + idx29], m7 - mova [stp + %4 + idx30], m11 - mova [stp + %4 + idx31], m12 - - ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 6, 5, 9 - pmulhrsw m6, m10 ; stp1_27 - pmulhrsw m5, m10 ; stp1_20 - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_26 - pmulhrsw m14, m10 ; stp1_21 - SUM_SUB 1, 0, 9 - pmulhrsw m1, m10 ; stp1_25 - pmulhrsw m0, m10 ; stp1_22 - SUM_SUB 2, 3, 9 - pmulhrsw m2, m10 ; stp1_25 - pmulhrsw m3, m10 ; stp1_22 -%else - BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, 
stp1_27 - SWAP 6, 5 - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26 - SWAP 13, 14 - BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25 - SWAP 1, 0 - BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24 - SWAP 2, 3 -%endif - mova [stp + %3 + idx20], m5 - mova [stp + %3 + idx21], m14 - mova [stp + %3 + idx22], m0 - mova [stp + %3 + idx23], m3 - mova [stp + %4 + idx24], m2 - mova [stp + %4 + idx25], m1 - mova [stp + %4 + idx26], m13 - mova [stp + %4 + idx27], m6 - - ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 2] - mova m1, [rsp + transposed_in + 16 * 30] - BUTTERFLY_4X 0, 1, 1606, 16305, m8, 9, 10 ; stp1_8, stp1_15 - - mova m2, [rsp + transposed_in + 16 * 14] - mova m3, [rsp + transposed_in + 16 * 18] - BUTTERFLY_4X 3, 2, 12665, 10394, m8, 9, 10 ; stp1_9, stp1_14 - - mova m4, [rsp + transposed_in + 16 * 10] - mova m5, [rsp + transposed_in + 16 * 22] - BUTTERFLY_4X 4, 5, 7723, 14449, m8, 9, 10 ; stp1_10, stp1_13 - - mova m6, [rsp + transposed_in + 16 * 6] - mova m7, [rsp + transposed_in + 16 * 26] - BUTTERFLY_4X 7, 6, 15679, 4756, m8, 9, 10 ; stp1_11, stp1_12 - - ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 3, 9 ; stp1_8, stp1_9 - SUM_SUB 7, 4, 9 ; stp1_11, stp1_10 - SUM_SUB 6, 5, 9 ; stp1_12, stp1_13 - SUM_SUB 1, 2, 9 ; stp1_15, stp1_14 - - ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14 - BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10 - - ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 7, 9 ; stp1_8, stp1_11 - SUM_SUB 2, 4, 9 ; stp1_9, stp1_10 - SUM_SUB 1, 6, 9 ; stp1_15, stp1_12 - SUM_SUB 3, 5, 9 ; stp1_14, stp1_13 - - ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 5, 4, 9 - pmulhrsw m5, m10 ; stp1_13 - pmulhrsw m4, m10 ; stp1_10 - SUM_SUB 6, 7, 9 - pmulhrsw m6, m10 ; stp1_12 - pmulhrsw m7, m10 ; stp1_11 -%else - BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13 - SWAP 5, 4 - BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12 - SWAP 6, 7 -%endif - ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova [stp + %2 + idx8], m0 - mova [stp + %2 + idx9], m2 - mova [stp + %2 + idx10], m4 - mova [stp + %2 + idx11], m7 - mova [stp + %2 + idx12], m6 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m3 - mova [stp + %2 + idx15], m1 - - ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, [rsp + transposed_in + 16 * 4] - mova m12, [rsp + transposed_in + 16 * 28] - BUTTERFLY_4X 11, 12, 3196, 16069, m8, 9, 10 ; stp1_4, stp1_7 - - mova m13, [rsp + transposed_in + 16 * 12] - mova m14, [rsp + transposed_in + 16 * 20] - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_5, stp1_6 - - ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 0] - mova m1, [rsp + transposed_in + 16 * 16] - -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 
0, 1, 9 - pmulhrsw m0, m10 ; stp1_1 - pmulhrsw m1, m10 ; stp1_0 -%else - BUTTERFLY_4X 0, 1, 11585, 11585, m8, 9, 10 ; stp1_1, stp1_0 - SWAP 0, 1 -%endif - mova m2, [rsp + transposed_in + 16 * 8] - mova m3, [rsp + transposed_in + 16 * 24] - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_2, stp1_3 - - mova m10, [pw_11585x2] - SUM_SUB 11, 14, 9 ; stp1_4, stp1_5 - SUM_SUB 12, 13, 9 ; stp1_7, stp1_6 - - ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_6 - pmulhrsw m14, m10 ; stp1_5 -%else - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6 - SWAP 13, 14 -%endif - SUM_SUB 0, 3, 9 ; stp1_0, stp1_3 - SUM_SUB 1, 2, 9 ; stp1_1, stp1_2 - - ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 12, 9 ; stp1_0, stp1_7 - SUM_SUB 1, 13, 9 ; stp1_1, stp1_6 - SUM_SUB 2, 14, 9 ; stp1_2, stp1_5 - SUM_SUB 3, 11, 9 ; stp1_3, stp1_4 - - ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m4, [stp + %2 + idx12] - mova m5, [stp + %2 + idx13] - mova m6, [stp + %2 + idx14] - mova m7, [stp + %2 + idx15] - SUM_SUB 0, 7, 9 ; stp1_0, stp1_15 - SUM_SUB 1, 6, 9 ; stp1_1, stp1_14 - SUM_SUB 2, 5, 9 ; stp1_2, stp1_13 - SUM_SUB 3, 4, 9 ; stp1_3, stp1_12 - - ; 0-3, 28-31 final stage - mova m10, [stp + %4 + idx31] - mova m15, [stp + %4 + idx30] - SUM_SUB 0, 10, 9 ; stp1_0, stp1_31 - SUM_SUB 1, 15, 9 ; stp1_1, stp1_30 - mova [stp + %1 + idx0], m0 - mova [stp + %1 + idx1], m1 - mova [stp + %4 + idx31], m10 - mova [stp + %4 + idx30], m15 - mova m0, [stp + %4 + idx29] - mova m1, [stp + %4 + idx28] - SUM_SUB 2, 0, 9 ; stp1_2, stp1_29 - SUM_SUB 3, 1, 9 ; stp1_3, stp1_28 - mova [stp + %1 + idx2], m2 - mova [stp + %1 + idx3], m3 - mova [stp + %4 + idx29], m0 - mova [stp + %4 + idx28], m1 - - ; 12-15, 16-19 final stage - mova m0, [stp + %3 + idx16] - mova m1, [stp + %3 + idx17] - mova m2, [stp + %3 + idx18] - mova m3, [stp + %3 + idx19] - SUM_SUB 7, 0, 9 ; stp1_15, stp1_16 - SUM_SUB 6, 1, 9 ; stp1_14, stp1_17 - SUM_SUB 5, 2, 9 ; stp1_13, stp1_18 - SUM_SUB 4, 3, 9 ; stp1_12, stp1_19 - mova [stp + %2 + idx12], m4 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m6 - mova [stp + %2 + idx15], m7 - mova [stp + %3 + idx16], m0 - mova [stp + %3 + idx17], m1 - mova [stp + %3 + idx18], m2 - mova [stp + %3 + idx19], m3 - - mova m4, [stp + %2 + idx8] - mova m5, [stp + %2 + idx9] - mova m6, [stp + %2 + idx10] - mova m7, [stp + %2 + idx11] - SUM_SUB 11, 7, 9 ; stp1_4, stp1_11 - SUM_SUB 14, 6, 9 ; stp1_5, stp1_10 - SUM_SUB 13, 5, 9 ; stp1_6, stp1_9 - SUM_SUB 12, 4, 9 ; stp1_7, stp1_8 - - ; 4-7, 24-27 final stage - mova m3, [stp + %4 + idx24] - mova m2, [stp + %4 + idx25] - mova m1, [stp + %4 + idx26] - mova m0, [stp + %4 + idx27] - SUM_SUB 12, 3, 9 ; stp1_7, stp1_24 - SUM_SUB 13, 2, 9 ; stp1_6, stp1_25 - SUM_SUB 14, 1, 9 ; stp1_5, stp1_26 - SUM_SUB 11, 0, 9 ; stp1_4, stp1_27 - mova [stp + %4 + idx24], m3 - mova [stp + %4 + idx25], m2 - mova [stp + %4 + idx26], m1 - mova [stp + %4 + idx27], m0 - mova [stp + %1 + idx4], m11 - mova [stp + %1 + idx5], m14 - mova [stp + %1 + idx6], m13 - mova [stp + %1 + idx7], m12 - - ; 8-11, 20-23 final stage - mova m0, [stp + %3 + idx20] - mova m1, [stp + %3 + idx21] - mova m2, [stp + %3 + idx22] - mova m3, [stp + %3 + idx23] - SUM_SUB 7, 0, 9 ; stp1_11, stp_20 - SUM_SUB 6, 1, 9 ; stp1_10, stp_21 - SUM_SUB 5, 2, 9 ; stp1_9, stp_22 - SUM_SUB 4, 3, 9 ; stp1_8, stp_23 - mova [stp + %2 + idx8], m4 - 
mova [stp + %2 + idx9], m5 - mova [stp + %2 + idx10], m6 - mova [stp + %2 + idx11], m7 - mova [stp + %3 + idx20], m0 - mova [stp + %3 + idx21], m1 - mova [stp + %3 + idx22], m2 - mova [stp + %3 + idx23], m3 -%endmacro - -INIT_XMM ssse3 -cglobal idct32x32_1024_add, 3, 11, 16, i32x32_size, input, output, stride - mova m8, [pd_8192] - mov r6, 4 - lea stp, [rsp + pass_one_start] - -idct32x32_1024: - mov r3, inputq - lea r4, [rsp + transposed_in] - mov r7, 4 - -idct32x32_1024_transpose: -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [r3 + 0] - packssdw m0, [r3 + 16] - mova m1, [r3 + 32 * 4] - packssdw m1, [r3 + 32 * 4 + 16] - mova m2, [r3 + 32 * 8] - packssdw m2, [r3 + 32 * 8 + 16] - mova m3, [r3 + 32 * 12] - packssdw m3, [r3 + 32 * 12 + 16] - mova m4, [r3 + 32 * 16] - packssdw m4, [r3 + 32 * 16 + 16] - mova m5, [r3 + 32 * 20] - packssdw m5, [r3 + 32 * 20 + 16] - mova m6, [r3 + 32 * 24] - packssdw m6, [r3 + 32 * 24 + 16] - mova m7, [r3 + 32 * 28] - packssdw m7, [r3 + 32 * 28 + 16] -%else - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 4] - mova m2, [r3 + 16 * 8] - mova m3, [r3 + 16 * 12] - mova m4, [r3 + 16 * 16] - mova m5, [r3 + 16 * 20] - mova m6, [r3 + 16 * 24] - mova m7, [r3 + 16 * 28] -%endif - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - mova [r4 + 0], m0 - mova [r4 + 16 * 1], m1 - mova [r4 + 16 * 2], m2 - mova [r4 + 16 * 3], m3 - mova [r4 + 16 * 4], m4 - mova [r4 + 16 * 5], m5 - mova [r4 + 16 * 6], m6 - mova [r4 + 16 * 7], m7 -%if CONFIG_VP9_HIGHBITDEPTH - add r3, 32 -%else - add r3, 16 -%endif - add r4, 16 * 8 - dec r7 - jne idct32x32_1024_transpose - - IDCT32X32_1024 16*0, 16*32, 16*64, 16*96 - - lea stp, [stp + 16 * 8] -%if CONFIG_VP9_HIGHBITDEPTH - lea inputq, [inputq + 32 * 32] -%else - lea inputq, [inputq + 16 * 32] -%endif - dec r6 - jnz idct32x32_1024 - - mov r6, 4 - lea stp, [rsp + pass_one_start] - lea r9, [rsp + pass_one_start] - -idct32x32_1024_2: - lea r4, [rsp + transposed_in] - mov r3, r9 - mov r7, 4 - -idct32x32_1024_transpose_2: - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 1] - mova m2, [r3 + 16 * 2] - mova m3, [r3 + 16 * 3] - mova m4, [r3 + 16 * 4] - mova m5, [r3 + 16 * 5] - mova m6, [r3 + 16 * 6] - mova m7, [r3 + 16 * 7] - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - mova [r4 + 0], m0 - mova [r4 + 16 * 1], m1 - mova [r4 + 16 * 2], m2 - mova [r4 + 16 * 3], m3 - mova [r4 + 16 * 4], m4 - mova [r4 + 16 * 5], m5 - mova [r4 + 16 * 6], m6 - mova [r4 + 16 * 7], m7 - - add r3, 16 * 8 - add r4, 16 * 8 - dec r7 - jne idct32x32_1024_transpose_2 - - IDCT32X32_1024 16*0, 16*8, 16*16, 16*24 - - lea stp, [stp + 16 * 32] - add r9, 16 * 32 - dec r6 - jnz idct32x32_1024_2 - - RECON_AND_STORE pass_two_start - - RET -%endif diff --git a/thirdparty/libvpx/vpx_dsp/x86/inv_wht_sse2.asm b/thirdparty/libvpx/vpx_dsp/x86/inv_wht_sse2.asm deleted file mode 100644 index fbbcd76bd7..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/inv_wht_sse2.asm +++ /dev/null @@ -1,109 +0,0 @@ -; -; Copyright (c) 2015 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -%macro REORDER_INPUTS 0 - ; a c d b to a b c d - SWAP 1, 3, 2 -%endmacro - -%macro TRANSFORM_COLS 0 - ; input: - ; m0 a - ; m1 b - ; m2 c - ; m3 d - paddw m0, m2 - psubw m3, m1 - - ; wide subtract - punpcklwd m4, m0 - punpcklwd m5, m3 - psrad m4, 16 - psrad m5, 16 - psubd m4, m5 - psrad m4, 1 - packssdw m4, m4 ; e - - psubw m5, m4, m1 ; b - psubw m4, m2 ; c - psubw m0, m5 - paddw m3, m4 - ; m0 a - SWAP 1, 5 ; m1 b - SWAP 2, 4 ; m2 c - ; m3 d -%endmacro - -%macro TRANSPOSE_4X4 0 - punpcklwd m0, m2 - punpcklwd m1, m3 - mova m2, m0 - punpcklwd m0, m1 - punpckhwd m2, m1 - pshufd m1, m0, 0x0e - pshufd m3, m2, 0x0e -%endmacro - -; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3 -%macro TRANSPOSE_4X4_WIDE 0 - mova m3, m0 - punpcklwd m0, m1 - punpckhwd m3, m1 - mova m2, m0 - punpcklwd m0, m3 - punpckhwd m2, m3 - pshufd m1, m0, 0x0e - pshufd m3, m2, 0x0e -%endmacro - -%macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero - movd m%3, [outputq] - movd m%4, [outputq + strideq] - punpcklbw m%3, m%5 - punpcklbw m%4, m%5 - paddw m%1, m%3 - paddw m%2, m%4 - packuswb m%1, m%5 - packuswb m%2, m%5 - movd [outputq], m%1 - movd [outputq + strideq], m%2 -%endmacro - -INIT_XMM sse2 -cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [inputq + 0] - packssdw m0, [inputq + 16] - mova m1, [inputq + 32] - packssdw m1, [inputq + 48] -%else - mova m0, [inputq + 0] - mova m1, [inputq + 16] -%endif - psraw m0, 2 - psraw m1, 2 - - TRANSPOSE_4X4_WIDE - REORDER_INPUTS - TRANSFORM_COLS - TRANSPOSE_4X4 - REORDER_INPUTS - TRANSFORM_COLS - - pxor m4, m4 - ADD_STORE_4P_2X 0, 1, 5, 6, 4 - lea outputq, [outputq + 2 * strideq] - ADD_STORE_4P_2X 2, 3, 5, 6, 4 - - RET diff --git a/thirdparty/libvpx/vpx_dsp/x86/loopfilter_avx2.c b/thirdparty/libvpx/vpx_dsp/x86/loopfilter_avx2.c deleted file mode 100644 index be1087c1e9..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/loopfilter_avx2.c +++ /dev/null @@ -1,979 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <immintrin.h> /* AVX2 */ - -#include "./vpx_dsp_rtcd.h" -#include "vpx_ports/mem.h" - -void vpx_lpf_horizontal_edge_8_avx2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - __m128i mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi8(1); - __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; - __m128i abs_p1p0; - - const __m128i thresh = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _thresh[0])); - const __m128i limit = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _limit[0])); - const __m128i blimit = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _blimit[0])); - - q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p)); - q4p4 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p))); - q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p)); - q3p3 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p))); - q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p)); - q2p2 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p))); - q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p)); - q1p1 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p))); - p1q1 = _mm_shuffle_epi32(q1p1, 78); - q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p)); - q0p0 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p))); - p0q0 = _mm_shuffle_epi32(q0p0, 78); - - { - __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; - abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), - _mm_subs_epu8(q0p0, q1p1)); - abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - fe = _mm_set1_epi8(0xfe); - ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), - _mm_subs_epu8(p0q0, q0p0)); - abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), - _mm_subs_epu8(p1q1, q1p1)); - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(abs_p1p0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), - _mm_subs_epu8(q1p1, q2p2)), - _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), - _mm_subs_epu8(q2p2, q3p3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i t1 = _mm_set1_epi16(0x1); - __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); - __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); - __m128i qs0 = _mm_xor_si128(p0q0, t80); - __m128i qs1 = _mm_xor_si128(p1q1, t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; - __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; - - filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, qs0ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, 
work_a); - filt = _mm_adds_epi8(filt, work_a); - /* (vpx_filter + 3 * (qs0 - ps0)) & mask */ - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - filter1 = _mm_unpacklo_epi8(zero, filter1); - filter1 = _mm_srai_epi16(filter1, 0xB); - filter2 = _mm_unpacklo_epi8(zero, filter2); - filter2 = _mm_srai_epi16(filter2, 0xB); - - /* Filter1 >> 3 */ - filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); - qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); - - /* filt >> 1 */ - filt = _mm_adds_epi16(filter1, t1); - filt = _mm_srai_epi16(filt, 1); - filt = _mm_andnot_si128( - _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt); - filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); - qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); - // loopfilter done - - { - __m128i work; - flat = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), - _mm_subs_epu8(q0p0, q2p2)), - _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), - _mm_subs_epu8(q0p0, q3p3))); - flat = _mm_max_epu8(abs_p1p0, flat); - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - - q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p)); - q5p5 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q5p5), - (__m64 *) (s + 5 * p))); - - q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p)); - q6p6 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q6p6), - (__m64 *) (s + 6 * p))); - - flat2 = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), - _mm_subs_epu8(q0p0, q4p4)), - _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), - _mm_subs_epu8(q0p0, q5p5))); - - q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p)); - q7p7 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q7p7), - (__m64 *) (s + 7 * p))); - - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), - _mm_subs_epu8(q0p0, q6p6)), - _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), - _mm_subs_epu8(q0p0, q7p7))); - - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - { - const __m128i eight = _mm_set1_epi16(8); - const __m128i four = _mm_set1_epi16(4); - __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; - __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; - __m128i pixelFilter_p, pixelFilter_q; - __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; - __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; - - p7_16 = _mm_unpacklo_epi8(q7p7, zero); - p6_16 = _mm_unpacklo_epi8(q6p6, zero); - p5_16 = _mm_unpacklo_epi8(q5p5, zero); - p4_16 = _mm_unpacklo_epi8(q4p4, zero); - p3_16 = _mm_unpacklo_epi8(q3p3, zero); - p2_16 = _mm_unpacklo_epi8(q2p2, zero); - p1_16 = _mm_unpacklo_epi8(q1p1, zero); - p0_16 = _mm_unpacklo_epi8(q0p0, zero); - q0_16 = _mm_unpackhi_epi8(q0p0, zero); - q1_16 = _mm_unpackhi_epi8(q1p1, zero); - q2_16 = _mm_unpackhi_epi8(q2p2, zero); - q3_16 = _mm_unpackhi_epi8(q3p3, zero); - q4_16 = _mm_unpackhi_epi8(q4p4, zero); - q5_16 = _mm_unpackhi_epi8(q5p5, zero); - q6_16 = _mm_unpackhi_epi8(q6p6, zero); - q7_16 = _mm_unpackhi_epi8(q7p7, zero); - - pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), - _mm_add_epi16(p4_16, p3_16)); - pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), - 
_mm_add_epi16(q4_16, q3_16)); - - pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, - _mm_add_epi16(p2_16, p1_16)); - pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, - _mm_add_epi16(q2_16, q1_16)); - pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - pixelFilter_p = _mm_add_epi16(eight, - _mm_add_epi16(pixelFilter_p, pixelFilter_q)); - pixetFilter_p2p1p0 = _mm_add_epi16(four, - _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), - 4); - flat2_q0p0 = _mm_packus_epi16(res_p, res_q); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(p3_16, p0_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(q3_16, q0_16)), 3); - - flat_q0p0 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(p7_16, p7_16); - sum_q7 = _mm_add_epi16(q7_16, q7_16); - sum_p3 = _mm_add_epi16(p3_16, p3_16); - sum_q3 = _mm_add_epi16(q3_16, q3_16); - - pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), - 4); - flat2_q1p1 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(sum_p3, p1_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, - _mm_add_epi16(sum_q3, q1_16)), 3); - flat_q1p1 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - sum_p3 = _mm_add_epi16(sum_p3, p3_16); - sum_q3 = _mm_add_epi16(sum_q3, q3_16); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), - 4); - flat2_q2p2 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); - - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(sum_p3, p2_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, - _mm_add_epi16(sum_q3, q2_16)), 3); - flat_q2p2 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), - 4); - flat2_q3p3 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, 
_mm_add_epi16(sum_q7, q4_16)), - 4); - flat2_q4p4 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), - 4); - flat2_q5p5 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), - 4); - flat2_q6p6 = _mm_packus_epi16(res_p, res_q); - } - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - flat = _mm_shuffle_epi32(flat, 68); - flat2 = _mm_shuffle_epi32(flat2, 68); - - q2p2 = _mm_andnot_si128(flat, q2p2); - flat_q2p2 = _mm_and_si128(flat, flat_q2p2); - q2p2 = _mm_or_si128(q2p2, flat_q2p2); - - qs1ps1 = _mm_andnot_si128(flat, qs1ps1); - flat_q1p1 = _mm_and_si128(flat, flat_q1p1); - q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); - - qs0ps0 = _mm_andnot_si128(flat, qs0ps0); - flat_q0p0 = _mm_and_si128(flat, flat_q0p0); - q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); - - q6p6 = _mm_andnot_si128(flat2, q6p6); - flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); - q6p6 = _mm_or_si128(q6p6, flat2_q6p6); - _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6); - _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6)); - - q5p5 = _mm_andnot_si128(flat2, q5p5); - flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); - q5p5 = _mm_or_si128(q5p5, flat2_q5p5); - _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5); - _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5)); - - q4p4 = _mm_andnot_si128(flat2, q4p4); - flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); - q4p4 = _mm_or_si128(q4p4, flat2_q4p4); - _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4); - _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4)); - - q3p3 = _mm_andnot_si128(flat2, q3p3); - flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); - q3p3 = _mm_or_si128(q3p3, flat2_q3p3); - _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3); - _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3)); - - q2p2 = _mm_andnot_si128(flat2, q2p2); - flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); - q2p2 = _mm_or_si128(q2p2, flat2_q2p2); - _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2); - _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2)); - - q1p1 = _mm_andnot_si128(flat2, q1p1); - flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); - q1p1 = _mm_or_si128(q1p1, flat2_q1p1); - _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1); - _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1)); - - q0p0 = _mm_andnot_si128(flat2, q0p0); - flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); - q0p0 = _mm_or_si128(q0p0, flat2_q0p0); - _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0); - _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0)); - } -} - -DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { - 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128, - 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 -}; - -void vpx_lpf_horizontal_edge_16_avx2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned 
char *_limit, - const unsigned char *_thresh) { - __m128i mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi8(1); - __m128i p7, p6, p5; - __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; - __m128i q5, q6, q7; - __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, - q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, - p256_0, q256_0; - - const __m128i thresh = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _thresh[0])); - const __m128i limit = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _limit[0])); - const __m128i blimit = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _blimit[0])); - - p256_4 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 5 * p))); - p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 4 * p))); - p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 3 * p))); - p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 2 * p))); - p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 1 * p))); - q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 0 * p))); - q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 1 * p))); - q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 2 * p))); - q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 3 * p))); - q256_4 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 4 * p))); - - p4 = _mm256_castsi256_si128(p256_4); - p3 = _mm256_castsi256_si128(p256_3); - p2 = _mm256_castsi256_si128(p256_2); - p1 = _mm256_castsi256_si128(p256_1); - p0 = _mm256_castsi256_si128(p256_0); - q0 = _mm256_castsi256_si128(q256_0); - q1 = _mm256_castsi256_si128(q256_1); - q2 = _mm256_castsi256_si128(q256_2); - q3 = _mm256_castsi256_si128(q256_3); - q4 = _mm256_castsi256_si128(q256_4); - - { - const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), - _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), - _mm_subs_epu8(q0, q1)); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), - _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), - _mm_subs_epu8(q1, p1)); - __m128i work; - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(flat, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = 
_mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i t7f = _mm_set1_epi8(0x7f); - - __m128i ps1 = _mm_xor_si128(p1, t80); - __m128i ps0 = _mm_xor_si128(p0, t80); - __m128i qs0 = _mm_xor_si128(q0, t80); - __m128i qs1 = _mm_xor_si128(q1, t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1, - flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, - flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, - flat_q2; - - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - /* (vpx_filter + 3 * (qs0 - ps0)) & mask */ - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - /* Filter1 >> 3 */ - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - - /* Filter2 >> 3 */ - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - - /* filt >> 1 */ - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - filt = _mm_andnot_si128(hev, filt); - ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - // loopfilter done - - { - __m128i work; - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)), - _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4))); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - - p256_5 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 6 * p))); - q256_5 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 5 * p))); - p5 = _mm256_castsi256_si128(p256_5); - q5 = _mm256_castsi256_si128(q256_5); - flat2 = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)), - _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5))); - - flat2 = _mm_max_epu8(work, flat2); - p256_6 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 7 * p))); - q256_6 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 6 * p))); - p6 = _mm256_castsi256_si128(p256_6); - q6 = _mm256_castsi256_si128(q256_6); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)), - _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6))); - - flat2 = _mm_max_epu8(work, flat2); - - p256_7 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 8 * p))); - q256_7 = 
_mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 7 * p))); - p7 = _mm256_castsi256_si128(p256_7); - q7 = _mm256_castsi256_si128(q256_7); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)), - _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7))); - - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - { - const __m256i eight = _mm256_set1_epi16(8); - const __m256i four = _mm256_set1_epi16(4); - __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0, - pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, - res_q; - - const __m256i filter = _mm256_load_si256( - (__m256i const *)filt_loopfilter_avx2); - p256_7 = _mm256_shuffle_epi8(p256_7, filter); - p256_6 = _mm256_shuffle_epi8(p256_6, filter); - p256_5 = _mm256_shuffle_epi8(p256_5, filter); - p256_4 = _mm256_shuffle_epi8(p256_4, filter); - p256_3 = _mm256_shuffle_epi8(p256_3, filter); - p256_2 = _mm256_shuffle_epi8(p256_2, filter); - p256_1 = _mm256_shuffle_epi8(p256_1, filter); - p256_0 = _mm256_shuffle_epi8(p256_0, filter); - q256_0 = _mm256_shuffle_epi8(q256_0, filter); - q256_1 = _mm256_shuffle_epi8(q256_1, filter); - q256_2 = _mm256_shuffle_epi8(q256_2, filter); - q256_3 = _mm256_shuffle_epi8(q256_3, filter); - q256_4 = _mm256_shuffle_epi8(q256_4, filter); - q256_5 = _mm256_shuffle_epi8(q256_5, filter); - q256_6 = _mm256_shuffle_epi8(q256_6, filter); - q256_7 = _mm256_shuffle_epi8(q256_7, filter); - - pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5), - _mm256_add_epi16(p256_4, p256_3)); - pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5), - _mm256_add_epi16(q256_4, q256_3)); - - pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0, - _mm256_add_epi16(p256_2, p256_1)); - pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0, - _mm256_add_epi16(q256_2, q256_1)); - pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - - pixelFilter_p = _mm256_add_epi16(eight, - _mm256_add_epi16(pixelFilter_p, pixelFilter_q)); - - pixetFilter_p2p1p0 = _mm256_add_epi16(four, - _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(p256_7, p256_0)), 4); - - flat2_p0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(q256_7, q256_0)), 4); - - flat2_q0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(p256_3, p256_0)), 3); - - flat_p0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(q256_3, q256_0)), 3); - - flat_q0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - sum_p7 = _mm256_add_epi16(p256_7, p256_7); - - sum_q7 = _mm256_add_epi16(q256_7, q256_7); - - sum_p3 = _mm256_add_epi16(p256_3, p256_3); - - sum_q3 = _mm256_add_epi16(q256_3, q256_3); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6); - - pixelFilter_p = 
_mm256_sub_epi16(pixelFilter_p, q256_6); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_1)), 4); - - flat2_p1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_1)), 4); - - flat2_q1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2); - - pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(sum_p3, p256_1)), 3); - - flat_p1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_q2q1q0, - _mm256_add_epi16(sum_q3, q256_1)), 3); - - flat_q1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - sum_p3 = _mm256_add_epi16(sum_p3, p256_3); - - sum_q3 = _mm256_add_epi16(sum_q3, q256_3); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_2)), 4); - - flat2_p2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_2)), 4); - - flat2_q2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1); - - pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(sum_p3, p256_2)), 3); - - flat_p2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_q2q1q0, - _mm256_add_epi16(sum_q3, q256_2)), 3); - - flat_q2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_3)), 4); - - flat2_p3 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_3)), 4); - - flat2_q3 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_4)), 4); - - flat2_p4 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, 
q256_4)), 4); - - flat2_q4 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_5)), 4); - - flat2_p5 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_5)), 4); - - flat2_q5 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_6)), 4); - - flat2_p6 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_6)), 4); - - flat2_q6 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - } - - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - p2 = _mm_andnot_si128(flat, p2); - flat_p2 = _mm_and_si128(flat, flat_p2); - p2 = _mm_or_si128(flat_p2, p2); - - p1 = _mm_andnot_si128(flat, ps1); - flat_p1 = _mm_and_si128(flat, flat_p1); - p1 = _mm_or_si128(flat_p1, p1); - - p0 = _mm_andnot_si128(flat, ps0); - flat_p0 = _mm_and_si128(flat, flat_p0); - p0 = _mm_or_si128(flat_p0, p0); - - q0 = _mm_andnot_si128(flat, qs0); - flat_q0 = _mm_and_si128(flat, flat_q0); - q0 = _mm_or_si128(flat_q0, q0); - - q1 = _mm_andnot_si128(flat, qs1); - flat_q1 = _mm_and_si128(flat, flat_q1); - q1 = _mm_or_si128(flat_q1, q1); - - q2 = _mm_andnot_si128(flat, q2); - flat_q2 = _mm_and_si128(flat, flat_q2); - q2 = _mm_or_si128(flat_q2, q2); - - p6 = _mm_andnot_si128(flat2, p6); - flat2_p6 = _mm_and_si128(flat2, flat2_p6); - p6 = _mm_or_si128(flat2_p6, p6); - _mm_storeu_si128((__m128i *) (s - 7 * p), p6); - - p5 = _mm_andnot_si128(flat2, p5); - flat2_p5 = _mm_and_si128(flat2, flat2_p5); - p5 = _mm_or_si128(flat2_p5, p5); - _mm_storeu_si128((__m128i *) (s - 6 * p), p5); - - p4 = _mm_andnot_si128(flat2, p4); - flat2_p4 = _mm_and_si128(flat2, flat2_p4); - p4 = _mm_or_si128(flat2_p4, p4); - _mm_storeu_si128((__m128i *) (s - 5 * p), p4); - - p3 = _mm_andnot_si128(flat2, p3); - flat2_p3 = _mm_and_si128(flat2, flat2_p3); - p3 = _mm_or_si128(flat2_p3, p3); - _mm_storeu_si128((__m128i *) (s - 4 * p), p3); - - p2 = _mm_andnot_si128(flat2, p2); - flat2_p2 = _mm_and_si128(flat2, flat2_p2); - p2 = _mm_or_si128(flat2_p2, p2); - _mm_storeu_si128((__m128i *) (s - 3 * p), p2); - - p1 = _mm_andnot_si128(flat2, p1); - flat2_p1 = _mm_and_si128(flat2, flat2_p1); - p1 = _mm_or_si128(flat2_p1, p1); - _mm_storeu_si128((__m128i *) (s - 2 * p), p1); - - p0 = _mm_andnot_si128(flat2, p0); - flat2_p0 = _mm_and_si128(flat2, flat2_p0); - p0 = _mm_or_si128(flat2_p0, p0); - _mm_storeu_si128((__m128i *) (s - 1 * p), p0); - - q0 = _mm_andnot_si128(flat2, q0); - flat2_q0 = _mm_and_si128(flat2, flat2_q0); - q0 = _mm_or_si128(flat2_q0, q0); - _mm_storeu_si128((__m128i *) (s - 0 * p), q0); - - q1 = _mm_andnot_si128(flat2, q1); - 
flat2_q1 = _mm_and_si128(flat2, flat2_q1); - q1 = _mm_or_si128(flat2_q1, q1); - _mm_storeu_si128((__m128i *) (s + 1 * p), q1); - - q2 = _mm_andnot_si128(flat2, q2); - flat2_q2 = _mm_and_si128(flat2, flat2_q2); - q2 = _mm_or_si128(flat2_q2, q2); - _mm_storeu_si128((__m128i *) (s + 2 * p), q2); - - q3 = _mm_andnot_si128(flat2, q3); - flat2_q3 = _mm_and_si128(flat2, flat2_q3); - q3 = _mm_or_si128(flat2_q3, q3); - _mm_storeu_si128((__m128i *) (s + 3 * p), q3); - - q4 = _mm_andnot_si128(flat2, q4); - flat2_q4 = _mm_and_si128(flat2, flat2_q4); - q4 = _mm_or_si128(flat2_q4, q4); - _mm_storeu_si128((__m128i *) (s + 4 * p), q4); - - q5 = _mm_andnot_si128(flat2, q5); - flat2_q5 = _mm_and_si128(flat2, flat2_q5); - q5 = _mm_or_si128(flat2_q5, q5); - _mm_storeu_si128((__m128i *) (s + 5 * p), q5); - - q6 = _mm_andnot_si128(flat2, q6); - flat2_q6 = _mm_and_si128(flat2, flat2_q6); - q6 = _mm_or_si128(flat2_q6, q6); - _mm_storeu_si128((__m128i *) (s + 6 * p), q6); - } -} diff --git a/thirdparty/libvpx/vpx_dsp/x86/loopfilter_sse2.c b/thirdparty/libvpx/vpx_dsp/x86/loopfilter_sse2.c deleted file mode 100644 index 739adf31d0..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/loopfilter_sse2.c +++ /dev/null @@ -1,1776 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <emmintrin.h> // SSE2 - -#include "./vpx_dsp_rtcd.h" -#include "vpx_ports/mem.h" -#include "vpx_ports/emmintrin_compat.h" - -static INLINE __m128i abs_diff(__m128i a, __m128i b) { - return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); -} - -// filter_mask and hev_mask -#define FILTER_HEV_MASK do { \ - /* (abs(q1 - q0), abs(p1 - p0) */ \ - __m128i flat = abs_diff(q1p1, q0p0); \ - /* abs(p1 - q1), abs(p0 - q0) */ \ - const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \ - __m128i abs_p0q0, abs_p1q1, work; \ - \ - /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \ - hev = _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \ - hev = _mm_cmpgt_epi16(hev, thresh); \ - hev = _mm_packs_epi16(hev, hev); \ - \ - /* const int8_t mask = filter_mask(*limit, *blimit, */ \ - /* p3, p2, p1, p0, q0, q1, q2, q3); */ \ - abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */\ - abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */\ - abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \ - abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \ - /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \ - mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \ - /* abs(p3 - p2), abs(p2 - p1) */ \ - work = abs_diff(p3p2, p2p1); \ - flat = _mm_max_epu8(work, flat); \ - /* abs(q3 - q2), abs(q2 - q1) */ \ - work = abs_diff(q3q2, q2q1); \ - flat = _mm_max_epu8(work, flat); \ - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \ - mask = _mm_unpacklo_epi64(mask, flat); \ - mask = _mm_subs_epu8(mask, limit); \ - mask = _mm_cmpeq_epi8(mask, zero); \ - mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \ -} while (0) - -#define FILTER4 do { \ - const __m128i t3t4 = _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, \ - 4, 4, 4, 4, 4, 4, 4, 4); \ - const __m128i t80 = _mm_set1_epi8(0x80); \ - __m128i filter, filter2filter1, work; \ - \ - ps1ps0 
= _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \ - qs1qs0 = _mm_xor_si128(q1q0, t80); \ - \ - /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ \ - work = _mm_subs_epi8(ps1ps0, qs1qs0); \ - filter = _mm_and_si128(_mm_srli_si128(work, 8), hev); \ - /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ \ - filter = _mm_subs_epi8(filter, work); \ - filter = _mm_subs_epi8(filter, work); \ - filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ \ - filter = _mm_and_si128(filter, mask); /* & mask */ \ - filter = _mm_unpacklo_epi64(filter, filter); \ - \ - /* filter1 = signed_char_clamp(filter + 4) >> 3; */ \ - /* filter2 = signed_char_clamp(filter + 3) >> 3; */ \ - filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ \ - filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); \ - filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); \ - filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ \ - filter = _mm_srai_epi16(filter, 11); /* >> 3 */ \ - filter2filter1 = _mm_packs_epi16(filter2filter1, filter); \ - \ - /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ \ - filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ \ - filter = _mm_unpacklo_epi8(filter, filter); \ - filter = _mm_srai_epi16(filter, 9); /* round */ \ - filter = _mm_packs_epi16(filter, filter); \ - filter = _mm_andnot_si128(hev, filter); \ - \ - hev = _mm_unpackhi_epi64(filter2filter1, filter); \ - filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); \ - \ - /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \ - qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1); \ - /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \ - ps1ps0 = _mm_adds_epi8(ps1ps0, hev); \ - qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */ \ - ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \ -} while (0) - -void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, - const uint8_t *_blimit, const uint8_t *_limit, - const uint8_t *_thresh) { - const __m128i zero = _mm_set1_epi16(0); - const __m128i limit = - _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), - _mm_loadl_epi64((const __m128i *)_limit)); - const __m128i thresh = - _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); - const __m128i ff = _mm_cmpeq_epi8(zero, zero); - __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0; - __m128i mask, hev; - - p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), - _mm_loadl_epi64((__m128i *)(s - 4 * p))); - q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 1 * p))); - q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), - _mm_loadl_epi64((__m128i *)(s + 0 * p))); - q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 3 * p))); - p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); - p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); - q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); - q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2); - - FILTER_HEV_MASK; - FILTER4; - - _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1 - _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0 - _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0 - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1 -} - -void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, - const uint8_t *_blimit, const uint8_t *_limit, - const uint8_t *_thresh) { - const __m128i zero = 
_mm_set1_epi16(0); - const __m128i limit = - _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), - _mm_loadl_epi64((const __m128i *)_limit)); - const __m128i thresh = - _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); - const __m128i ff = _mm_cmpeq_epi8(zero, zero); - __m128i x0, x1, x2, x3; - __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0; - __m128i mask, hev; - - // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 1 * p - 4))); - - // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 3 * p - 4))); - - // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 5 * p - 4))); - - // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 7 * p - 4))); - - // Transpose 8x8 - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - p1p0 = _mm_unpacklo_epi16(q1q0, x1); - // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - x0 = _mm_unpacklo_epi16(x2, x3); - // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - p3p2 = _mm_unpacklo_epi32(p1p0, x0); - // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - p1p0 = _mm_unpackhi_epi32(p1p0, x0); - p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8)); // swap lo and high - p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8)); // swap lo and high - - // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - q1q0 = _mm_unpackhi_epi16(q1q0, x1); - // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 - x2 = _mm_unpackhi_epi16(x2, x3); - // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 - q3q2 = _mm_unpackhi_epi32(q1q0, x2); - // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 - q1q0 = _mm_unpacklo_epi32(q1q0, x2); - - q0p0 = _mm_unpacklo_epi64(p1p0, q1q0); - q1p1 = _mm_unpackhi_epi64(p1p0, q1q0); - p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); - p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); - q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2); - - FILTER_HEV_MASK; - FILTER4; - - // Transpose 8x4 to 4x8 - // qs1qs0: 20 21 22 23 24 25 26 27 30 31 32 33 34 34 36 37 - // ps1ps0: 10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07 - // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 - ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8)); - // 10 30 11 31 12 32 13 33 14 34 15 35 16 36 17 37 - x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0); - // 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27 - ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0); - // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0); - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0); - - *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0); - ps1ps0 = _mm_srli_si128(ps1ps0, 4); - *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0); - ps1ps0 = _mm_srli_si128(ps1ps0, 4); - *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0); - ps1ps0 = _mm_srli_si128(ps1ps0, 4); - *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0); - - *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0); - qs1qs0 = _mm_srli_si128(qs1qs0, 4); - *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0); - qs1qs0 = _mm_srli_si128(qs1qs0, 4); - *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0); - qs1qs0 = _mm_srli_si128(qs1qs0, 4); - 
*(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0); -} - -void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi8(1); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); - __m128i mask, hev, flat, flat2; - __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; - __m128i abs_p1p0; - - q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); - q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4), - (__m64 *)(s + 4 * p))); - q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); - q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3), - (__m64 *)(s + 3 * p))); - q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); - q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2), - (__m64 *)(s + 2 * p))); - q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); - q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1), - (__m64 *)(s + 1 * p))); - p1q1 = _mm_shuffle_epi32(q1p1, 78); - q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); - q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0), - (__m64 *)(s - 0 * p))); - p0q0 = _mm_shuffle_epi32(q0p0, 78); - - { - __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; - abs_p1p0 = abs_diff(q1p1, q0p0); - abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - fe = _mm_set1_epi8(0xfe); - ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - abs_p0q0 = abs_diff(q0p0, p0q0); - abs_p1q1 = abs_diff(q1p1, p1q1); - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(abs_p1p0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - - work = _mm_max_epu8(abs_diff(q2p2, q1p1), - abs_diff(q3p3, q2p2)); - mask = _mm_max_epu8(work, mask); - mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i t1 = _mm_set1_epi16(0x1); - __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); - __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); - __m128i qs0 = _mm_xor_si128(p0q0, t80); - __m128i qs1 = _mm_xor_si128(p1q1, t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; - __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; - - filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, qs0ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (vpx_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - filter1 = _mm_unpacklo_epi8(zero, filter1); - filter1 = _mm_srai_epi16(filter1, 0xB); - filter2 = _mm_unpacklo_epi8(zero, filter2); - filter2 = 
_mm_srai_epi16(filter2, 0xB); - - // Filter1 >> 3 - filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); - qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); - - // filt >> 1 - filt = _mm_adds_epi16(filter1, t1); - filt = _mm_srai_epi16(filt, 1); - filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), - filt); - filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); - qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); - // loopfilter done - - { - __m128i work; - flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); - flat = _mm_max_epu8(abs_p1p0, flat); - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - - q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); - q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5), - (__m64 *)(s + 5 * p))); - - q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); - q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6), - (__m64 *)(s + 6 * p))); - flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0)); - - q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); - q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7), - (__m64 *)(s + 7 * p))); - work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0)); - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - { - const __m128i eight = _mm_set1_epi16(8); - const __m128i four = _mm_set1_epi16(4); - __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; - __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; - __m128i pixelFilter_p, pixelFilter_q; - __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; - __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; - - p7_16 = _mm_unpacklo_epi8(q7p7, zero);; - p6_16 = _mm_unpacklo_epi8(q6p6, zero); - p5_16 = _mm_unpacklo_epi8(q5p5, zero); - p4_16 = _mm_unpacklo_epi8(q4p4, zero); - p3_16 = _mm_unpacklo_epi8(q3p3, zero); - p2_16 = _mm_unpacklo_epi8(q2p2, zero); - p1_16 = _mm_unpacklo_epi8(q1p1, zero); - p0_16 = _mm_unpacklo_epi8(q0p0, zero); - q0_16 = _mm_unpackhi_epi8(q0p0, zero); - q1_16 = _mm_unpackhi_epi8(q1p1, zero); - q2_16 = _mm_unpackhi_epi8(q2p2, zero); - q3_16 = _mm_unpackhi_epi8(q3p3, zero); - q4_16 = _mm_unpackhi_epi8(q4p4, zero); - q5_16 = _mm_unpackhi_epi8(q5p5, zero); - q6_16 = _mm_unpackhi_epi8(q6p6, zero); - q7_16 = _mm_unpackhi_epi8(q7p7, zero); - - pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), - _mm_add_epi16(p4_16, p3_16)); - pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), - _mm_add_epi16(q4_16, q3_16)); - - pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); - pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); - pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, - pixelFilter_q)); - pixetFilter_p2p1p0 = _mm_add_epi16(four, - _mm_add_epi16(pixetFilter_p2p1p0, - pixetFilter_q2q1q0)); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(p7_16, p0_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(q7_16, q0_16)), 4); - flat2_q0p0 
= _mm_packus_epi16(res_p, res_q); - res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(p3_16, p0_16)), 3); - res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(q3_16, q0_16)), 3); - - flat_q0p0 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(p7_16, p7_16); - sum_q7 = _mm_add_epi16(q7_16, q7_16); - sum_p3 = _mm_add_epi16(p3_16, p3_16); - sum_q3 = _mm_add_epi16(q3_16, q3_16); - - pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p1_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q1_16)), 4); - flat2_q1p1 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(sum_p3, p1_16)), 3); - res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, - _mm_add_epi16(sum_q3, q1_16)), 3); - flat_q1p1 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - sum_p3 = _mm_add_epi16(sum_p3, p3_16); - sum_q3 = _mm_add_epi16(sum_q3, q3_16); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p2_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q2_16)), 4); - flat2_q2p2 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); - - res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(sum_p3, p2_16)), 3); - res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, - _mm_add_epi16(sum_q3, q2_16)), 3); - flat_q2p2 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p3_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q3_16)), 4); - flat2_q3p3 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p4_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q4_16)), 4); - flat2_q4p4 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p5_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q5_16)), 4); - flat2_q5p5 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - 
_mm_add_epi16(sum_p7, p6_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q6_16)), 4); - flat2_q6p6 = _mm_packus_epi16(res_p, res_q); - } - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - flat = _mm_shuffle_epi32(flat, 68); - flat2 = _mm_shuffle_epi32(flat2, 68); - - q2p2 = _mm_andnot_si128(flat, q2p2); - flat_q2p2 = _mm_and_si128(flat, flat_q2p2); - q2p2 = _mm_or_si128(q2p2, flat_q2p2); - - qs1ps1 = _mm_andnot_si128(flat, qs1ps1); - flat_q1p1 = _mm_and_si128(flat, flat_q1p1); - q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); - - qs0ps0 = _mm_andnot_si128(flat, qs0ps0); - flat_q0p0 = _mm_and_si128(flat, flat_q0p0); - q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); - - q6p6 = _mm_andnot_si128(flat2, q6p6); - flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); - q6p6 = _mm_or_si128(q6p6, flat2_q6p6); - _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); - _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); - - q5p5 = _mm_andnot_si128(flat2, q5p5); - flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); - q5p5 = _mm_or_si128(q5p5, flat2_q5p5); - _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); - _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); - - q4p4 = _mm_andnot_si128(flat2, q4p4); - flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); - q4p4 = _mm_or_si128(q4p4, flat2_q4p4); - _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); - _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); - - q3p3 = _mm_andnot_si128(flat2, q3p3); - flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); - q3p3 = _mm_or_si128(q3p3, flat2_q3p3); - _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); - _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); - - q2p2 = _mm_andnot_si128(flat2, q2p2); - flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); - q2p2 = _mm_or_si128(q2p2, flat2_q2p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); - _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); - - q1p1 = _mm_andnot_si128(flat2, q1p1); - flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); - q1p1 = _mm_or_si128(q1p1, flat2_q1p1); - _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); - - q0p0 = _mm_andnot_si128(flat2, q0p0); - flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); - q0p0 = _mm_or_si128(q0p0, flat2_q0p0); - _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); - _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); - } -} - -static INLINE __m128i filter_add2_sub2(const __m128i *const total, - const __m128i *const a1, - const __m128i *const a2, - const __m128i *const s1, - const __m128i *const s2) { - __m128i x = _mm_add_epi16(*a1, *total); - x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2); - return x; -} - -static INLINE __m128i filter8_mask(const __m128i *const flat, - const __m128i *const other_filt, - const __m128i *const f8_lo, - const __m128i *const f8_hi) { - const __m128i f8 = _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), - _mm_srli_epi16(*f8_hi, 3)); - const __m128i result = _mm_and_si128(*flat, f8); - return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); -} - -static INLINE __m128i filter16_mask(const __m128i *const flat, - const __m128i *const other_filt, - const __m128i *const f_lo, - const __m128i *const f_hi) { - const __m128i f = _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), - _mm_srli_epi16(*f_hi, 4)); - const __m128i result = _mm_and_si128(*flat, f); - return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); -} - -void vpx_lpf_horizontal_edge_16_sse2(unsigned 
char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi8(1); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); - __m128i mask, hev, flat, flat2; - __m128i p7, p6, p5; - __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; - __m128i q5, q6, q7; - - __m128i op2, op1, op0, oq0, oq1, oq2; - - __m128i max_abs_p1p0q1q0; - - p7 = _mm_loadu_si128((__m128i *)(s - 8 * p)); - p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); - p5 = _mm_loadu_si128((__m128i *)(s - 6 * p)); - p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); - q5 = _mm_loadu_si128((__m128i *)(s + 5 * p)); - q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); - q7 = _mm_loadu_si128((__m128i *)(s + 7 * p)); - - { - const __m128i abs_p1p0 = abs_diff(p1, p0); - const __m128i abs_q1q0 = abs_diff(q1, q0); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(zero, zero); - __m128i abs_p0q0 = abs_diff(p0, q0); - __m128i abs_p1q1 = abs_diff(p1, q1); - __m128i work; - max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0); - - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(max_abs_p1p0q1q0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2)); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2)); - mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - - { - __m128i work; - work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0)); - flat = _mm_max_epu8(work, max_abs_p1p0q1q0); - work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0)); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0)); - flat2 = _mm_max_epu8(work, flat2); - work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0)); - flat2 = _mm_max_epu8(work, flat2); - work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0)); - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // filter4 - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i 
t7f = _mm_set1_epi8(0x7f); - const __m128i ff = _mm_cmpeq_epi8(t4, t4); - - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - - op1 = _mm_xor_si128(p1, t80); - op0 = _mm_xor_si128(p0, t80); - oq0 = _mm_xor_si128(q0, t80); - oq1 = _mm_xor_si128(q1, t80); - - hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev); - - work_a = _mm_subs_epi8(oq0, op0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (vpx_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - // Filter1 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80); - - // Filter2 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80); - - // filt >> 1 - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - filt = _mm_andnot_si128(hev, filt); - op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80); - oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80); - // loopfilter done - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // filter8 - { - const __m128i four = _mm_set1_epi16(4); - const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); - const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); - const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); - const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); - const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); - const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); - const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); - const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); - - const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); - const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); - const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); - const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); - const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); - const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); - const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); - const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); - __m128i f8_lo, f8_hi; - - f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four), - _mm_add_epi16(p3_lo, p2_lo)); - f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo), - _mm_add_epi16(p2_lo, p1_lo)); - f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo); - - f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four), - _mm_add_epi16(p3_hi, p2_hi)); - f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi), - _mm_add_epi16(p2_hi, p1_hi)); - f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi); - - op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi); - op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi); - op0 = 
filter8_mask(&flat, &op0, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi); - oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi); - oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi); - oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi); - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // wide flat calculations - { - const __m128i eight = _mm_set1_epi16(8); - const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero); - const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero); - const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero); - const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero); - const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); - const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); - const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); - const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); - const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); - const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); - const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); - const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); - const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero); - const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero); - const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero); - const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero); - - const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero); - const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero); - const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero); - const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero); - const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); - const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); - const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); - const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); - const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); - const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); - const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); - const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); - const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero); - const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero); - const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero); - const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero); - - __m128i f_lo; - __m128i f_hi; - - f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7 - f_lo = _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), - _mm_add_epi16(p4_lo, f_lo)); - f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo), - _mm_add_epi16(p2_lo, p1_lo)); - f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo); - f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo); - - f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7 - f_hi = _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), - _mm_add_epi16(p4_hi, f_hi)); - f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi), - _mm_add_epi16(p2_hi, p1_hi)); - f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi); - f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi); - - p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 7 * p), p6); - - f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi); - p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 6 * p), p5); - - f_lo = filter_add2_sub2(&f_lo, &q2_lo, 
&p4_lo, &p5_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi); - p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 5 * p), p4); - - f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi); - p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 4 * p), p3); - - f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi); - op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 3 * p), op2); - - f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi); - op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 2 * p), op1); - - f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi); - op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 1 * p), op0); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi); - oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi); - oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi); - oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 2 * p), oq2); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi); - q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 3 * p), q3); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi); - q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 4 * p), q4); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi); - q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 5 * p), q5); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi); - q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 6 * p), q6); - } - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - } -} - -void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); - __m128i mask, 
hev, flat; - __m128i p3, p2, p1, p0, q0, q1, q2, q3; - __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0; - - q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), - _mm_loadl_epi64((__m128i *)(s + 3 * p))); - q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), - _mm_loadl_epi64((__m128i *)(s + 2 * p))); - q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 1 * p))); - q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), - _mm_loadl_epi64((__m128i *)(s - 0 * p))); - p1q1 = _mm_shuffle_epi32(q1p1, 78); - p0q0 = _mm_shuffle_epi32(q0p0, 78); - - { - // filter_mask and hev_mask - const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(fe, fe); - __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; - abs_p1p0 = abs_diff(q1p1, q0p0); - abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - - abs_p0q0 = abs_diff(q0p0, p0q0); - abs_p1q1 = abs_diff(q1p1, p1q1); - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(abs_p1p0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - - work = _mm_max_epu8(abs_diff(q2p2, q1p1), - abs_diff(q3p3, q2p2)); - mask = _mm_max_epu8(work, mask); - mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - - // flat_mask4 - - flat = _mm_max_epu8(abs_diff(q2p2, q0p0), - abs_diff(q3p3, q0p0)); - flat = _mm_max_epu8(abs_p1p0, flat); - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - } - - { - const __m128i four = _mm_set1_epi16(4); - unsigned char *src = s; - { - __m128i workp_a, workp_b, workp_shft; - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); - - workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); - workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); - workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op2[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op1[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - 
_mm_storel_epi64((__m128i *)&flat_op0[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq0[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq1[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq2[0], - _mm_packus_epi16(workp_shft, workp_shft)); - } - } - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), - t80); - const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), - t80); - const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), - t80); - const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), - t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (vpx_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - // Filter1 >> 3 - filter1 = _mm_unpacklo_epi8(zero, filter1); - filter1 = _mm_srai_epi16(filter1, 11); - filter1 = _mm_packs_epi16(filter1, filter1); - - // Filter2 >> 3 - filter2 = _mm_unpacklo_epi8(zero, filter2); - filter2 = _mm_srai_epi16(filter2, 11); - filter2 = _mm_packs_epi16(filter2, zero); - - // filt >> 1 - filt = _mm_adds_epi8(filter1, t1); - filt = _mm_unpacklo_epi8(zero, filt); - filt = _mm_srai_epi16(filt, 9); - filt = _mm_packs_epi16(filt, zero); - - filt = _mm_andnot_si128(hev, filt); - - work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - q0 = _mm_loadl_epi64((__m128i *)flat_oq0); - work_a = _mm_andnot_si128(flat, work_a); - q0 = _mm_and_si128(flat, q0); - q0 = _mm_or_si128(work_a, q0); - - work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - q1 = _mm_loadl_epi64((__m128i *)flat_oq1); - work_a = _mm_andnot_si128(flat, work_a); - q1 = _mm_and_si128(flat, q1); - q1 = _mm_or_si128(work_a, q1); - - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q2 = _mm_loadl_epi64((__m128i *)flat_oq2); - work_a = _mm_andnot_si128(flat, work_a); - q2 = _mm_and_si128(flat, q2); - q2 = _mm_or_si128(work_a, q2); - - work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - p0 = _mm_loadl_epi64((__m128i *)flat_op0); - work_a = _mm_andnot_si128(flat, work_a); - p0 = _mm_and_si128(flat, p0); - p0 = _mm_or_si128(work_a, p0); - - work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - p1 = _mm_loadl_epi64((__m128i *)flat_op1); - work_a = _mm_andnot_si128(flat, work_a); - p1 = _mm_and_si128(flat, p1); - p1 = _mm_or_si128(work_a, p1); - - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p2 = _mm_loadl_epi64((__m128i *)flat_op2); - work_a = 
_mm_andnot_si128(flat, work_a); - p2 = _mm_and_si128(flat, p2); - p2 = _mm_or_si128(work_a, p2); - - _mm_storel_epi64((__m128i *)(s - 3 * p), p2); - _mm_storel_epi64((__m128i *)(s - 2 * p), p1); - _mm_storel_epi64((__m128i *)(s - 1 * p), p0); - _mm_storel_epi64((__m128i *)(s + 0 * p), q0); - _mm_storel_epi64((__m128i *)(s + 1 * p), q1); - _mm_storel_epi64((__m128i *)(s + 2 * p), q2); - } -} - -void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, - const uint8_t *_blimit0, - const uint8_t *_limit0, - const uint8_t *_thresh0, - const uint8_t *_blimit1, - const uint8_t *_limit1, - const uint8_t *_thresh1) { - DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); - const __m128i blimit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), - _mm_load_si128((const __m128i *)_blimit1)); - const __m128i limit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), - _mm_load_si128((const __m128i *)_limit1)); - const __m128i thresh = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), - _mm_load_si128((const __m128i *)_thresh1)); - - __m128i mask, hev, flat; - __m128i p3, p2, p1, p0, q0, q1, q2, q3; - - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - { - const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), - _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), - _mm_subs_epu8(q0, q1)); - const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), - _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), - _mm_subs_epu8(q1, p1)); - __m128i work; - - // filter_mask and hev_mask - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(flat, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), - _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), - _mm_subs_epu8(p2, p3))); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), - _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), - _mm_subs_epu8(q2, q3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - - // flat_mask4 - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), - _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), - _mm_subs_epu8(q0, q2))); - flat = _mm_max_epu8(work, 
flat); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), - _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), - _mm_subs_epu8(q0, q3))); - flat = _mm_max_epu8(work, flat); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - } - { - const __m128i four = _mm_set1_epi16(4); - unsigned char *src = s; - int i = 0; - - do { - __m128i workp_a, workp_b, workp_shft; - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); - - workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); - workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); - workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op2[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op1[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op0[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq0[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq1[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq2[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - src += 8; - } while (++i < 2); - } - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i t7f = _mm_set1_epi8(0x7f); - - const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), - t80); - const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), - t80); - const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), - t80); - const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), - t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = 
_mm_subs_epi8(qs0, ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (vpx_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - // Filter1 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - - // Filter2 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - - // filt >> 1 - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - - filt = _mm_andnot_si128(hev, filt); - - work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - q0 = _mm_load_si128((__m128i *)flat_oq0); - work_a = _mm_andnot_si128(flat, work_a); - q0 = _mm_and_si128(flat, q0); - q0 = _mm_or_si128(work_a, q0); - - work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - q1 = _mm_load_si128((__m128i *)flat_oq1); - work_a = _mm_andnot_si128(flat, work_a); - q1 = _mm_and_si128(flat, q1); - q1 = _mm_or_si128(work_a, q1); - - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q2 = _mm_load_si128((__m128i *)flat_oq2); - work_a = _mm_andnot_si128(flat, work_a); - q2 = _mm_and_si128(flat, q2); - q2 = _mm_or_si128(work_a, q2); - - work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - p0 = _mm_load_si128((__m128i *)flat_op0); - work_a = _mm_andnot_si128(flat, work_a); - p0 = _mm_and_si128(flat, p0); - p0 = _mm_or_si128(work_a, p0); - - work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - p1 = _mm_load_si128((__m128i *)flat_op1); - work_a = _mm_andnot_si128(flat, work_a); - p1 = _mm_and_si128(flat, p1); - p1 = _mm_or_si128(work_a, p1); - - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p2 = _mm_load_si128((__m128i *)flat_op2); - work_a = _mm_andnot_si128(flat, work_a); - p2 = _mm_and_si128(flat, p2); - p2 = _mm_or_si128(work_a, p2); - - _mm_storeu_si128((__m128i *)(s - 3 * p), p2); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - _mm_storeu_si128((__m128i *)(s + 2 * p), q2); - } -} - -void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, - const unsigned char *_blimit0, - const unsigned char *_limit0, - const unsigned char *_thresh0, - const unsigned char *_blimit1, - const unsigned char *_limit1, - const unsigned char *_thresh1) { - const __m128i blimit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), - _mm_load_si128((const __m128i *)_blimit1)); - const __m128i limit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), - _mm_load_si128((const __m128i *)_limit1)); - const __m128i thresh = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), - _mm_load_si128((const __m128i *)_thresh1)); - const __m128i zero = _mm_set1_epi16(0); - __m128i p3, p2, p1, p0, q0, q1, q2, q3; - __m128i mask, hev, flat; - - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 
= _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - - // filter_mask and hev_mask - { - const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), - _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), - _mm_subs_epu8(q0, q1)); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), - _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), - _mm_subs_epu8(q1, p1)); - __m128i work; - - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(flat, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), - _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), - _mm_subs_epu8(p2, p3))); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), - _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), - _mm_subs_epu8(q2, q3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - - // filter4 - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i t7f = _mm_set1_epi8(0x7f); - - const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), - t80); - const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), - t80); - const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), - t80); - const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), - t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (vpx_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - // Filter1 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - - // Filter2 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - - // filt >> 1 - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - - filt = _mm_andnot_si128(hev, filt); - - q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - q1 = 
_mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - } -} - -static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, - int in_p, unsigned char *out, int out_p) { - __m128i x0, x1, x2, x3, x4, x5, x6, x7; - __m128i x8, x9, x10, x11, x12, x13, x14, x15; - - // 2-way interleave w/hoisting of unpacks - x0 = _mm_loadl_epi64((__m128i *)in0); // 1 - x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); // 3 - x0 = _mm_unpacklo_epi8(x0, x1); // 1 - - x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); // 5 - x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p)); // 7 - x1 = _mm_unpacklo_epi8(x2, x3); // 2 - - x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p)); // 9 - x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p)); // 11 - x2 = _mm_unpacklo_epi8(x4, x5); // 3 - - x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p)); // 13 - x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p)); // 15 - x3 = _mm_unpacklo_epi8(x6, x7); // 4 - x4 = _mm_unpacklo_epi16(x0, x1); // 9 - - x8 = _mm_loadl_epi64((__m128i *)in1); // 2 - x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); // 4 - x8 = _mm_unpacklo_epi8(x8, x9); // 5 - x5 = _mm_unpacklo_epi16(x2, x3); // 10 - - x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); // 6 - x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p)); // 8 - x9 = _mm_unpacklo_epi8(x10, x11); // 6 - - x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p)); // 10 - x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p)); // 12 - x10 = _mm_unpacklo_epi8(x12, x13); // 7 - x12 = _mm_unpacklo_epi16(x8, x9); // 11 - - x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p)); // 14 - x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p)); // 16 - x11 = _mm_unpacklo_epi8(x14, x15); // 8 - x13 = _mm_unpacklo_epi16(x10, x11); // 12 - - x6 = _mm_unpacklo_epi32(x4, x5); // 13 - x7 = _mm_unpackhi_epi32(x4, x5); // 14 - x14 = _mm_unpacklo_epi32(x12, x13); // 15 - x15 = _mm_unpackhi_epi32(x12, x13); // 16 - - // Store first 4-line result - _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14)); - _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14)); - _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15)); - _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15)); - - x4 = _mm_unpackhi_epi16(x0, x1); - x5 = _mm_unpackhi_epi16(x2, x3); - x12 = _mm_unpackhi_epi16(x8, x9); - x13 = _mm_unpackhi_epi16(x10, x11); - - x6 = _mm_unpacklo_epi32(x4, x5); - x7 = _mm_unpackhi_epi32(x4, x5); - x14 = _mm_unpacklo_epi32(x12, x13); - x15 = _mm_unpackhi_epi32(x12, x13); - - // Store second 4-line result - _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14)); - _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14)); - _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15)); - _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15)); -} - -static INLINE void transpose(unsigned char *src[], int in_p, - unsigned char *dst[], int out_p, - int num_8x8_to_transpose) { - int idx8x8 = 0; - __m128i x0, x1, x2, x3, x4, x5, x6, x7; - do { - unsigned char *in = src[idx8x8]; - unsigned char *out = dst[idx8x8]; - - x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p)); // 00 01 02 03 04 05 06 07 - x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p)); // 10 
11 12 13 14 15 16 17 - // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - x0 = _mm_unpacklo_epi8(x0, x1); - - x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27 - x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37 - // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - x1 = _mm_unpacklo_epi8(x2, x3); - - x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47 - x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57 - // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - x2 = _mm_unpacklo_epi8(x4, x5); - - x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67 - x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77 - // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - x3 = _mm_unpacklo_epi8(x6, x7); - - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - x4 = _mm_unpacklo_epi16(x0, x1); - // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - x5 = _mm_unpacklo_epi16(x2, x3); - // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - x6 = _mm_unpacklo_epi32(x4, x5); - _mm_storel_pd((double *)(out + 0*out_p), - _mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70 - _mm_storeh_pd((double *)(out + 1*out_p), - _mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - x7 = _mm_unpackhi_epi32(x4, x5); - _mm_storel_pd((double *)(out + 2*out_p), - _mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72 - _mm_storeh_pd((double *)(out + 3*out_p), - _mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73 - - // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - x4 = _mm_unpackhi_epi16(x0, x1); - // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 - x5 = _mm_unpackhi_epi16(x2, x3); - // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 - x6 = _mm_unpacklo_epi32(x4, x5); - _mm_storel_pd((double *)(out + 4*out_p), - _mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74 - _mm_storeh_pd((double *)(out + 5*out_p), - _mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 - x7 = _mm_unpackhi_epi32(x4, x5); - - _mm_storel_pd((double *)(out + 6*out_p), - _mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76 - _mm_storeh_pd((double *)(out + 7*out_p), - _mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77 - } while (++idx8x8 < num_8x8_to_transpose); -} - -void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); - unsigned char *src[2]; - unsigned char *dst[2]; - - // Transpose 8x16 - transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); - - // Loop filtering - vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, - blimit1, limit1, thresh1); - src[0] = t_dst; - src[1] = t_dst + 8; - dst[0] = s - 4; - dst[1] = s - 4 + p * 8; - - // Transpose back - transpose(src, 16, dst, p, 2); -} - -void vpx_lpf_vertical_8_sse2(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh) { - DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]); - unsigned char *src[1]; - unsigned char *dst[1]; - - // Transpose 8x8 - src[0] = s - 4; - dst[0] = t_dst; - - transpose(src, p, dst, 8, 1); - - // Loop filtering - vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh); - - src[0] = t_dst; - dst[0] = s - 4; - - // Transpose back - transpose(src, 8, dst, p, 1); -} - -void 
vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); - unsigned char *src[2]; - unsigned char *dst[2]; - - // Transpose 8x16 - transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); - - // Loop filtering - vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, - blimit1, limit1, thresh1); - src[0] = t_dst; - src[1] = t_dst + 8; - - dst[0] = s - 4; - dst[1] = s - 4 + p * 8; - - // Transpose back - transpose(src, 16, dst, p, 2); -} - -void vpx_lpf_vertical_16_sse2(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh) { - DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]); - unsigned char *src[2]; - unsigned char *dst[2]; - - src[0] = s - 8; - src[1] = s; - dst[0] = t_dst; - dst[1] = t_dst + 8 * 8; - - // Transpose 16x8 - transpose(src, p, dst, 8, 2); - - // Loop filtering - vpx_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh); - - src[0] = t_dst; - src[1] = t_dst + 8 * 8; - dst[0] = s - 8; - dst[1] = s; - - // Transpose back - transpose(src, 8, dst, p, 2); -} - -void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh) { - DECLARE_ALIGNED(16, unsigned char, t_dst[256]); - - // Transpose 16x16 - transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); - transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); - - // Loop filtering - vpx_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh); - - // Transpose back - transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); - transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); -} diff --git a/thirdparty/libvpx/vpx_dsp/x86/txfm_common_sse2.h b/thirdparty/libvpx/vpx_dsp/x86/txfm_common_sse2.h deleted file mode 100644 index 536b206876..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/txfm_common_sse2.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VPX_DSP_X86_TXFM_COMMON_SSE2_H_ -#define VPX_DSP_X86_TXFM_COMMON_SSE2_H_ - -#include <emmintrin.h> -#include "vpx/vpx_integer.h" - -#define pair_set_epi16(a, b) \ - _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ - (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) - -#define dual_set_epi16(a, b) \ - _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \ - (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a)) - -#define octa_set_epi16(a, b, c, d, e, f, g, h) \ - _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \ - (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h)) - -#endif // VPX_DSP_X86_TXFM_COMMON_SSE2_H_ diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_asm_stubs.c b/thirdparty/libvpx/vpx_dsp/x86/vpx_asm_stubs.c deleted file mode 100644 index 422b0fc422..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/vpx_asm_stubs.c +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_config.h" -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/x86/convolve.h" - -#if HAVE_SSE2 -filter8_1dfunction vpx_filter_block1d16_v8_sse2; -filter8_1dfunction vpx_filter_block1d16_h8_sse2; -filter8_1dfunction vpx_filter_block1d8_v8_sse2; -filter8_1dfunction vpx_filter_block1d8_h8_sse2; -filter8_1dfunction vpx_filter_block1d4_v8_sse2; -filter8_1dfunction vpx_filter_block1d4_h8_sse2; -filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2; -filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2; - -filter8_1dfunction vpx_filter_block1d16_v2_sse2; -filter8_1dfunction vpx_filter_block1d16_h2_sse2; -filter8_1dfunction vpx_filter_block1d8_v2_sse2; -filter8_1dfunction vpx_filter_block1d8_h2_sse2; -filter8_1dfunction vpx_filter_block1d4_v2_sse2; -filter8_1dfunction vpx_filter_block1d4_h2_sse2; -filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2; -filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; - -// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); - -// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, sse2); -FUN_CONV_2D(avg_ , sse2); - -#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2; -highbd_filter8_1dfunction 
vpx_highbd_filter_block1d8_v8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; - -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; - -// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); -HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); -HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - sse2); - -// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h, int bd); -HIGH_FUN_CONV_2D(, sse2); -HIGH_FUN_CONV_2D(avg_ , sse2); -#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 -#endif // HAVE_SSE2 diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm deleted file mode 100644 index abc0270655..0000000000 --- 
a/thirdparty/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm +++ /dev/null @@ -1,228 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -%macro convolve_fn 1-2 -%ifidn %1, avg -%define AUX_XMM_REGS 4 -%else -%define AUX_XMM_REGS 0 -%endif -%ifidn %2, highbd -%define pavg pavgw -cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ - dst, dst_stride, \ - fx, fxs, fy, fys, w, h, bd -%else -%define pavg pavgb -cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ - dst, dst_stride, \ - fx, fxs, fy, fys, w, h -%endif - mov r4d, dword wm -%ifidn %2, highbd - shl r4d, 1 - shl srcq, 1 - shl src_strideq, 1 - shl dstq, 1 - shl dst_strideq, 1 -%else - cmp r4d, 4 - je .w4 -%endif - cmp r4d, 8 - je .w8 - cmp r4d, 16 - je .w16 - cmp r4d, 32 - je .w32 -%ifidn %2, highbd - cmp r4d, 64 - je .w64 - - mov r4d, dword hm -.loop128: - movu m0, [srcq] - movu m1, [srcq+16] - movu m2, [srcq+32] - movu m3, [srcq+48] -%ifidn %1, avg - pavg m0, [dstq] - pavg m1, [dstq+16] - pavg m2, [dstq+32] - pavg m3, [dstq+48] -%endif - mova [dstq ], m0 - mova [dstq+16], m1 - mova [dstq+32], m2 - mova [dstq+48], m3 - movu m0, [srcq+64] - movu m1, [srcq+80] - movu m2, [srcq+96] - movu m3, [srcq+112] - add srcq, src_strideq -%ifidn %1, avg - pavg m0, [dstq+64] - pavg m1, [dstq+80] - pavg m2, [dstq+96] - pavg m3, [dstq+112] -%endif - mova [dstq+64], m0 - mova [dstq+80], m1 - mova [dstq+96], m2 - mova [dstq+112], m3 - add dstq, dst_strideq - dec r4d - jnz .loop128 - RET -%endif - -.w64 - mov r4d, dword hm -.loop64: - movu m0, [srcq] - movu m1, [srcq+16] - movu m2, [srcq+32] - movu m3, [srcq+48] - add srcq, src_strideq -%ifidn %1, avg - pavg m0, [dstq] - pavg m1, [dstq+16] - pavg m2, [dstq+32] - pavg m3, [dstq+48] -%endif - mova [dstq ], m0 - mova [dstq+16], m1 - mova [dstq+32], m2 - mova [dstq+48], m3 - add dstq, dst_strideq - dec r4d - jnz .loop64 - RET - -.w32: - mov r4d, dword hm -.loop32: - movu m0, [srcq] - movu m1, [srcq+16] - movu m2, [srcq+src_strideq] - movu m3, [srcq+src_strideq+16] - lea srcq, [srcq+src_strideq*2] -%ifidn %1, avg - pavg m0, [dstq] - pavg m1, [dstq +16] - pavg m2, [dstq+dst_strideq] - pavg m3, [dstq+dst_strideq+16] -%endif - mova [dstq ], m0 - mova [dstq +16], m1 - mova [dstq+dst_strideq ], m2 - mova [dstq+dst_strideq+16], m3 - lea dstq, [dstq+dst_strideq*2] - sub r4d, 2 - jnz .loop32 - RET - -.w16: - mov r4d, dword hm - lea r5q, [src_strideq*3] - lea r6q, [dst_strideq*3] -.loop16: - movu m0, [srcq] - movu m1, [srcq+src_strideq] - movu m2, [srcq+src_strideq*2] - movu m3, [srcq+r5q] - lea srcq, [srcq+src_strideq*4] -%ifidn %1, avg - pavg m0, [dstq] - pavg m1, [dstq+dst_strideq] - pavg m2, [dstq+dst_strideq*2] - pavg m3, [dstq+r6q] -%endif - mova [dstq ], m0 - mova [dstq+dst_strideq ], m1 - mova [dstq+dst_strideq*2], m2 - mova [dstq+r6q ], m3 - lea dstq, [dstq+dst_strideq*4] - sub r4d, 4 - jnz .loop16 - RET - -.w8: - mov r4d, dword hm - lea r5q, [src_strideq*3] - lea r6q, [dst_strideq*3] -.loop8: - movh m0, [srcq] - movh m1, [srcq+src_strideq] - movh m2, [srcq+src_strideq*2] - movh m3, [srcq+r5q] - lea srcq, [srcq+src_strideq*4] -%ifidn %1, avg - movh m4, [dstq] - movh m5, 
[dstq+dst_strideq] - movh m6, [dstq+dst_strideq*2] - movh m7, [dstq+r6q] - pavg m0, m4 - pavg m1, m5 - pavg m2, m6 - pavg m3, m7 -%endif - movh [dstq ], m0 - movh [dstq+dst_strideq ], m1 - movh [dstq+dst_strideq*2], m2 - movh [dstq+r6q ], m3 - lea dstq, [dstq+dst_strideq*4] - sub r4d, 4 - jnz .loop8 - RET - -%ifnidn %2, highbd -.w4: - mov r4d, dword hm - lea r5q, [src_strideq*3] - lea r6q, [dst_strideq*3] -.loop4: - movd m0, [srcq] - movd m1, [srcq+src_strideq] - movd m2, [srcq+src_strideq*2] - movd m3, [srcq+r5q] - lea srcq, [srcq+src_strideq*4] -%ifidn %1, avg - movd m4, [dstq] - movd m5, [dstq+dst_strideq] - movd m6, [dstq+dst_strideq*2] - movd m7, [dstq+r6q] - pavg m0, m4 - pavg m1, m5 - pavg m2, m6 - pavg m3, m7 -%endif - movd [dstq ], m0 - movd [dstq+dst_strideq ], m1 - movd [dstq+dst_strideq*2], m2 - movd [dstq+r6q ], m3 - lea dstq, [dstq+dst_strideq*4] - sub r4d, 4 - jnz .loop4 - RET -%endif -%endmacro - -INIT_XMM sse2 -convolve_fn copy -convolve_fn avg -%if CONFIG_VP9_HIGHBITDEPTH -convolve_fn copy, highbd -convolve_fn avg, highbd -%endif diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c deleted file mode 100644 index d8a92354c9..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ /dev/null @@ -1,606 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -// Due to a header conflict between math.h and intrinsics includes with ceil() -// in certain configurations under vs9 this include needs to precede -// immintrin.h. - -#include <immintrin.h> - -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/x86/convolve.h" -#include "vpx_ports/mem.h" - -// filters for 16_h8 and 16_v8 -DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 -}; - -#if defined(__clang__) -// -- GODOT start - -# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \ - (!defined(__MACPORTS__) && defined(__APPLE__) && \ - ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ - (__clang_major__ == 5 && __clang_minor__ == 0))) -// -- GODOT end -- -# define MM256_BROADCASTSI128_SI256(x) \ - _mm_broadcastsi128_si256((__m128i const *)&(x)) -# else // clang > 3.3, and not 5.0 on macosx. 
-# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -# endif // clang <= 3.3 -#elif defined(__GNUC__) -# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) -# define MM256_BROADCASTSI128_SI256(x) \ - _mm_broadcastsi128_si256((__m128i const *)&(x)) -# elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 -# define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) -# else // gcc > 4.7 -# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -# endif // gcc <= 4.6 -#else // !(gcc || clang) -# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -#endif // __clang__ - -static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, - ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m256i firstFilters, secondFilters, thirdFilters, forthFilters; - __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; - __m256i srcReg32b1, srcReg32b2, filtersReg32; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x706u)); - - filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); - filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); - filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); - filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); - - // multiple the size of the source and destination stride by two - src_stride = src_pixels_per_line << 1; - dst_stride = output_pitch << 1; - for (i = output_height; i > 1; i-=2) { - // load the 2 strides of source - srcReg32b1 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr - 3))); - srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, - _mm_loadu_si128((const __m128i *) - (src_ptr+src_pixels_per_line-3)), 1); - - // filter the source buffer - srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); - srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); - - // filter the source buffer - 
srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); - srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, - _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); - - // reading 2 strides of the next 16 bytes - // (part of it was being read by earlier read) - srcReg32b2 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + 5))); - srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, - _mm_loadu_si128((const __m128i *) - (src_ptr+src_pixels_per_line+5)), 1); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, - _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); - - // filter the source buffer - srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); - - // add and saturate the results together - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); - - // filter the source buffer - srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg); - srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - - // add and saturate the results together - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, - _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, - _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); - - - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); - - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); - srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, - srcRegFilt32b2_1); - - src_ptr+=src_stride; - - // save 16 bytes - _mm_store_si128((__m128i*)output_ptr, - _mm256_castsi256_si128(srcRegFilt32b1_1)); - - // save the next 16 bits - _mm_store_si128((__m128i*)(output_ptr+output_pitch), - _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); - output_ptr+=dst_stride; - } - - // if the number of strides is odd. 
- // process only 16 bytes - if (i > 0) { - __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; - __m128i srcRegFilt2, srcRegFilt3; - - srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); - - // filter the source buffer - srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt1Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt4Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, - _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); - - // filter the source buffer - srcRegFilt3= _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2= _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt3Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, - _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, - _mm_min_epi16(srcRegFilt3, srcRegFilt2)); - - // reading the next 16 bytes - // (part of it was being read by earlier read) - srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); - - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, - _mm_max_epi16(srcRegFilt3, srcRegFilt2)); - - // filter the source buffer - srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt1Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt4Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, - _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); - - // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt3Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, - _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, - _mm_min_epi16(srcRegFilt3, srcRegFilt2)); - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, - _mm_max_epi16(srcRegFilt3, srcRegFilt2)); - - - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, - _mm256_castsi256_si128(addFilterReg64)); - - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, - _mm256_castsi256_si128(addFilterReg64)); - - // shift by 7 bit each 16 bit - srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); - srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); - - // save 16 bytes - _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); - } -} - -static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, - ptrdiff_t src_pitch, - uint8_t 
*output_ptr, - ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m256i addFilterReg64; - __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; - __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; - __m256i srcReg32b11, srcReg32b12, filtersReg32; - __m256i firstFilters, secondFilters, thirdFilters, forthFilters; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the - // same data in both lanes of 128 bit register. - filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x706u)); - - // multiple the size of the source and destination stride by two - src_stride = src_pitch << 1; - dst_stride = out_pitch << 1; - - // load 16 bytes 7 times in stride of src_pitch - srcReg32b1 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr))); - srcReg32b2 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); - srcReg32b3 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); - srcReg32b4 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); - srcReg32b5 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); - srcReg32b6 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); - srcReg32b7 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); - - // have each consecutive loads on the same 256 register - srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, - _mm256_castsi256_si128(srcReg32b2), 1); - srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, - _mm256_castsi256_si128(srcReg32b3), 1); - srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, - _mm256_castsi256_si128(srcReg32b4), 1); - srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, - _mm256_castsi256_si128(srcReg32b5), 1); - srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, - _mm256_castsi256_si128(srcReg32b6), 1); - srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, - _mm256_castsi256_si128(srcReg32b7), 1); - - // merge every two consecutive registers except the last one - srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); - srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); - - // save - srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); - - // save - srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); - - // save - srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); - - // save - srcReg32b5 = 
_mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); - - - for (i = output_height; i > 1; i-=2) { - // load the last 2 loads of 16 bytes and have every two - // consecutive loads in the same 256 bit register - srcReg32b8 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); - srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, - _mm256_castsi256_si128(srcReg32b8), 1); - srcReg32b9 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); - srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, - _mm256_castsi256_si128(srcReg32b9), 1); - - // merge every two consecutive registers - // save - srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); - srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); - - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); - - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); - - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); - - // add and saturate the results together - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); - srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); - - src_ptr+=src_stride; - - // save 16 bytes - _mm_store_si128((__m128i*)output_ptr, - _mm256_castsi256_si128(srcReg32b1)); - - // save the next 16 bits - _mm_store_si128((__m128i*)(output_ptr+out_pitch), - _mm256_extractf128_si256(srcReg32b1, 1)); - - output_ptr+=dst_stride; - - // save part of the registers for next strides - srcReg32b10 = srcReg32b11; - srcReg32b1 = srcReg32b3; - srcReg32b11 = srcReg32b2; - srcReg32b3 = srcReg32b5; - srcReg32b2 = srcReg32b4; - srcReg32b5 = srcReg32b7; - srcReg32b7 = srcReg32b9; - } - if (i > 0) { - __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; - __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; - // load the last 16 bytes - srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); - - // merge the last 2 results together - srcRegFilt4 = _mm_unpacklo_epi8( - _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); - 
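    /* A note on the add ordering used above and throughout this file: the two
       partial sums formed with secondFilters and thirdFilters carry the largest
       (centre) taps, so they are folded in last, once through a min() and once
       through a max(). Because the adds saturate, an intermediate overflow is
       pinned at the 16-bit limit instead of wrapping. The pattern, written out
       with 128-bit intrinsics (helper name illustrative only):

         __m128i add_partial_sums(__m128i x0, __m128i x1, __m128i x2, __m128i x3) {
           __m128i sum = _mm_adds_epi16(x0, x3);
           sum = _mm_adds_epi16(sum, _mm_min_epi16(x1, x2));
           return _mm_adds_epi16(sum, _mm_max_epi16(x1, x2));
         }
    */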
srcRegFilt7 = _mm_unpackhi_epi8( - _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, - _mm256_castsi256_si128(forthFilters)); - srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, - _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); - - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), - _mm256_castsi256_si128(secondFilters)); - srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), - _mm256_castsi256_si128(secondFilters)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), - _mm256_castsi256_si128(thirdFilters)); - srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), - _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, - _mm_min_epi16(srcRegFilt4, srcRegFilt6)); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, - _mm_min_epi16(srcRegFilt5, srcRegFilt7)); - - // add and saturate the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, - _mm_max_epi16(srcRegFilt4, srcRegFilt6)); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, - _mm_max_epi16(srcRegFilt5, srcRegFilt7)); - - - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, - _mm256_castsi256_si128(addFilterReg64)); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, - _mm256_castsi256_si128(addFilterReg64)); - - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); - - // save 16 bytes - _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); - } -} - -#if HAVE_AVX2 && HAVE_SSSE3 -filter8_1dfunction vpx_filter_block1d4_v8_ssse3; -#if ARCH_X86_64 -filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; -#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3 -#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3 -#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3 -#else // ARCH_X86 -filter8_1dfunction vpx_filter_block1d8_v8_ssse3; -filter8_1dfunction vpx_filter_block1d8_h8_ssse3; -filter8_1dfunction vpx_filter_block1d4_h8_ssse3; -#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3 -#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3 -#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3 -#endif // ARCH_X86_64 -filter8_1dfunction vpx_filter_block1d16_v2_ssse3; -filter8_1dfunction vpx_filter_block1d16_h2_ssse3; -filter8_1dfunction vpx_filter_block1d8_v2_ssse3; -filter8_1dfunction vpx_filter_block1d8_h2_ssse3; -filter8_1dfunction vpx_filter_block1d4_v2_ssse3; -filter8_1dfunction vpx_filter_block1d4_h2_ssse3; -#define 
vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3 -#define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3 -#define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3 -#define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3 -#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3 -#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3 -#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3 -// void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); - -// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, avx2); -#endif // HAVE_AX2 && HAVE_SSSE3 diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c deleted file mode 100644 index 6fd52087c7..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ /dev/null @@ -1,915 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -// Due to a header conflict between math.h and intrinsics includes with ceil() -// in certain configurations under vs9 this include needs to precede -// tmmintrin.h. - -#include <tmmintrin.h> - -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/vpx_filter.h" -#include "vpx_dsp/x86/convolve.h" -#include "vpx_mem/vpx_mem.h" -#include "vpx_ports/mem.h" -#include "vpx_ports/emmintrin_compat.h" - -// filters only for the 4_h8 convolution -DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -// filters for 8_h8 and 16_h8 -DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 -}; - -// These are reused by the avx2 intrinsics. 
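// The shuffle tables above pair each source pixel with its right-hand
// neighbour (0,1 1,2 2,3 ...), so one pmaddubsw per tap pair yields
// pixel*tap + pixel*tap as a 16-bit sum. For reference, what
// _mm_maddubs_epi16 computes in each output lane: unsigned bytes from the
// shuffled source times signed bytes from the packed filter, added pairwise
// with signed saturation. A scalar sketch, not part of libvpx:
static inline int16_t maddubs_pair(uint8_t a0, uint8_t a1, int8_t b0, int8_t b1) {
  const int s = (int)a0 * b0 + (int)a1 * b1;
  return (int16_t)(s > 32767 ? 32767 : (s < -32768 ? -32768 : s));
}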
-filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; - -void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, - ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i firstFilters, secondFilters, shuffle1, shuffle2; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - __m128i addFilterReg64, filtersReg, srcReg, minReg; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 =_mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits in the filter into the first lane - firstFilters = _mm_shufflelo_epi16(filtersReg, 0); - // duplicate only the third 16 bit in the filter into the first lane - secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); - // duplicate only the seconds 16 bits in the filter into the second lane - // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3 - firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); - // duplicate only the forth 16 bits in the filter into the second lane - // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7 - secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); - - // loading the local filters - shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8); - shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); - - for (i = 0; i < output_height; i++) { - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); - - // filter the source buffer - srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1); - srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - - // extract the higher half of the lane - srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); - srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); - - minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); - - // add and saturate all the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bits - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - - // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - src_ptr+=src_pixels_per_line; - - // save only 4 bytes - *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); - - output_ptr+=output_pitch; - } -} - -void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, - ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; - __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - __m128i addFilterReg64, filtersReg, minReg; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 
= _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 128 bit register - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 128 bit register - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 128 bit register - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 128 bit register - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); - - filt1Reg = _mm_load_si128((__m128i const *)filt1_global); - filt2Reg = _mm_load_si128((__m128i const *)filt2_global); - filt3Reg = _mm_load_si128((__m128i const *)filt3_global); - filt4Reg = _mm_load_si128((__m128i const *)filt4_global); - - for (i = 0; i < output_height; i++) { - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); - - // filter the source buffer - srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); - srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - - // filter the source buffer - srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg); - srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); - srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); - - // add and saturate all the results together - minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - - srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bits - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - - // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - - src_ptr+=src_pixels_per_line; - - // save only 8 bytes - _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); - - output_ptr+=output_pitch; - } -} - -void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, - ptrdiff_t src_pitch, - uint8_t *output_ptr, - ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i addFilterReg64, filtersReg, minReg; - __m128i firstFilters, secondFilters, thirdFilters, forthFilters; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; - __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; - __m128i srcReg8; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. 
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits in the filter - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits in the filter - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits in the filter - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits in the filter - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); - - // load the first 7 rows of 8 bytes - srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr); - srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); - srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); - srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); - - for (i = 0; i < output_height; i++) { - // load the last 8 bytes - srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); - - // merge the result together - srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); - srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); - - // merge the result together - srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); - srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); - srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); - - // add and saturate the results together - minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); - srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - - // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - - src_ptr+=src_pitch; - - // shift down a row - srcReg1 = srcReg2; - srcReg2 = srcReg3; - srcReg3 = srcReg4; - srcReg4 = srcReg5; - srcReg5 = srcReg6; - srcReg6 = srcReg7; - srcReg7 = srcReg8; - - // save only 8 bytes convolve result - _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); - - output_ptr+=out_pitch; - } -} - -filter8_1dfunction vpx_filter_block1d16_v8_ssse3; -filter8_1dfunction vpx_filter_block1d16_h8_ssse3; -filter8_1dfunction vpx_filter_block1d8_v8_ssse3; -filter8_1dfunction vpx_filter_block1d8_h8_ssse3; -filter8_1dfunction vpx_filter_block1d4_v8_ssse3; -filter8_1dfunction vpx_filter_block1d4_h8_ssse3; -filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3; -filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3; -filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; -filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; -filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3; -filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3; - -filter8_1dfunction vpx_filter_block1d16_v2_ssse3; -filter8_1dfunction vpx_filter_block1d16_h2_ssse3; -filter8_1dfunction vpx_filter_block1d8_v2_ssse3; -filter8_1dfunction vpx_filter_block1d8_h2_ssse3; 
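// The vertical kernel above (vpx_filter_block1d8_v8_intrin_ssse3) keeps a
// sliding window of eight source rows and rotates its registers so that only
// one new row is loaded per output row. A scalar reference of the same
// computation, shown for a single column (the intrinsic version does eight
// pixels per row); sketch only, not part of libvpx:
static void filter_vert_c_sketch(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
                                 uint32_t output_height, const int16_t *filter) {
  uint32_t y;
  for (y = 0; y < output_height; ++y) {
    int k, sum = 0;
    for (k = 0; k < 8; ++k)
      sum += (int)src_ptr[k * src_pitch] * filter[k];
    sum = (sum + 64) >> 7;  // round by FILTER_BITS == 7
    output_ptr[0] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    src_ptr += src_pitch;
    output_ptr += out_pitch;
  }
}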
-filter8_1dfunction vpx_filter_block1d4_v2_ssse3; -filter8_1dfunction vpx_filter_block1d4_h2_ssse3; -filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3; -filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3; -filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3; -filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3; -filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3; -filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; - -// void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - ssse3); - -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \ - const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \ - const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \ - const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \ - \ - const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \ - const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \ - const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \ - const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \ - \ - out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \ - out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \ - out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \ - out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \ - out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \ - out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \ - out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \ - out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \ -} - -static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *x_filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)x_filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_x); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch)); - const __m128i C = 
_mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3)); - const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4)); - const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5)); - const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6)); - const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7)); - // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 - const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); - // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 - const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); - // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57 - const __m128i tr0_2 = _mm_unpacklo_epi16(E, F); - // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77 - const __m128i tr0_3 = _mm_unpacklo_epi16(G, H); - // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 - const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1); - // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73 - const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3); - // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77 - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 - const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2); - const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2); - const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3); - const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); - // save only 8 bytes convolve result - _mm_storel_epi64((__m128i*)dst, temp); -} - -static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride) { - __m128i A, B, C, D, E, F, G, H; - - A = _mm_loadl_epi64((const __m128i *)src); - B = _mm_loadl_epi64((const __m128i *)(src + src_stride)); - C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); - D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); - E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4)); - F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5)); - G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6)); - H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7)); - - TRANSPOSE_8X8(A, B, C, D, E, F, G, H, - A, B, C, D, E, F, G, H); - - _mm_storel_epi64((__m128i*)dst, A); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 1), B); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 2), C); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 3), D); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 4), E); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 5), F); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 6), G); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 7), H); -} - -static void 
scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, - int x0_q4, int x_step_q4, int w, int h) { - DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); - int x, y, z; - src -= SUBPEL_TAPS / 2 - 1; - - // This function processes 8x8 areas. The intermediate height is not always - // a multiple of 8, so force it to be a multiple of 8 here. - y = h + (8 - (h & 0x7)); - - do { - int x_q4 = x0_q4; - for (x = 0; x < w; x += 8) { - // process 8 src_x steps - for (z = 0; z < 8; ++z) { - const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - if (x_q4 & SUBPEL_MASK) { - filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter); - } else { - int i; - for (i = 0; i < 8; ++i) { - temp[z * 8 + i] = src_x[i * src_stride + 3]; - } - } - x_q4 += x_step_q4; - } - - // transpose the 8x8 filters values back to dst - transpose8x8_to_dst(temp, 8, dst + x, dst_stride); - } - - src += src_stride * 8; - dst += dst_stride * 8; - } while (y -= 8); -} - -static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - // TRANSPOSE... 
- // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // - // TO - // - // 00 10 20 30 - // 01 11 21 31 - // 02 12 22 32 - // 03 13 23 33 - // 04 14 24 34 - // 05 15 25 35 - // 06 16 26 36 - // 07 17 27 37 - // - // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 - const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); - // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 - const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); - // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 - const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 - const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1); - // 02 03 12 13 22 23 32 33 - const __m128i s3s2 = _mm_srli_si128(s1s0, 8); - // 06 07 16 17 26 27 36 37 - const __m128i s7s6 = _mm_srli_si128(s5s4, 8); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); - // save only 4 bytes - *(int *)dst = _mm_cvtsi128_si32(temp); -} - -static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride) { - __m128i A = _mm_cvtsi32_si128(*(const int *)src); - __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride)); - __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2)); - __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3)); - // 00 10 01 11 02 12 03 13 - const __m128i tr0_0 = _mm_unpacklo_epi8(A, B); - // 20 30 21 31 22 32 23 33 - const __m128i tr0_1 = _mm_unpacklo_epi8(C, D); - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - A = _mm_unpacklo_epi16(tr0_0, tr0_1); - B = _mm_srli_si128(A, 4); - C = _mm_srli_si128(A, 8); - D = _mm_srli_si128(A, 12); - - *(int *)(dst) = _mm_cvtsi128_si32(A); - *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B); - *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C); - *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D); -} - -static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, - int x0_q4, int x_step_q4, int w, int h) { - DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); - int x, y, z; - src -= SUBPEL_TAPS / 2 - 1; - - for (y = 0; y < h; y += 4) { - int x_q4 = x0_q4; - for (x = 0; x < w; x += 4) { - // process 4 src_x steps - for (z = 0; z < 4; ++z) { - const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - if (x_q4 & SUBPEL_MASK) { - filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter); - } else { - int i; - for (i = 0; i < 4; ++i) { - temp[z * 4 + i] = src_x[i * src_stride + 3]; - } - } - x_q4 += x_step_q4; - } - - // transpose the 4x4 filters values back to dst - transpose4x4_to_dst(temp, 4, dst + x, dst_stride); - } - - src += src_stride * 4; - dst += dst_stride * 4; - } -} - -static void filter_vert_w4_ssse3(const uint8_t *src_ptr, 
ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr); - const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch)); - const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3)); - const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4)); - const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5)); - const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6)); - const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7)); - const __m128i s1s0 = _mm_unpacklo_epi8(A, B); - const __m128i s3s2 = _mm_unpacklo_epi8(C, D); - const __m128i s5s4 = _mm_unpacklo_epi8(E, F); - const __m128i s7s6 = _mm_unpacklo_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); - // save only 4 bytes - *(int *)dst = _mm_cvtsi128_si32(temp); -} - -static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h) { - int y; - int y_q4 = y0_q4; - - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - - if (y_q4 & SUBPEL_MASK) { - filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); - } else { - memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); - } - - y_q4 += y_step_q4; - } -} - -static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - 
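    /* The scaledconvolve_* wrappers in this file step through the source in
       1/16-pel units: the q4 counter advances by *_step_q4 per output sample,
       q4 >> SUBPEL_BITS picks the integer source position and q4 & SUBPEL_MASK
       picks one of the 16 phase filters, with the zero phase handled by a
       plain copy. In outline (filter8() is a hypothetical stand-in for the
       kernels above):

         for (x = 0; x < w; ++x, x_q4 += x_step_q4) {
           const uint8_t *src_x = &src[x_q4 >> SUBPEL_BITS];
           const int16_t *kernel = x_filters[x_q4 & SUBPEL_MASK];
           dst[x] = (x_q4 & SUBPEL_MASK) ? filter8(src_x, kernel)
                                         : src_x[SUBPEL_TAPS / 2 - 1];
         }
    */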
const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); - const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); - const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); - const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); - const __m128i s1s0 = _mm_unpacklo_epi8(A, B); - const __m128i s3s2 = _mm_unpacklo_epi8(C, D); - const __m128i s5s4 = _mm_unpacklo_epi8(E, F); - const __m128i s7s6 = _mm_unpacklo_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); - // save only 8 bytes convolve result - _mm_storel_epi64((__m128i*)dst, temp); -} - -static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h) { - int y; - int y_q4 = y0_q4; - - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - if (y_q4 & SUBPEL_MASK) { - filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); - } else { - memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); - } - y_q4 += y_step_q4; - } -} - -static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter, int w) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - int i; - - for (i = 0; i < w; i += 16) { - const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr); - const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); - const __m128i E = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); - const __m128i F = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); - const __m128i G = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); - const __m128i H = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); - // merge the result together - const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B); - const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H); - const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B); - const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H); - // multiply 2 adjacent 
elements with the filter and add the result - const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0); - const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6); - const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0); - const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6); - // add and saturate the results together - const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo); - const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi); - // merge the result together - const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D); - const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2); - const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2); - // merge the result together - const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F); - const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4); - const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4); - // add and saturate the results together - __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo)); - __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi)); - - // add and saturate the results together - temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo)); - temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi)); - // round and shift by 7 bit each 16 bit - temp_lo = _mm_mulhrs_epi16(temp_lo, k_256); - temp_hi = _mm_mulhrs_epi16(temp_hi, k_256); - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - temp_hi = _mm_packus_epi16(temp_lo, temp_hi); - src_ptr += 16; - // save 16 bytes convolve result - _mm_store_si128((__m128i*)&dst[i], temp_hi); - } -} - -static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h) { - int y; - int y_q4 = y0_q4; - - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - if (y_q4 & SUBPEL_MASK) { - filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter, - w); - } else { - memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); - } - y_q4 += y_step_q4; - } -} - -static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, - int x0_q4, int x_step_q4, - const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, - int w, int h) { - // Note: Fixed size intermediate buffer, temp, places limits on parameters. - // 2d filtering proceeds in 2 steps: - // (1) Interpolate horizontally into an intermediate buffer, temp. - // (2) Interpolate temp vertically to derive the sub-pixel result. - // Deriving the maximum number of rows in the temp buffer (135): - // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). - // --Largest block size is 64x64 pixels. - // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the - // original frame (in 1/16th pixel units). - // --Must round-up because block may be located at sub-pixel position. - // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. - // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. 
- // --Require an additional 8 rows for the horiz_w8 transpose tail. - DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); - const int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - - assert(w <= 64); - assert(h <= 64); - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); - - if (w >= 8) { - scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, x_filters, x0_q4, x_step_q4, - w, intermediate_height); - } else { - scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, x_filters, x0_q4, x_step_q4, - w, intermediate_height); - } - - if (w >= 16) { - scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); - } else if (w == 8) { - scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); - } else { - scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); - } -} - -static const InterpKernel *get_filter_base(const int16_t *filter) { - // NOTE: This assumes that the filter table is 256-byte aligned. - // TODO(agrange) Modify to make independent of table alignment. - return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); -} - -static int get_filter_offset(const int16_t *f, const InterpKernel *base) { - return (int)((const InterpKernel *)(intptr_t)f - base); -} - -void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - scaledconvolve2d(src, src_stride, dst, dst_stride, - filters_x, x0_q4, x_step_q4, - filters_y, y0_q4, y_step_q4, w, h); -} - -// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, ssse3); -FUN_CONV_2D(avg_ , ssse3); diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm deleted file mode 100644 index 08f3d6a6cf..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm +++ /dev/null @@ -1,987 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;Note: tap3 and tap4 have to be applied and added after other taps to avoid -;overflow. 
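; The SSE2 macros below take a different route from the SSSE3/AVX2 intrinsics
; above: source bytes are widened to 16 bits with punpcklbw against zero, then
; multiplied with pmullw and accumulated with saturating paddsw. Adding the
; large centre taps k3 and k4 last serves the same purpose as the min()/max()
; add ordering in the intrinsic versions: an intermediate overflow saturates
; instead of wrapping.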
- -%macro GET_FILTERS_4 0 - mov rdx, arg(5) ;filter ptr - mov rcx, 0x0400040 - - movdqa xmm7, [rdx] ;load filters - pshuflw xmm0, xmm7, 0b ;k0 - pshuflw xmm1, xmm7, 01010101b ;k1 - pshuflw xmm2, xmm7, 10101010b ;k2 - pshuflw xmm3, xmm7, 11111111b ;k3 - psrldq xmm7, 8 - pshuflw xmm4, xmm7, 0b ;k4 - pshuflw xmm5, xmm7, 01010101b ;k5 - pshuflw xmm6, xmm7, 10101010b ;k6 - pshuflw xmm7, xmm7, 11111111b ;k7 - - punpcklqdq xmm0, xmm1 - punpcklqdq xmm2, xmm3 - punpcklqdq xmm5, xmm4 - punpcklqdq xmm6, xmm7 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm2 - movdqa k5k4, xmm5 - movdqa k6k7, xmm6 - - movq xmm6, rcx - pshufd xmm6, xmm6, 0 - movdqa krd, xmm6 - - pxor xmm7, xmm7 - movdqa zero, xmm7 -%endm - -%macro APPLY_FILTER_4 1 - punpckldq xmm0, xmm1 ;two row in one register - punpckldq xmm6, xmm7 - punpckldq xmm2, xmm3 - punpckldq xmm5, xmm4 - - punpcklbw xmm0, zero ;unpack to word - punpcklbw xmm6, zero - punpcklbw xmm2, zero - punpcklbw xmm5, zero - - pmullw xmm0, k0k1 ;multiply the filter factors - pmullw xmm6, k6k7 - pmullw xmm2, k2k3 - pmullw xmm5, k5k4 - - paddsw xmm0, xmm6 ;sum - movdqa xmm1, xmm0 - psrldq xmm1, 8 - paddsw xmm0, xmm1 - paddsw xmm0, xmm2 - psrldq xmm2, 8 - paddsw xmm0, xmm5 - psrldq xmm5, 8 - paddsw xmm0, xmm2 - paddsw xmm0, xmm5 - - paddsw xmm0, krd ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack to byte - -%if %1 - movd xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movd [rdi], xmm0 -%endm - -%macro GET_FILTERS 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm7, [rdx] ;load filters - pshuflw xmm0, xmm7, 0b ;k0 - pshuflw xmm1, xmm7, 01010101b ;k1 - pshuflw xmm2, xmm7, 10101010b ;k2 - pshuflw xmm3, xmm7, 11111111b ;k3 - pshufhw xmm4, xmm7, 0b ;k4 - pshufhw xmm5, xmm7, 01010101b ;k5 - pshufhw xmm6, xmm7, 10101010b ;k6 - pshufhw xmm7, xmm7, 11111111b ;k7 - - punpcklwd xmm0, xmm0 - punpcklwd xmm1, xmm1 - punpcklwd xmm2, xmm2 - punpcklwd xmm3, xmm3 - punpckhwd xmm4, xmm4 - punpckhwd xmm5, xmm5 - punpckhwd xmm6, xmm6 - punpckhwd xmm7, xmm7 - - movdqa k0, xmm0 ;store filter factors on stack - movdqa k1, xmm1 - movdqa k2, xmm2 - movdqa k3, xmm3 - movdqa k4, xmm4 - movdqa k5, xmm5 - movdqa k6, xmm6 - movdqa k7, xmm7 - - movq xmm6, rcx - pshufd xmm6, xmm6, 0 - movdqa krd, xmm6 ;rounding - - pxor xmm7, xmm7 - movdqa zero, xmm7 -%endm - -%macro LOAD_VERT_8 1 - movq xmm0, [rsi + %1] ;0 - movq xmm1, [rsi + rax + %1] ;1 - movq xmm6, [rsi + rdx * 2 + %1] ;6 - lea rsi, [rsi + rax] - movq xmm7, [rsi + rdx * 2 + %1] ;7 - movq xmm2, [rsi + rax + %1] ;2 - movq xmm3, [rsi + rax * 2 + %1] ;3 - movq xmm4, [rsi + rdx + %1] ;4 - movq xmm5, [rsi + rax * 4 + %1] ;5 -%endm - -%macro APPLY_FILTER_8 2 - punpcklbw xmm0, zero - punpcklbw xmm1, zero - punpcklbw xmm6, zero - punpcklbw xmm7, zero - punpcklbw xmm2, zero - punpcklbw xmm5, zero - punpcklbw xmm3, zero - punpcklbw xmm4, zero - - pmullw xmm0, k0 - pmullw xmm1, k1 - pmullw xmm6, k6 - pmullw xmm7, k7 - pmullw xmm2, k2 - pmullw xmm5, k5 - pmullw xmm3, k3 - pmullw xmm4, k4 - - paddsw xmm0, xmm1 - paddsw xmm0, xmm6 - paddsw xmm0, xmm7 - paddsw xmm0, xmm2 - paddsw xmm0, xmm5 - paddsw xmm0, xmm3 - paddsw xmm0, xmm4 - - paddsw xmm0, krd ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack back to byte -%if %1 - movq xmm1, [rdi + %2] - pavgb xmm0, xmm1 -%endif - movq [rdi + %2], xmm0 -%endm - -;void vpx_filter_block1d4_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; 
short *filter -;) -global sym(vpx_filter_block1d4_v8_sse2) PRIVATE -sym(vpx_filter_block1d4_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movd xmm0, [rsi] ;load src: row 0 - movd xmm1, [rsi + rax] ;1 - movd xmm6, [rsi + rdx * 2] ;6 - lea rsi, [rsi + rax] - movd xmm7, [rsi + rdx * 2] ;7 - movd xmm2, [rsi + rax] ;2 - movd xmm3, [rsi + rax * 2] ;3 - movd xmm4, [rsi + rdx] ;4 - movd xmm5, [rsi + rax * 4] ;5 - - APPLY_FILTER_4 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vpx_filter_block1d8_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vpx_filter_block1d8_v8_sse2) PRIVATE -sym(vpx_filter_block1d8_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 0, 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vpx_filter_block1d16_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vpx_filter_block1d16_v8_sse2) PRIVATE -sym(vpx_filter_block1d16_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 0, 0 - sub rsi, rax - - LOAD_VERT_8 8 - APPLY_FILTER_8 0, 8 - add rdi, rbx - - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_v8_avg_sse2) PRIVATE -sym(vpx_filter_block1d4_v8_avg_sse2): - push 
rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movd xmm0, [rsi] ;load src: row 0 - movd xmm1, [rsi + rax] ;1 - movd xmm6, [rsi + rdx * 2] ;6 - lea rsi, [rsi + rax] - movd xmm7, [rsi + rdx * 2] ;7 - movd xmm2, [rsi + rax] ;2 - movd xmm3, [rsi + rax * 2] ;3 - movd xmm4, [rsi + rdx] ;4 - movd xmm5, [rsi + rax * 4] ;5 - - APPLY_FILTER_4 1 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_v8_avg_sse2) PRIVATE -sym(vpx_filter_block1d8_v8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 1, 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_v8_avg_sse2) PRIVATE -sym(vpx_filter_block1d16_v8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 1, 0 - sub rsi, rax - - LOAD_VERT_8 8 - APPLY_FILTER_8 1, 8 - add rdi, rbx - - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vpx_filter_block1d4_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vpx_filter_block1d4_h8_sse2) PRIVATE -sym(vpx_filter_block1d4_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define 
zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm3, xmm0 - movdqa xmm5, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm3, 3 - psrldq xmm5, 5 - psrldq xmm4, 4 - - APPLY_FILTER_4 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vpx_filter_block1d8_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vpx_filter_block1d8_h8_sse2) PRIVATE -sym(vpx_filter_block1d8_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 0, 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vpx_filter_block1d16_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vpx_filter_block1d16_h8_sse2) PRIVATE -sym(vpx_filter_block1d16_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 0, 0 - - movdqu xmm0, [rsi + 5] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - 
psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 0, 8 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_h8_avg_sse2) PRIVATE -sym(vpx_filter_block1d4_h8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm3, xmm0 - movdqa xmm5, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm3, 3 - psrldq xmm5, 5 - psrldq xmm4, 4 - - APPLY_FILTER_4 1 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_h8_avg_sse2) PRIVATE -sym(vpx_filter_block1d8_h8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 1, 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_h8_avg_sse2) PRIVATE -sym(vpx_filter_block1d16_h8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - 
psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 1, 0 - - movdqu xmm0, [rsi + 5] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 1, 8 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm deleted file mode 100644 index d2cb8ea292..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm +++ /dev/null @@ -1,629 +0,0 @@ -; -; Copyright (c) 2015 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pw_64: times 8 dw 64 - -; %define USE_PMULHRSW -; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss -; when using this instruction. -; -; The add order below (based on ffvp9) must be followed to prevent outranges. -; x = k0k1 + k4k5 -; y = k2k3 + k6k7 -; z = signed SAT(x + y) - -SECTION .text -%if ARCH_X86_64 - %define LOCAL_VARS_SIZE 16*4 -%else - %define LOCAL_VARS_SIZE 16*6 -%endif - -%macro SETUP_LOCAL_VARS 0 - ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 + - ; pmaddubsw has a higher latency on some platforms, this might be eased by - ; interleaving the instructions. - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - packsswb m4, m4 - ; TODO(slavarnway): multiple pshufb instructions had a higher latency on - ; some platforms. 
- pshuflw m0, m4, 0b ;k0_k1 - pshuflw m1, m4, 01010101b ;k2_k3 - pshuflw m2, m4, 10101010b ;k4_k5 - pshuflw m3, m4, 11111111b ;k6_k7 - punpcklqdq m0, m0 - punpcklqdq m1, m1 - punpcklqdq m2, m2 - punpcklqdq m3, m3 - mova k0k1, m0 - mova k2k3, m1 - mova k4k5, m2 - mova k6k7, m3 -%if ARCH_X86_64 - %define krd m12 - %define tmp m13 - mova krd, [GLOBAL(pw_64)] -%else - %define tmp [rsp + 16*4] - %define krd [rsp + 16*5] -%if CONFIG_PIC=0 - mova m6, [GLOBAL(pw_64)] -%else - ; build constants without accessing global memory - pcmpeqb m6, m6 ;all ones - psrlw m6, 15 - psllw m6, 6 ;aka pw_64 -%endif - mova krd, m6 -%endif -%endm - -%macro HORIZx4_ROW 2 - mova %2, %1 - punpcklbw %1, %1 - punpckhbw %2, %2 - - mova m3, %2 - palignr %2, %1, 1 - palignr m3, %1, 5 - - pmaddubsw %2, k0k1k4k5 - pmaddubsw m3, k2k3k6k7 - mova m4, %2 ;k0k1 - mova m5, m3 ;k2k3 - psrldq %2, 8 ;k4k5 - psrldq m3, 8 ;k6k7 - paddsw %2, m4 - paddsw m5, m3 - paddsw %2, m5 - paddsw %2, krd - psraw %2, 7 - packuswb %2, %2 -%endm - -;------------------------------------------------------------------------------- -%macro SUBPIX_HFILTER4 1 -cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - packsswb m4, m4 -%if ARCH_X86_64 - %define k0k1k4k5 m8 - %define k2k3k6k7 m9 - %define krd m10 - %define orig_height r7d - mova krd, [GLOBAL(pw_64)] - pshuflw k0k1k4k5, m4, 0b ;k0_k1 - pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5 - pshuflw k2k3k6k7, m4, 01010101b ;k2_k3 - pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7 -%else - %define k0k1k4k5 [rsp + 16*0] - %define k2k3k6k7 [rsp + 16*1] - %define krd [rsp + 16*2] - %define orig_height [rsp + 16*3] - pshuflw m6, m4, 0b ;k0_k1 - pshufhw m6, m6, 10101010b ;k0_k1_k4_k5 - pshuflw m7, m4, 01010101b ;k2_k3 - pshufhw m7, m7, 11111111b ;k2_k3_k6_k7 -%if CONFIG_PIC=0 - mova m1, [GLOBAL(pw_64)] -%else - ; build constants without accessing global memory - pcmpeqb m1, m1 ;all ones - psrlw m1, 15 - psllw m1, 6 ;aka pw_64 -%endif - mova k0k1k4k5, m6 - mova k2k3k6k7, m7 - mova krd, m1 -%endif - mov orig_height, heightd - shr heightd, 1 -.loop: - ;Do two rows at once - movh m0, [srcq - 3] - movh m1, [srcq + 5] - punpcklqdq m0, m1 - mova m1, m0 - movh m2, [srcq + sstrideq - 3] - movh m3, [srcq + sstrideq + 5] - punpcklqdq m2, m3 - mova m3, m2 - punpcklbw m0, m0 - punpckhbw m1, m1 - punpcklbw m2, m2 - punpckhbw m3, m3 - mova m4, m1 - palignr m4, m0, 1 - pmaddubsw m4, k0k1k4k5 - palignr m1, m0, 5 - pmaddubsw m1, k2k3k6k7 - mova m7, m3 - palignr m7, m2, 1 - pmaddubsw m7, k0k1k4k5 - palignr m3, m2, 5 - pmaddubsw m3, k2k3k6k7 - mova m0, m4 ;k0k1 - mova m5, m1 ;k2k3 - mova m2, m7 ;k0k1 upper - psrldq m4, 8 ;k4k5 - psrldq m1, 8 ;k6k7 - paddsw m4, m0 - paddsw m5, m1 - mova m1, m3 ;k2k3 upper - psrldq m7, 8 ;k4k5 upper - psrldq m3, 8 ;k6k7 upper - paddsw m7, m2 - paddsw m4, m5 - paddsw m1, m3 - paddsw m7, m1 - paddsw m4, krd - psraw m4, 7 - packuswb m4, m4 - paddsw m7, krd - psraw m7, 7 - packuswb m7, m7 - -%ifidn %1, h8_avg - movd m0, [dstq] - pavgb m4, m0 - movd m2, [dstq + dstrideq] - pavgb m7, m2 -%endif - movd [dstq], m4 - movd [dstq + dstrideq], m7 - - lea srcq, [srcq + sstrideq ] - prefetcht0 [srcq + 4 * sstrideq - 3] - lea srcq, [srcq + sstrideq ] - lea dstq, [dstq + 2 * dstrideq ] - prefetcht0 [srcq + 2 * sstrideq - 3] - - dec heightd - jnz .loop - - ; Do last row if output_height is odd - mov heightd, orig_height - and heightd, 1 - je .done - - movh m0, [srcq - 3] ; load src - movh m1, [srcq + 5] - punpcklqdq m0, m1 - - 
HORIZx4_ROW m0, m1 -%ifidn %1, h8_avg - movd m0, [dstq] - pavgb m1, m0 -%endif - movd [dstq], m1 -.done - RET -%endm - -%macro HORIZx8_ROW 5 - mova %2, %1 - punpcklbw %1, %1 - punpckhbw %2, %2 - - mova %3, %2 - mova %4, %2 - mova %5, %2 - - palignr %2, %1, 1 - palignr %3, %1, 5 - palignr %4, %1, 9 - palignr %5, %1, 13 - - pmaddubsw %2, k0k1 - pmaddubsw %3, k2k3 - pmaddubsw %4, k4k5 - pmaddubsw %5, k6k7 - paddsw %2, %4 - paddsw %5, %3 - paddsw %2, %5 - paddsw %2, krd - psraw %2, 7 - packuswb %2, %2 - SWAP %1, %2 -%endm - -;------------------------------------------------------------------------------- -%macro SUBPIX_HFILTER8 1 -cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*1), 14, LOCAL_VARS_SIZE, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - SETUP_LOCAL_VARS -%if ARCH_X86_64 - %define orig_height r7d -%else - %define orig_height heightmp -%endif - mov orig_height, heightd - shr heightd, 1 - -.loop: - movh m0, [srcq - 3] - movh m3, [srcq + 5] - movh m4, [srcq + sstrideq - 3] - movh m7, [srcq + sstrideq + 5] - punpcklqdq m0, m3 - mova m1, m0 - punpcklbw m0, m0 - punpckhbw m1, m1 - mova m5, m1 - palignr m5, m0, 13 - pmaddubsw m5, k6k7 - mova m2, m1 - mova m3, m1 - palignr m1, m0, 1 - pmaddubsw m1, k0k1 - punpcklqdq m4, m7 - mova m6, m4 - punpcklbw m4, m4 - palignr m2, m0, 5 - punpckhbw m6, m6 - palignr m3, m0, 9 - mova m7, m6 - pmaddubsw m2, k2k3 - pmaddubsw m3, k4k5 - - palignr m7, m4, 13 - mova m0, m6 - palignr m0, m4, 5 - pmaddubsw m7, k6k7 - paddsw m1, m3 - paddsw m2, m5 - paddsw m1, m2 - mova m5, m6 - palignr m6, m4, 1 - pmaddubsw m0, k2k3 - pmaddubsw m6, k0k1 - palignr m5, m4, 9 - paddsw m1, krd - pmaddubsw m5, k4k5 - psraw m1, 7 - paddsw m0, m7 -%ifidn %1, h8_avg - movh m7, [dstq] - movh m2, [dstq + dstrideq] -%endif - packuswb m1, m1 - paddsw m6, m5 - paddsw m6, m0 - paddsw m6, krd - psraw m6, 7 - packuswb m6, m6 -%ifidn %1, h8_avg - pavgb m1, m7 - pavgb m6, m2 -%endif - movh [dstq], m1 - movh [dstq + dstrideq], m6 - - lea srcq, [srcq + sstrideq ] - prefetcht0 [srcq + 4 * sstrideq - 3] - lea srcq, [srcq + sstrideq ] - lea dstq, [dstq + 2 * dstrideq ] - prefetcht0 [srcq + 2 * sstrideq - 3] - dec heightd - jnz .loop - - ;Do last row if output_height is odd - mov heightd, orig_height - and heightd, 1 - je .done - - movh m0, [srcq - 3] - movh m3, [srcq + 5] - punpcklqdq m0, m3 - - HORIZx8_ROW m0, m1, m2, m3, m4 - -%ifidn %1, h8_avg - movh m1, [dstq] - pavgb m0, m1 -%endif - movh [dstq], m0 -.done: - RET -%endm - -;------------------------------------------------------------------------------- -%macro SUBPIX_HFILTER16 1 -cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 14, LOCAL_VARS_SIZE, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - SETUP_LOCAL_VARS -.loop: - prefetcht0 [srcq + 2 * sstrideq -3] - - movh m0, [srcq - 3] - movh m4, [srcq + 5] - movh m6, [srcq + 13] - punpcklqdq m0, m4 - mova m7, m0 - punpckhbw m0, m0 - mova m1, m0 - punpcklqdq m4, m6 - mova m3, m0 - punpcklbw m7, m7 - - palignr m3, m7, 13 - mova m2, m0 - pmaddubsw m3, k6k7 - palignr m0, m7, 1 - pmaddubsw m0, k0k1 - palignr m1, m7, 5 - pmaddubsw m1, k2k3 - palignr m2, m7, 9 - pmaddubsw m2, k4k5 - paddsw m1, m3 - mova m3, m4 - punpckhbw m4, m4 - mova m5, m4 - punpcklbw m3, m3 - mova m7, m4 - palignr m5, m3, 5 - mova m6, m4 - palignr m4, m3, 1 - pmaddubsw m4, k0k1 - pmaddubsw m5, k2k3 - palignr m6, m3, 9 - pmaddubsw m6, k4k5 - palignr m7, m3, 13 - pmaddubsw m7, k6k7 - paddsw m0, m2 - paddsw m0, m1 -%ifidn %1, h8_avg - mova m1, [dstq] -%endif - paddsw m4, m6 - paddsw m5, m7 - 
paddsw m4, m5 - paddsw m0, krd - paddsw m4, krd - psraw m0, 7 - psraw m4, 7 - packuswb m0, m4 -%ifidn %1, h8_avg - pavgb m0, m1 -%endif - lea srcq, [srcq + sstrideq] - mova [dstq], m0 - lea dstq, [dstq + dstrideq] - dec heightd - jnz .loop - RET -%endm - -INIT_XMM ssse3 -SUBPIX_HFILTER16 h8 -SUBPIX_HFILTER16 h8_avg -SUBPIX_HFILTER8 h8 -SUBPIX_HFILTER8 h8_avg -SUBPIX_HFILTER4 h8 -SUBPIX_HFILTER4 h8_avg - -;------------------------------------------------------------------------------- -%macro SUBPIX_VFILTER 2 -cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - SETUP_LOCAL_VARS -%if ARCH_X86_64 - %define src1q r7 - %define sstride6q r8 - %define dst_stride dstrideq -%else - %define src1q filterq - %define sstride6q dstrideq - %define dst_stride dstridemp -%endif - mov src1q, srcq - add src1q, sstrideq - lea sstride6q, [sstrideq + sstrideq * 4] - add sstride6q, sstrideq ;pitch * 6 - -%ifidn %2, 8 - %define movx movh -%else - %define movx movd -%endif -.loop: - movx m0, [srcq ] ;A - movx m1, [srcq + sstrideq ] ;B - punpcklbw m0, m1 ;A B - movx m2, [srcq + sstrideq * 2 ] ;C - pmaddubsw m0, k0k1 - mova m6, m2 - movx m3, [src1q + sstrideq * 2] ;D - punpcklbw m2, m3 ;C D - pmaddubsw m2, k2k3 - movx m4, [srcq + sstrideq * 4 ] ;E - mova m7, m4 - movx m5, [src1q + sstrideq * 4] ;F - punpcklbw m4, m5 ;E F - pmaddubsw m4, k4k5 - punpcklbw m1, m6 ;A B next iter - movx m6, [srcq + sstride6q ] ;G - punpcklbw m5, m6 ;E F next iter - punpcklbw m3, m7 ;C D next iter - pmaddubsw m5, k4k5 - movx m7, [src1q + sstride6q ] ;H - punpcklbw m6, m7 ;G H - pmaddubsw m6, k6k7 - pmaddubsw m3, k2k3 - pmaddubsw m1, k0k1 - paddsw m0, m4 - paddsw m2, m6 - movx m6, [srcq + sstrideq * 8 ] ;H next iter - punpcklbw m7, m6 - pmaddubsw m7, k6k7 - paddsw m0, m2 - paddsw m0, krd - psraw m0, 7 - paddsw m1, m5 - packuswb m0, m0 - - paddsw m3, m7 - paddsw m1, m3 - paddsw m1, krd - psraw m1, 7 - lea srcq, [srcq + sstrideq * 2 ] - lea src1q, [src1q + sstrideq * 2] - packuswb m1, m1 - -%ifidn %1, v8_avg - movx m2, [dstq] - pavgb m0, m2 -%endif - movx [dstq], m0 - add dstq, dst_stride -%ifidn %1, v8_avg - movx m3, [dstq] - pavgb m1, m3 -%endif - movx [dstq], m1 - add dstq, dst_stride - sub heightd, 2 - cmp heightd, 1 - jg .loop - - cmp heightd, 0 - je .done - - movx m0, [srcq ] ;A - movx m1, [srcq + sstrideq ] ;B - movx m6, [srcq + sstride6q ] ;G - punpcklbw m0, m1 ;A B - movx m7, [src1q + sstride6q ] ;H - pmaddubsw m0, k0k1 - movx m2, [srcq + sstrideq * 2 ] ;C - punpcklbw m6, m7 ;G H - movx m3, [src1q + sstrideq * 2] ;D - pmaddubsw m6, k6k7 - movx m4, [srcq + sstrideq * 4 ] ;E - punpcklbw m2, m3 ;C D - movx m5, [src1q + sstrideq * 4] ;F - punpcklbw m4, m5 ;E F - pmaddubsw m2, k2k3 - pmaddubsw m4, k4k5 - paddsw m2, m6 - paddsw m0, m4 - paddsw m0, m2 - paddsw m0, krd - psraw m0, 7 - packuswb m0, m0 -%ifidn %1, v8_avg - movx m1, [dstq] - pavgb m0, m1 -%endif - movx [dstq], m0 -.done: - RET -%endm - -;------------------------------------------------------------------------------- -%macro SUBPIX_VFILTER16 1 -cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - SETUP_LOCAL_VARS -%if ARCH_X86_64 - %define src1q r7 - %define sstride6q r8 - %define dst_stride dstrideq -%else - %define src1q filterq - %define sstride6q dstrideq - %define dst_stride dstridemp -%endif - mov src1q, srcq - add src1q, sstrideq - lea sstride6q, [sstrideq + sstrideq * 4] - add sstride6q, 
sstrideq ;pitch * 6 - -.loop: - movh m0, [srcq ] ;A - movh m1, [srcq + sstrideq ] ;B - movh m2, [srcq + sstrideq * 2 ] ;C - movh m3, [src1q + sstrideq * 2] ;D - movh m4, [srcq + sstrideq * 4 ] ;E - movh m5, [src1q + sstrideq * 4] ;F - - punpcklbw m0, m1 ;A B - movh m6, [srcq + sstride6q] ;G - punpcklbw m2, m3 ;C D - movh m7, [src1q + sstride6q] ;H - punpcklbw m4, m5 ;E F - pmaddubsw m0, k0k1 - movh m3, [srcq + 8] ;A - pmaddubsw m2, k2k3 - punpcklbw m6, m7 ;G H - movh m5, [srcq + sstrideq + 8] ;B - pmaddubsw m4, k4k5 - punpcklbw m3, m5 ;A B - movh m7, [srcq + sstrideq * 2 + 8] ;C - pmaddubsw m6, k6k7 - movh m5, [src1q + sstrideq * 2 + 8] ;D - punpcklbw m7, m5 ;C D - paddsw m2, m6 - pmaddubsw m3, k0k1 - movh m1, [srcq + sstrideq * 4 + 8] ;E - paddsw m0, m4 - pmaddubsw m7, k2k3 - movh m6, [src1q + sstrideq * 4 + 8] ;F - punpcklbw m1, m6 ;E F - paddsw m0, m2 - paddsw m0, krd - movh m2, [srcq + sstride6q + 8] ;G - pmaddubsw m1, k4k5 - movh m5, [src1q + sstride6q + 8] ;H - psraw m0, 7 - punpcklbw m2, m5 ;G H - pmaddubsw m2, k6k7 -%ifidn %1, v8_avg - mova m4, [dstq] -%endif - movh [dstq], m0 - paddsw m7, m2 - paddsw m3, m1 - paddsw m3, m7 - paddsw m3, krd - psraw m3, 7 - packuswb m0, m3 - - add srcq, sstrideq - add src1q, sstrideq -%ifidn %1, v8_avg - pavgb m0, m4 -%endif - mova [dstq], m0 - add dstq, dst_stride - dec heightd - jnz .loop - RET -%endm - -INIT_XMM ssse3 -SUBPIX_VFILTER16 v8 -SUBPIX_VFILTER16 v8_avg -SUBPIX_VFILTER v8, 8 -SUBPIX_VFILTER v8_avg, 8 -SUBPIX_VFILTER v8, 4 -SUBPIX_VFILTER v8_avg, 4 diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm deleted file mode 100644 index a378dd0402..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm +++ /dev/null @@ -1,448 +0,0 @@ -; -; Copyright (c) 2014 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - -%include "vpx_ports/x86_abi_support.asm" - -%macro GET_PARAM_4 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm3, [rdx] ;load filters - pshuflw xmm4, xmm3, 11111111b ;k3 - psrldq xmm3, 8 - pshuflw xmm3, xmm3, 0b ;k4 - punpcklqdq xmm4, xmm3 ;k3k4 - - movq xmm3, rcx ;rounding - pshufd xmm3, xmm3, 0 - - pxor xmm2, xmm2 - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height -%endm - -%macro APPLY_FILTER_4 1 - - punpckldq xmm0, xmm1 ;two row in one register - punpcklbw xmm0, xmm2 ;unpack to word - pmullw xmm0, xmm4 ;multiply the filter factors - - movdqa xmm1, xmm0 - psrldq xmm1, 8 - paddsw xmm0, xmm1 - - paddsw xmm0, xmm3 ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack to byte - -%if %1 - movd xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - - movd [rdi], xmm0 - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -%macro GET_PARAM 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm7, [rdx] ;load filters - - pshuflw xmm6, xmm7, 11111111b ;k3 - pshufhw xmm7, xmm7, 0b ;k4 - punpcklwd xmm6, xmm6 - punpckhwd xmm7, xmm7 - - movq xmm4, rcx ;rounding - pshufd xmm4, xmm4, 0 - - pxor xmm5, xmm5 - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height -%endm - -%macro APPLY_FILTER_8 1 - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - - pmullw xmm0, xmm6 - pmullw xmm1, xmm7 - paddsw xmm0, xmm1 - paddsw xmm0, xmm4 ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack back to byte -%if %1 - movq xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movq [rdi], xmm0 ;store the result - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -%macro APPLY_FILTER_16 1 - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - punpckhbw xmm2, xmm5 - punpckhbw xmm3, xmm5 - - pmullw xmm0, xmm6 - pmullw xmm1, xmm7 - pmullw xmm2, xmm6 - pmullw xmm3, xmm7 - - paddsw xmm0, xmm1 - paddsw xmm2, xmm3 - - paddsw xmm0, xmm4 ;rounding - paddsw xmm2, xmm4 - psraw xmm0, 7 ;shift - psraw xmm2, 7 - packuswb xmm0, xmm2 ;pack back to byte -%if %1 - movdqu xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movdqu [rdi], xmm0 ;store the result - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -global sym(vpx_filter_block1d4_v2_sse2) PRIVATE -sym(vpx_filter_block1d4_v2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movd xmm0, [rsi] ;load src - movd xmm1, [rsi + rax] - - APPLY_FILTER_4 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_v2_sse2) PRIVATE -sym(vpx_filter_block1d8_v2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movq xmm0, [rsi] ;0 - movq xmm1, [rsi + rax] ;1 - - APPLY_FILTER_8 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_v2_sse2) PRIVATE -sym(vpx_filter_block1d16_v2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm1, [rsi + rax] ;1 - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - APPLY_FILTER_16 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - 
UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_v2_avg_sse2) PRIVATE -sym(vpx_filter_block1d4_v2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movd xmm0, [rsi] ;load src - movd xmm1, [rsi + rax] - - APPLY_FILTER_4 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_v2_avg_sse2) PRIVATE -sym(vpx_filter_block1d8_v2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movq xmm0, [rsi] ;0 - movq xmm1, [rsi + rax] ;1 - - APPLY_FILTER_8 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_v2_avg_sse2) PRIVATE -sym(vpx_filter_block1d16_v2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm1, [rsi + rax] ;1 - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - APPLY_FILTER_16 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_h2_sse2) PRIVATE -sym(vpx_filter_block1d4_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_4 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_h2_sse2) PRIVATE -sym(vpx_filter_block1d8_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_8 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_h2_sse2) PRIVATE -sym(vpx_filter_block1d16_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 1] - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - APPLY_FILTER_16 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_h2_avg_sse2) PRIVATE -sym(vpx_filter_block1d4_h2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_4 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_h2_avg_sse2) PRIVATE -sym(vpx_filter_block1d8_h2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_8 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_h2_avg_sse2) PRIVATE -sym(vpx_filter_block1d16_h2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 1] - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - APPLY_FILTER_16 1 - jnz .loop - - ; begin epilog - 
pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm deleted file mode 100644 index 3c8cfd2253..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm +++ /dev/null @@ -1,422 +0,0 @@ -; -; Copyright (c) 2014 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "vpx_ports/x86_abi_support.asm" - -%macro GET_PARAM_4 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm3, [rdx] ;load filters - psrldq xmm3, 6 - packsswb xmm3, xmm3 - pshuflw xmm3, xmm3, 0b ;k3_k4 - - movq xmm2, rcx ;rounding - pshufd xmm2, xmm2, 0 - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height -%endm - -%macro APPLY_FILTER_4 1 - punpcklbw xmm0, xmm1 - pmaddubsw xmm0, xmm3 - - paddsw xmm0, xmm2 ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack to byte - -%if %1 - movd xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movd [rdi], xmm0 - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -%macro GET_PARAM 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm7, [rdx] ;load filters - psrldq xmm7, 6 - packsswb xmm7, xmm7 - pshuflw xmm7, xmm7, 0b ;k3_k4 - punpcklwd xmm7, xmm7 - - movq xmm6, rcx ;rounding - pshufd xmm6, xmm6, 0 - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height -%endm - -%macro APPLY_FILTER_8 1 - punpcklbw xmm0, xmm1 - pmaddubsw xmm0, xmm7 - - paddsw xmm0, xmm6 ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack back to byte - -%if %1 - movq xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movq [rdi], xmm0 ;store the result - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -%macro APPLY_FILTER_16 1 - punpcklbw xmm0, xmm1 - punpckhbw xmm2, xmm1 - pmaddubsw xmm0, xmm7 - pmaddubsw xmm2, xmm7 - - paddsw xmm0, xmm6 ;rounding - paddsw xmm2, xmm6 - psraw xmm0, 7 ;shift - psraw xmm2, 7 - packuswb xmm0, xmm2 ;pack back to byte - -%if %1 - movdqu xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movdqu [rdi], xmm0 ;store the result - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -global sym(vpx_filter_block1d4_v2_ssse3) PRIVATE -sym(vpx_filter_block1d4_v2_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movd xmm0, [rsi] ;load src - movd xmm1, [rsi + rax] - - APPLY_FILTER_4 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_v2_ssse3) PRIVATE -sym(vpx_filter_block1d8_v2_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movq xmm0, [rsi] ;0 - movq xmm1, [rsi + rax] ;1 - - APPLY_FILTER_8 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_v2_ssse3) PRIVATE 
-sym(vpx_filter_block1d16_v2_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm1, [rsi + rax] ;1 - movdqa xmm2, xmm0 - - APPLY_FILTER_16 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_v2_avg_ssse3) PRIVATE -sym(vpx_filter_block1d4_v2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movd xmm0, [rsi] ;load src - movd xmm1, [rsi + rax] - - APPLY_FILTER_4 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_v2_avg_ssse3) PRIVATE -sym(vpx_filter_block1d8_v2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movq xmm0, [rsi] ;0 - movq xmm1, [rsi + rax] ;1 - - APPLY_FILTER_8 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_v2_avg_ssse3) PRIVATE -sym(vpx_filter_block1d16_v2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm1, [rsi + rax] ;1 - movdqa xmm2, xmm0 - - APPLY_FILTER_16 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_h2_ssse3) PRIVATE -sym(vpx_filter_block1d4_h2_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_4 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_h2_ssse3) PRIVATE -sym(vpx_filter_block1d8_h2_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_8 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_h2_ssse3) PRIVATE -sym(vpx_filter_block1d16_h2_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 1] - movdqa xmm2, xmm0 - - APPLY_FILTER_16 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_h2_avg_ssse3) PRIVATE -sym(vpx_filter_block1d4_h2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_4 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_h2_avg_ssse3) PRIVATE -sym(vpx_filter_block1d8_h2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_8 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_h2_avg_ssse3) PRIVATE 
-sym(vpx_filter_block1d16_h2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 1] - movdqa xmm2, xmm0 - - APPLY_FILTER_16 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret |