Diffstat (limited to 'thirdparty/libvpx/vpx_dsp/x86')
18 files changed, 0 insertions, 15330 deletions
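The first file removed below, convolve.h, exists to stamp out width-dispatch wrappers around fixed-width SIMD filter kernels. A minimal C sketch of the dispatch pattern inside the FUN_CONV_1D macro shown below (hypothetical names; the real macro pastes together kernels such as vpx_filter_block1d16_h8_sse2):

#include <stddef.h>
#include <stdint.h>

typedef void (*filter_1d_fn)(const uint8_t *src, ptrdiff_t src_pitch,
                             uint8_t *dst, ptrdiff_t dst_pitch,
                             uint32_t height, const int16_t *filter);

/* Tile the block into 16-pixel-wide columns, then finish with one 8- or
 * 4-wide column, handing each column to the matching fixed-width kernel. */
static void convolve_1d_dispatch(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 int w, int h, const int16_t *filter,
                                 filter_1d_fn k16, filter_1d_fn k8,
                                 filter_1d_fn k4) {
  while (w >= 16) {
    k16(src, src_stride, dst, dst_stride, h, filter);
    src += 16;
    dst += 16;
    w -= 16;
  }
  if (w == 8) {
    k8(src, src_stride, dst, dst_stride, h, filter);
  } else if (w == 4) {
    k4(src, src_stride, dst, dst_stride, h, filter);
  }
}

FUN_CONV_2D then composes two such passes: on the 8-tap path the horizontal pass starts at src - 3 * src_stride and writes h + 7 rows into a 64-byte-stride scratch buffer, so the vertical pass (reading from fdata2 + 3 * 64) has the three rows above and four rows below the block that its taps need.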
diff --git a/thirdparty/libvpx/vpx_dsp/x86/convolve.h b/thirdparty/libvpx/vpx_dsp/x86/convolve.h deleted file mode 100644 index 7e43eb7c72..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/convolve.h +++ /dev/null @@ -1,274 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#ifndef VPX_DSP_X86_CONVOLVE_H_ -#define VPX_DSP_X86_CONVOLVE_H_ - -#include <assert.h> - -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" -#include "vpx_ports/mem.h" - -typedef void filter8_1dfunction ( - const uint8_t *src_ptr, - ptrdiff_t src_pitch, - uint8_t *output_ptr, - ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter -); - -#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void vpx_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h) { \ - assert(filter[3] != 128); \ - assert(step_q4 == 16); \ - if (filter[0] | filter[1] | filter[2]) { \ - while (w >= 16) { \ - vpx_filter_block1d16_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - if (w == 8) { \ - vpx_filter_block1d8_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - } else if (w == 4) { \ - vpx_filter_block1d4_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - } \ - } else { \ - while (w >= 16) { \ - vpx_filter_block1d16_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - if (w == 8) { \ - vpx_filter_block1d8_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - } else if (w == 4) { \ - vpx_filter_block1d4_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - } \ - } \ -} - -#define FUN_CONV_2D(avg, opt) \ -void vpx_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h) { \ - assert(filter_x[3] != 128); \ - assert(filter_y[3] != 128); \ - assert(w <= 64); \ - assert(h <= 64); \ - assert(x_step_q4 == 16); \ - assert(y_step_q4 == 16); \ - if (filter_x[0] | filter_x[1] | filter_x[2]) { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \ - vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h + 7); \ - vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h); \ - } else { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \ - vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h + 1); \ - vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h); \ - } \ -} - -#if CONFIG_VP9_HIGHBITDEPTH - -typedef void highbd_filter8_1dfunction ( - const uint16_t 
*src_ptr, - const ptrdiff_t src_pitch, - uint16_t *output_ptr, - ptrdiff_t out_pitch, - unsigned int output_height, - const int16_t *filter, - int bd -); - -#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void vpx_highbd_convolve8_##name##_##opt(const uint8_t *src8, \ - ptrdiff_t src_stride, \ - uint8_t *dst8, \ - ptrdiff_t dst_stride, \ - const int16_t *filter_x, \ - int x_step_q4, \ - const int16_t *filter_y, \ - int y_step_q4, \ - int w, int h, int bd) { \ - if (step_q4 == 16 && filter[3] != 128) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - if (filter[0] | filter[1] | filter[2]) { \ - while (w >= 16) { \ - vpx_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } else { \ - while (w >= 16) { \ - vpx_highbd_filter_block1d16_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } \ - } \ - if (w) { \ - vpx_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h, bd); \ - } \ -} - -#define HIGH_FUN_CONV_2D(avg, opt) \ -void vpx_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h, int bd) { \ - assert(w <= 64); \ - assert(h <= 64); \ - if (x_step_q4 == 16 && y_step_q4 == 16) { \ - if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ - vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ - CONVERT_TO_BYTEPTR(fdata2), 64, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h + 7, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \ - 64, dst, dst_stride, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h, bd); \ - } else { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ - vpx_highbd_convolve8_horiz_##opt(src, src_stride, \ - CONVERT_TO_BYTEPTR(fdata2), 64, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h + 1, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \ - dst, dst_stride, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h, bd); \ - } \ - } else { \ - vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, \ - h, bd); \ - } \ -} -#endif // CONFIG_VP9_HIGHBITDEPTH - -#endif // VPX_DSP_X86_CONVOLVE_H_ diff --git a/thirdparty/libvpx/vpx_dsp/x86/intrapred_sse2.asm 
b/thirdparty/libvpx/vpx_dsp/x86/intrapred_sse2.asm deleted file mode 100644 index cd6a6ae982..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/intrapred_sse2.asm +++ /dev/null @@ -1,860 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pb_1: times 16 db 1 -pw_4: times 8 dw 4 -pw_8: times 8 dw 8 -pw_16: times 8 dw 16 -pw_32: times 8 dw 32 -dc_128: times 16 db 128 -pw2_4: times 8 dw 2 -pw2_8: times 8 dw 4 -pw2_16: times 8 dw 8 -pw2_32: times 8 dw 16 - -SECTION .text - -; ------------------------------------------ -; input: x, y, z, result -; -; trick from pascal -; (x+2y+z+2)>>2 can be calculated as: -; result = avg(x,z) -; result -= xor(x,z) & 1 -; result = avg(result,y) -; ------------------------------------------ -%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 - pavgb %4, %1, %3 - pxor %3, %1 - pand %3, [GLOBAL(pb_1)] - psubb %4, %3 - pavgb %4, %2 -%endmacro - -INIT_XMM sse2 -cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset - GET_GOT goffsetq - - movq m0, [aboveq] - DEFINE_ARGS dst, stride, temp - psrldq m1, m0, 1 - psrldq m2, m0, 2 - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 - - ; store 4 lines - movd [dstq ], m3 - psrlq m3, 8 - movd [dstq+strideq ], m3 - lea dstq, [dstq+strideq*2] - psrlq m3, 8 - movd [dstq ], m3 - psrlq m3, 8 - movd [dstq+strideq ], m3 - psrlq m0, 56 - movd tempq, m0 - mov [dstq+strideq+3], tempb - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset - GET_GOT goffsetq - - movu m1, [aboveq] - pslldq m0, m1, 1 - psrldq m2, m1, 1 - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 - punpckhbw m0, m0 ; 7 7 - punpcklwd m0, m0 ; 7 7 7 7 - punpckldq m0, m0 ; 7 7 7 7 7 7 7 7 - punpcklqdq m3, m0 ; -1 0 1 2 3 4 5 6 7 7 7 7 7 7 7 7 - - ; store 4 lines - psrldq m3, 1 - movq [dstq ], m3 - psrldq m3, 1 - movq [dstq+strideq ], m3 - psrldq m3, 1 - movq [dstq+strideq*2], m3 - psrldq m3, 1 - movq [dstq+stride3q ], m3 - lea dstq, [dstq+strideq*4] - - ; store next 4 lines - psrldq m3, 1 - movq [dstq ], m3 - psrldq m3, 1 - movq [dstq+strideq ], m3 - psrldq m3, 1 - movq [dstq+strideq*2], m3 - psrldq m3, 1 - movq [dstq+stride3q ], m3 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal d207_predictor_4x4, 4, 4, 5, dst, stride, unused, left, goffset - GET_GOT goffsetq - - movd m0, [leftq] ; abcd [byte] - punpcklbw m4, m0, m0 ; aabb ccdd - punpcklwd m4, m4 ; aaaa bbbb cccc dddd - psrldq m4, 12 ; dddd - punpckldq m0, m4 ; abcd dddd - psrldq m1, m0, 1 ; bcdd - psrldq m2, m0, 2 ; cddd - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; a2bc b2cd c3d d - pavgb m1, m0 ; ab, bc, cd, d [byte] - - punpcklbw m1, m3 ; ab, a2bc, bc, b2cd, cd, c3d, d, d - movd [dstq ], m1 - psrlq m1, 16 ; bc, b2cd, cd, c3d, d, d - movd [dstq+strideq], m1 - - lea dstq, [dstq+strideq*2] - psrlq m1, 16 ; cd, c3d, d, d - movd [dstq ], m1 - movd [dstq+strideq], m4 ; d, d, d, d - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - movd m2, [leftq] - movd m0, [aboveq] - pxor m1, m1 - punpckldq m0, m2 - psadbw m0, m1 - paddw m0, 
[GLOBAL(pw_4)] - psraw m0, 3 - pshuflw m0, m0, 0x0 - packuswb m0, m0 - movd [dstq ], m0 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - movd [dstq ], m0 - movd [dstq+strideq], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset - movifnidn leftq, leftmp - GET_GOT goffsetq - - pxor m1, m1 - movd m0, [leftq] - psadbw m0, m1 - paddw m0, [GLOBAL(pw2_4)] - psraw m0, 2 - pshuflw m0, m0, 0x0 - packuswb m0, m0 - movd [dstq ], m0 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - movd [dstq ], m0 - movd [dstq+strideq], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - movd m0, [aboveq] - psadbw m0, m1 - paddw m0, [GLOBAL(pw2_4)] - psraw m0, 2 - pshuflw m0, m0, 0x0 - packuswb m0, m0 - movd [dstq ], m0 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - movd [dstq ], m0 - movd [dstq+strideq], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - movq m0, [aboveq] - movq m2, [leftq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - paddw m0, [GLOBAL(pw_8)] - psraw m0, 4 - punpcklbw m0, m0 - pshuflw m0, m0, 0x0 - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - movq m0, [aboveq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - psadbw m0, m1 - paddw m0, [GLOBAL(pw2_8)] - psraw m0, 3 - punpcklbw m0, m0 - pshuflw m0, m0, 0x0 - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset - movifnidn leftq, leftmp - GET_GOT goffsetq - - pxor m1, m1 - movq m0, [leftq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - psadbw m0, m1 - paddw m0, [GLOBAL(pw2_8)] - psraw m0, 3 - punpcklbw m0, m0 - pshuflw m0, m0, 0x0 - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset - GET_GOT goffsetq - - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - movd m0, [GLOBAL(dc_128)] - movd [dstq ], m0 - movd [dstq+strideq ], m0 - movd [dstq+strideq*2], m0 - movd [dstq+stride3q ], m0 - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset - GET_GOT goffsetq - - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - movq m0, [GLOBAL(dc_128)] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - RESTORE_GOT - RET - 
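The predictors in this file lean on the 3-tap rounding trick documented near the top: (x + 2y + z + 2) >> 2 is computed entirely in byte lanes from two pavgb rounding averages and a 1-bit correction, with no widening to words. A standalone C check of that identity, transcribed from the comment and exhaustive over all byte triples:

#include <assert.h>
#include <stdint.h>

/* pavgb: average of two bytes, rounding up. */
static uint8_t avg_round(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}

int main(void) {
  for (int x = 0; x < 256; x++) {
    for (int y = 0; y < 256; y++) {
      for (int z = 0; z < 256; z++) {
        uint8_t r = avg_round((uint8_t)x, (uint8_t)z);
        r = (uint8_t)(r - ((x ^ z) & 1)); /* psubb with the pb_1 mask */
        r = avg_round(r, (uint8_t)y);
        assert(r == ((x + 2 * y + z + 2) >> 2));
      }
    }
  }
  return 0;
}

The correction term works because pavgb rounds up: subtracting (x ^ z) & 1 turns avg(x, z) into the truncating (x + z) >> 1, after which one more rounding average against y lands exactly on (x + 2y + z + 2) >> 2.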
-INIT_XMM sse2 -cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m2, [leftq] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw_16)] - psraw m0, 5 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - - -INIT_XMM sse2 -cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - psadbw m0, m1 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw2_16)] - psraw m0, 4 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [leftq] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - psadbw m0, m1 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw2_16)] - psraw m0, 4 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - mova m0, [GLOBAL(dc_128)] -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - RESTORE_GOT - RET - - -INIT_XMM sse2 -cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m2, [aboveq+16] - mova m3, [leftq] - mova m4, [leftq+16] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - psadbw m0, m1 - psadbw m2, m1 - psadbw m3, m1 - psadbw m4, m1 - paddw m0, m2 - paddw m0, m3 - paddw m0, m4 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw_32)] - psraw m0, 6 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m0 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m2, [aboveq+16] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw2_32)] - psraw m0, 5 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq ], 
m0 - mova [dstq+strideq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m0 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [leftq] - mova m2, [leftq+16] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw2_32)] - psraw m0, 5 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m0 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - mova m0, [GLOBAL(dc_128)] -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m0 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above - movd m0, [aboveq] - movd [dstq ], m0 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - movd [dstq ], m0 - movd [dstq+strideq], m0 - RET - -INIT_XMM sse2 -cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above - movq m0, [aboveq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - RET - -INIT_XMM sse2 -cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above - mova m0, [aboveq] - DEFINE_ARGS dst, stride, stride3, nlines4 - lea stride3q, [strideq*3] - mov nlines4d, 4 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec nlines4d - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above - mova m0, [aboveq] - mova m1, [aboveq+16] - DEFINE_ARGS dst, stride, stride3, nlines4 - lea stride3q, [strideq*3] - mov nlines4d, 8 -.loop: - mova [dstq ], m0 - mova [dstq +16], m1 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m1 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m1 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m1 - lea dstq, [dstq+strideq*4] - dec nlines4d - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left - movifnidn leftq, leftmp - movd m0, [leftq] - punpcklbw m0, m0 - punpcklbw m0, m0 - pshufd m1, m0, 0x1 - movd [dstq ], m0 - movd [dstq+strideq], m1 - pshufd m2, m0, 0x2 - lea dstq, [dstq+strideq*2] - pshufd m3, m0, 0x3 - movd [dstq ], m2 - movd [dstq+strideq], m3 - RET - -INIT_XMM sse2 -cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left - movifnidn leftq, leftmp - mov lineq, -2 - DEFINE_ARGS dst, stride, line, left, stride3 - lea stride3q, [strideq*3] - movq m0, [leftq ] - punpcklbw m0, m0 ; 
l1 l1 l2 l2 ... l8 l8 -.loop: - pshuflw m1, m0, 0x0 ; l1 l1 l1 l1 l1 l1 l1 l1 - pshuflw m2, m0, 0x55 ; l2 l2 l2 l2 l2 l2 l2 l2 - movq [dstq ], m1 - movq [dstq+strideq], m2 - pshuflw m1, m0, 0xaa - pshuflw m2, m0, 0xff - movq [dstq+strideq*2], m1 - movq [dstq+stride3q ], m2 - pshufd m0, m0, 0xe ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8 - inc lineq - lea dstq, [dstq+strideq*4] - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left - movifnidn leftq, leftmp - mov lineq, -4 - DEFINE_ARGS dst, stride, line, left, stride3 - lea stride3q, [strideq*3] -.loop: - movd m0, [leftq] - punpcklbw m0, m0 - punpcklbw m0, m0 ; l1 to l4 each repeated 4 times - pshufd m1, m0, 0x0 ; l1 repeated 16 times - pshufd m2, m0, 0x55 ; l2 repeated 16 times - mova [dstq ], m1 - mova [dstq+strideq ], m2 - pshufd m1, m0, 0xaa - pshufd m2, m0, 0xff - mova [dstq+strideq*2], m1 - mova [dstq+stride3q ], m2 - inc lineq - lea leftq, [leftq+4 ] - lea dstq, [dstq+strideq*4] - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left - movifnidn leftq, leftmp - mov lineq, -8 - DEFINE_ARGS dst, stride, line, left, stride3 - lea stride3q, [strideq*3] -.loop: - movd m0, [leftq] - punpcklbw m0, m0 - punpcklbw m0, m0 ; l1 to l4 each repeated 4 times - pshufd m1, m0, 0x0 ; l1 repeated 16 times - pshufd m2, m0, 0x55 ; l2 repeated 16 times - mova [dstq ], m1 - mova [dstq+16 ], m1 - mova [dstq+strideq ], m2 - mova [dstq+strideq+16 ], m2 - pshufd m1, m0, 0xaa - pshufd m2, m0, 0xff - mova [dstq+strideq*2 ], m1 - mova [dstq+strideq*2+16], m1 - mova [dstq+stride3q ], m2 - mova [dstq+stride3q+16 ], m2 - inc lineq - lea leftq, [leftq+4 ] - lea dstq, [dstq+strideq*4] - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left - pxor m1, m1 - movq m0, [aboveq-1]; [63:0] tl t1 t2 t3 t4 x x x - punpcklbw m0, m1 - pshuflw m2, m0, 0x0 ; [63:0] tl tl tl tl [word] - psrldq m0, 2 - psubw m0, m2 ; [63:0] t1-tl t2-tl t3-tl t4-tl [word] - movd m2, [leftq] - punpcklbw m2, m1 - pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word] - pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word] - paddw m4, m0 - paddw m3, m0 - packuswb m4, m4 - packuswb m3, m3 - movd [dstq ], m4 - movd [dstq+strideq], m3 - lea dstq, [dstq+strideq*2] - pshuflw m4, m2, 0xaa - pshuflw m3, m2, 0xff - paddw m4, m0 - paddw m3, m0 - packuswb m4, m4 - packuswb m3, m3 - movd [dstq ], m4 - movd [dstq+strideq], m3 - RET - -INIT_XMM sse2 -cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left - pxor m1, m1 - movd m2, [aboveq-1] - movq m0, [aboveq] - punpcklbw m2, m1 - punpcklbw m0, m1 ; t1 t2 t3 t4 t5 t6 t7 t8 [word] - pshuflw m2, m2, 0x0 ; [63:0] tl tl tl tl [word] - DEFINE_ARGS dst, stride, line, left - mov lineq, -4 - punpcklqdq m2, m2 ; tl tl tl tl tl tl tl tl [word] - psubw m0, m2 ; t1-tl t2-tl ... t8-tl [word] - movq m2, [leftq] - punpcklbw m2, m1 ; l1 l2 l3 l4 l5 l6 l7 l8 [word] -.loop - pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word] - pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word] - punpcklqdq m4, m4 ; l1 l1 l1 l1 l1 l1 l1 l1 [word] - punpcklqdq m3, m3 ; l2 l2 l2 l2 l2 l2 l2 l2 [word] - paddw m4, m0 - paddw m3, m0 - packuswb m4, m3 - movq [dstq ], m4 - movhps [dstq+strideq], m4 - lea dstq, [dstq+strideq*2] - psrldq m2, 4 - inc lineq - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal tm_predictor_16x16, 4, 5, 8, dst, stride, above, left - pxor m1, m1 - mova m2, [aboveq-16]; - mova m0, [aboveq] ; t1 t2 ... 
t16 [byte] - punpckhbw m2, m1 ; [127:112] tl [word] - punpckhbw m4, m0, m1 - punpcklbw m0, m1 ; m0:m4 t1 t2 ... t16 [word] - DEFINE_ARGS dst, stride, line, left, stride8 - mov lineq, -8 - pshufhw m2, m2, 0xff - mova m3, [leftq] ; l1 l2 ... l16 [byte] - punpckhqdq m2, m2 ; tl repeated 8 times [word] - psubw m0, m2 - psubw m4, m2 ; m0:m4 t1-tl t2-tl ... t16-tl [word] - punpckhbw m5, m3, m1 - punpcklbw m3, m1 ; m3:m5 l1 l2 ... l16 [word] - lea stride8q, [strideq*8] -.loop: - pshuflw m6, m3, 0x0 - pshuflw m7, m5, 0x0 - punpcklqdq m6, m6 ; l1 repeated 8 times [word] - punpcklqdq m7, m7 ; l8 repeated 8 times [word] - paddw m1, m6, m0 - paddw m6, m4 ; m1:m6 ti-tl+l1 [i=1,15] [word] - psrldq m5, 2 - packuswb m1, m6 - mova [dstq ], m1 - paddw m1, m7, m0 - paddw m7, m4 ; m1:m7 ti-tl+l8 [i=1,15] [word] - psrldq m3, 2 - packuswb m1, m7 - mova [dstq+stride8q], m1 - inc lineq - lea dstq, [dstq+strideq] - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left - pxor m1, m1 - movd m2, [aboveq-1] - mova m0, [aboveq] - mova m4, [aboveq+16] - punpcklbw m2, m1 - punpckhbw m3, m0, m1 - punpckhbw m5, m4, m1 - punpcklbw m0, m1 - punpcklbw m4, m1 - pshuflw m2, m2, 0x0 - DEFINE_ARGS dst, stride, line, left - mov lineq, -16 - punpcklqdq m2, m2 - add leftq, 32 - psubw m0, m2 - psubw m3, m2 - psubw m4, m2 - psubw m5, m2 -.loop: - movd m2, [leftq+lineq*2] - pxor m1, m1 - punpcklbw m2, m1 - pshuflw m7, m2, 0x55 - pshuflw m2, m2, 0x0 - punpcklqdq m2, m2 - punpcklqdq m7, m7 - paddw m6, m2, m3 - paddw m1, m2, m0 - packuswb m1, m6 - mova [dstq ], m1 - paddw m6, m2, m5 - paddw m1, m2, m4 - packuswb m1, m6 - mova [dstq+16 ], m1 - paddw m6, m7, m3 - paddw m1, m7, m0 - packuswb m1, m6 - mova [dstq+strideq ], m1 - paddw m6, m7, m5 - paddw m1, m7, m4 - packuswb m1, m6 - mova [dstq+strideq+16], m1 - lea dstq, [dstq+strideq*2] - inc lineq - jnz .loop - REP_RET diff --git a/thirdparty/libvpx/vpx_dsp/x86/intrapred_ssse3.asm b/thirdparty/libvpx/vpx_dsp/x86/intrapred_ssse3.asm deleted file mode 100644 index 5e0139fa8d..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/intrapred_ssse3.asm +++ /dev/null @@ -1,871 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA - -pb_1: times 16 db 1 -sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 -sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 -sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 -sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 -sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15 -sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0 -sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 -sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 - -SECTION .text - -INIT_XMM ssse3 -cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset - GET_GOT goffsetq - - mova m0, [aboveq] - DEFINE_ARGS dst, stride, stride3, dst8, line - lea stride3q, [strideq*3] - lea dst8q, [dstq+strideq*8] - mova m1, [GLOBAL(sh_b123456789abcdeff)] - pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] - pavgb m3, m2, m0 - pxor m2, m0 - pshufb m0, m1 - pand m2, [GLOBAL(pb_1)] - psubb m3, m2 - pavgb m0, m3 - - ; first 4 lines and first half of 3rd 4 lines - mov lined, 2 -.loop: - mova [dstq ], m0 - movhps [dst8q ], m0 - pshufb m0, m1 - mova [dstq +strideq ], m0 - movhps [dst8q+strideq ], m0 - pshufb m0, m1 - mova [dstq +strideq*2 ], m0 - movhps [dst8q+strideq*2 ], m0 - pshufb m0, m1 - mova [dstq +stride3q ], m0 - movhps [dst8q+stride3q ], m0 - pshufb m0, m1 - lea dstq, [dstq +strideq*4] - lea dst8q, [dst8q+strideq*4] - dec lined - jnz .loop - - ; bottom-right 8x8 block - movhps [dstq +8], m0 - movhps [dstq+strideq +8], m0 - movhps [dstq+strideq*2+8], m0 - movhps [dstq+stride3q +8], m0 - lea dstq, [dstq+strideq*4] - movhps [dstq +8], m0 - movhps [dstq+strideq +8], m0 - movhps [dstq+strideq*2+8], m0 - movhps [dstq+stride3q +8], m0 - - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset - GET_GOT goffsetq - - mova m0, [aboveq] - mova m4, [aboveq+16] - DEFINE_ARGS dst, stride, stride3, dst16, line - lea stride3q, [strideq*3] - lea dst16q, [dstq +strideq*8] - lea dst16q, [dst16q+strideq*8] - mova m1, [GLOBAL(sh_b123456789abcdeff)] - pshufb m2, m4, [GLOBAL(sh_b23456789abcdefff)] - pavgb m3, m2, m4 - pxor m2, m4 - palignr m5, m4, m0, 1 - palignr m6, m4, m0, 2 - pshufb m4, m1 - pand m2, [GLOBAL(pb_1)] - psubb m3, m2 - pavgb m4, m3 - pavgb m3, m0, m6 - pxor m0, m6 - pand m0, [GLOBAL(pb_1)] - psubb m3, m0 - pavgb m5, m3 - - ; write 4x4 lines (and the first half of the second 4x4 lines) - mov lined, 4 -.loop: - mova [dstq ], m5 - mova [dstq +16], m4 - mova [dst16q ], m4 - palignr m3, m4, m5, 1 - pshufb m4, m1 - mova [dstq +strideq ], m3 - mova [dstq +strideq +16], m4 - mova [dst16q+strideq ], m4 - palignr m5, m4, m3, 1 - pshufb m4, m1 - mova [dstq +strideq*2 ], m5 - mova [dstq +strideq*2+16], m4 - mova [dst16q+strideq*2 ], m4 - palignr m3, m4, m5, 1 - pshufb m4, m1 - mova [dstq +stride3q ], m3 - mova [dstq +stride3q +16], m4 - mova [dst16q+stride3q ], m4 - palignr m5, m4, m3, 1 - pshufb m4, m1 - lea dstq, [dstq +strideq*4] - lea dst16q, 
[dst16q+strideq*4] - dec lined - jnz .loop - - ; write second half of second 4x4 lines - mova [dstq +16], m4 - mova [dstq +strideq +16], m4 - mova [dstq +strideq*2+16], m4 - mova [dstq +stride3q +16], m4 - lea dstq, [dstq +strideq*4] - mova [dstq +16], m4 - mova [dstq +strideq +16], m4 - mova [dstq +strideq*2+16], m4 - mova [dstq +stride3q +16], m4 - lea dstq, [dstq +strideq*4] - mova [dstq +16], m4 - mova [dstq +strideq +16], m4 - mova [dstq +strideq*2+16], m4 - mova [dstq +stride3q +16], m4 - lea dstq, [dstq +strideq*4] - mova [dstq +16], m4 - mova [dstq +strideq +16], m4 - mova [dstq +strideq*2+16], m4 - mova [dstq +stride3q +16], m4 - - RESTORE_GOT - RET - -; ------------------------------------------ -; input: x, y, z, result -; -; trick from pascal -; (x+2y+z+2)>>2 can be calculated as: -; result = avg(x,z) -; result -= xor(x,z) & 1 -; result = avg(result,y) -; ------------------------------------------ -%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 - pavgb %4, %1, %3 - pxor %3, %1 - pand %3, [GLOBAL(pb_1)] - psubb %4, %3 - pavgb %4, %2 -%endmacro - -INIT_XMM ssse3 -cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset - GET_GOT goffsetq - - movq m3, [aboveq] - pshufb m1, m3, [GLOBAL(sh_b23456777)] - pshufb m2, m3, [GLOBAL(sh_b12345677)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4 - pavgb m3, m2 - - ; store 4 lines - movd [dstq ], m3 - movd [dstq+strideq], m4 - lea dstq, [dstq+strideq*2] - psrldq m3, 1 - psrldq m4, 1 - movd [dstq ], m3 - movd [dstq+strideq], m4 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset - GET_GOT goffsetq - - movq m3, [aboveq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - pshufb m1, m3, [GLOBAL(sh_b2345677777777777)] - pshufb m0, m3, [GLOBAL(sh_b0123456777777777)] - pshufb m2, m3, [GLOBAL(sh_b1234567777777777)] - pshufb m3, [GLOBAL(sh_b0123456777777777)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m4 - pavgb m3, m2 - - ; store 4 lines - movq [dstq ], m3 - movq [dstq+strideq], m4 - psrldq m3, 1 - psrldq m4, 1 - movq [dstq+strideq*2], m3 - movq [dstq+stride3q ], m4 - lea dstq, [dstq+strideq*4] - psrldq m3, 1 - psrldq m4, 1 - - ; store 4 lines - movq [dstq ], m3 - movq [dstq+strideq], m4 - psrldq m3, 1 - psrldq m4, 1 - movq [dstq+strideq*2], m3 - movq [dstq+stride3q ], m4 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset - GET_GOT goffsetq - - mova m0, [aboveq] - DEFINE_ARGS dst, stride, stride3, line - lea stride3q, [strideq*3] - mova m1, [GLOBAL(sh_b123456789abcdeff)] - pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] - pshufb m3, m0, m1 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4 - pavgb m0, m3 - - mov lined, 4 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m4 - pshufb m0, m1 - pshufb m4, m1 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m4 - pshufb m0, m1 - pshufb m4, m1 - lea dstq, [dstq+strideq*4] - dec lined - jnz .loop - RESTORE_GOT - REP_RET - -INIT_XMM ssse3 -cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset - GET_GOT goffsetq - - mova m0, [aboveq] - mova m7, [aboveq+16] - DEFINE_ARGS dst, stride, stride3, line - mova m1, [GLOBAL(sh_b123456789abcdeff)] - lea stride3q, [strideq*3] - pshufb m2, m7, [GLOBAL(sh_b23456789abcdefff)] - pshufb m3, m7, m1 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4 - palignr m6, m7, m0, 1 - palignr m5, m7, m0, 2 - pavgb m7, m3 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2 - pavgb m0, m6 - - mov lined, 8 -.loop: - mova [dstq ], m0 - mova [dstq 
+16], m7 - mova [dstq+strideq ], m2 - mova [dstq+strideq +16], m4 - palignr m3, m7, m0, 1 - palignr m5, m4, m2, 1 - pshufb m7, m1 - pshufb m4, m1 - - mova [dstq+strideq*2 ], m3 - mova [dstq+strideq*2+16], m7 - mova [dstq+stride3q ], m5 - mova [dstq+stride3q +16], m4 - palignr m0, m7, m3, 1 - palignr m2, m4, m5, 1 - pshufb m7, m1 - pshufb m4, m1 - lea dstq, [dstq+strideq*4] - dec lined - jnz .loop - RESTORE_GOT - REP_RET - -INIT_XMM ssse3 -cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset - GET_GOT goffsetq - movd m0, [leftq] ; l1, l2, l3, l4 - movd m1, [aboveq-1] ; tl, t1, t2, t3 - punpckldq m0, m1 ; l1, l2, l3, l4, tl, t1, t2, t3 - pshufb m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3 - psrldq m1, m0, 1 ; l3, l2, l1, tl, t1, t2, t3 - psrldq m2, m0, 2 ; l2, l1, tl, t1, t2, t3 - ; comments below are for a predictor like this - ; A1 B1 C1 D1 - ; A2 B2 A1 B1 - ; A3 B3 A2 B2 - ; A4 B4 A3 B3 - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg B4 B3 B2 B1 C1 D1 - pavgb m1, m0 ; 2-tap avg A4 A3 A2 A1 - - punpcklqdq m3, m1 ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 .. - - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - pshufb m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 .. - movd [dstq+stride3q ], m3 - psrldq m3, 2 ; A3 B3 A2 B2 A1 B1 C1 D1 .. - movd [dstq+strideq*2], m3 - psrldq m3, 2 ; A2 B2 A1 B1 C1 D1 .. - movd [dstq+strideq ], m3 - psrldq m3, 2 ; A1 B1 C1 D1 .. - movd [dstq ], m3 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset - GET_GOT goffsetq - movq m0, [leftq] ; [0- 7] l1-8 [byte] - movhps m0, [aboveq-1] ; [8-15] tl, t1-7 [byte] - pshufb m1, m0, [GLOBAL(sh_b76543210)] ; l8-1 [word] - pshufb m2, m0, [GLOBAL(sh_b65432108)] ; l7-1,tl [word] - pshufb m3, m0, [GLOBAL(sh_b54321089)] ; l6-1,tl,t1 [word] - pshufb m0, [GLOBAL(sh_b89abcdef)] ; tl,t1-7 [word] - psrldq m4, m0, 1 ; t1-7 [word] - psrldq m5, m0, 2 ; t2-7 [word] - ; comments below are for a predictor like this - ; A1 B1 C1 D1 E1 F1 G1 H1 - ; A2 B2 A1 B1 C1 D1 E1 F1 - ; A3 B3 A2 B2 A1 B1 C1 D1 - ; A4 B4 A3 B3 A2 B2 A1 B1 - ; A5 B5 A4 B4 A3 B3 A2 B2 - ; A6 B6 A5 B5 A4 B4 A3 B3 - ; A7 B7 A6 B6 A5 B5 A4 B4 - ; A8 B8 A7 B7 A6 B6 A5 B5 - pavgb m6, m1, m2 ; 2-tap avg A8-A1 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7 ; 3-tap avg C-H1 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0 ; 3-tap avg B8-1 - - punpcklbw m6, m0 ; A-B8, A-B7 ... 
A-B2, A-B1 - - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - - movhps [dstq+stride3q], m6 ; A-B4, A-B3, A-B2, A-B1 - palignr m0, m7, m6, 10 ; A-B3, A-B2, A-B1, C-H1 - movq [dstq+strideq*2], m0 - psrldq m0, 2 ; A-B2, A-B1, C-H1 - movq [dstq+strideq ], m0 - psrldq m0, 2 ; A-H1 - movq [dstq ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq+stride3q ], m6 ; A-B8, A-B7, A-B6, A-B5 - psrldq m6, 2 ; A-B7, A-B6, A-B5, A-B4 - movq [dstq+strideq*2], m6 - psrldq m6, 2 ; A-B6, A-B5, A-B4, A-B3 - movq [dstq+strideq ], m6 - psrldq m6, 2 ; A-B5, A-B4, A-B3, A-B2 - movq [dstq ], m6 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset - GET_GOT goffsetq - mova m0, [leftq] - movu m7, [aboveq-1] - ; comments below are for a predictor like this - ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1 - ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 - ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 - ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 - ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 - ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 - ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 - ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 - ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 - ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 - ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 - ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 - ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 - ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 - ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 - ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 - pshufb m6, m7, [GLOBAL(sh_bfedcba9876543210)] - palignr m5, m0, m6, 15 - palignr m3, m0, m6, 14 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg - pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] - pavgb m5, m0 ; A1 - Ag - - punpcklbw m0, m4, m5 ; A-B8 ... A-B1 - punpckhbw m4, m5 ; A-B9 ... 
A-Bg - - pshufb m3, m7, [GLOBAL(sh_b123456789abcdeff)] - pshufb m5, m7, [GLOBAL(sh_b23456789abcdefff)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg C1-P1 - - pshufb m6, m0, [GLOBAL(sh_bfedcba9876543210)] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - palignr m2, m1, m6, 14 - mova [dstq ], m2 - palignr m2, m1, m6, 12 - mova [dstq+strideq ], m2 - palignr m2, m1, m6, 10 - mova [dstq+strideq*2], m2 - palignr m2, m1, m6, 8 - mova [dstq+stride3q ], m2 - lea dstq, [dstq+strideq*4] - palignr m2, m1, m6, 6 - mova [dstq ], m2 - palignr m2, m1, m6, 4 - mova [dstq+strideq ], m2 - palignr m2, m1, m6, 2 - mova [dstq+strideq*2], m2 - pshufb m4, [GLOBAL(sh_bfedcba9876543210)] - mova [dstq+stride3q ], m6 - lea dstq, [dstq+strideq*4] - - palignr m2, m6, m4, 14 - mova [dstq ], m2 - palignr m2, m6, m4, 12 - mova [dstq+strideq ], m2 - palignr m2, m6, m4, 10 - mova [dstq+strideq*2], m2 - palignr m2, m6, m4, 8 - mova [dstq+stride3q ], m2 - lea dstq, [dstq+strideq*4] - palignr m2, m6, m4, 6 - mova [dstq ], m2 - palignr m2, m6, m4, 4 - mova [dstq+strideq ], m2 - palignr m2, m6, m4, 2 - mova [dstq+strideq*2], m2 - mova [dstq+stride3q ], m4 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset - GET_GOT goffsetq - mova m0, [leftq] - movu m7, [aboveq-1] - movu m1, [aboveq+15] - - pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)] - pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high] - - palignr m3, m1, m7, 1 - palignr m5, m1, m7, 2 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low] - - pshufb m7, [GLOBAL(sh_bfedcba9876543210)] - palignr m5, m0, m7, 15 - palignr m3, m0, m7, 14 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg - pavgb m5, m0 ; A1 - Ag - punpcklbw m6, m4, m5 ; A-B8 ... A-B1 - punpckhbw m4, m5 ; A-B9 ... 
A-Bg - pshufb m6, [GLOBAL(sh_bfedcba9876543210)] - pshufb m4, [GLOBAL(sh_bfedcba9876543210)] - - DEFINE_ARGS dst, stride, stride3, left, line - lea stride3q, [strideq*3] - - palignr m5, m2, m1, 14 - palignr m7, m1, m6, 14 - mova [dstq ], m7 - mova [dstq+16 ], m5 - palignr m5, m2, m1, 12 - palignr m7, m1, m6, 12 - mova [dstq+strideq ], m7 - mova [dstq+strideq+16 ], m5 - palignr m5, m2, m1, 10 - palignr m7, m1, m6, 10 - mova [dstq+strideq*2 ], m7 - mova [dstq+strideq*2+16], m5 - palignr m5, m2, m1, 8 - palignr m7, m1, m6, 8 - mova [dstq+stride3q ], m7 - mova [dstq+stride3q+16 ], m5 - lea dstq, [dstq+strideq*4] - palignr m5, m2, m1, 6 - palignr m7, m1, m6, 6 - mova [dstq ], m7 - mova [dstq+16 ], m5 - palignr m5, m2, m1, 4 - palignr m7, m1, m6, 4 - mova [dstq+strideq ], m7 - mova [dstq+strideq+16 ], m5 - palignr m5, m2, m1, 2 - palignr m7, m1, m6, 2 - mova [dstq+strideq*2 ], m7 - mova [dstq+strideq*2+16], m5 - mova [dstq+stride3q ], m6 - mova [dstq+stride3q+16 ], m1 - lea dstq, [dstq+strideq*4] - - palignr m5, m1, m6, 14 - palignr m3, m6, m4, 14 - mova [dstq ], m3 - mova [dstq+16 ], m5 - palignr m5, m1, m6, 12 - palignr m3, m6, m4, 12 - mova [dstq+strideq ], m3 - mova [dstq+strideq+16 ], m5 - palignr m5, m1, m6, 10 - palignr m3, m6, m4, 10 - mova [dstq+strideq*2 ], m3 - mova [dstq+strideq*2+16], m5 - palignr m5, m1, m6, 8 - palignr m3, m6, m4, 8 - mova [dstq+stride3q ], m3 - mova [dstq+stride3q+16 ], m5 - lea dstq, [dstq+strideq*4] - palignr m5, m1, m6, 6 - palignr m3, m6, m4, 6 - mova [dstq ], m3 - mova [dstq+16 ], m5 - palignr m5, m1, m6, 4 - palignr m3, m6, m4, 4 - mova [dstq+strideq ], m3 - mova [dstq+strideq+16 ], m5 - palignr m5, m1, m6, 2 - palignr m3, m6, m4, 2 - mova [dstq+strideq*2 ], m3 - mova [dstq+strideq*2+16], m5 - mova [dstq+stride3q ], m4 - mova [dstq+stride3q+16 ], m6 - lea dstq, [dstq+strideq*4] - - mova m7, [leftq] - mova m3, [leftq+16] - palignr m5, m3, m7, 15 - palignr m0, m3, m7, 14 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh - - pavgb m5, m3 ; Ah - - punpcklbw m3, m2, m5 ; A-B8 ... A-B1 - punpckhbw m2, m5 ; A-B9 ... 
A-Bg - pshufb m3, [GLOBAL(sh_bfedcba9876543210)] - pshufb m2, [GLOBAL(sh_bfedcba9876543210)] - - palignr m7, m6, m4, 14 - palignr m0, m4, m3, 14 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m6, m4, 12 - palignr m0, m4, m3, 12 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m6, m4, 10 - palignr m0, m4, m3, 10 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - palignr m7, m6, m4, 8 - palignr m0, m4, m3, 8 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q+16 ], m7 - lea dstq, [dstq+strideq*4] - palignr m7, m6, m4, 6 - palignr m0, m4, m3, 6 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m6, m4, 4 - palignr m0, m4, m3, 4 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m6, m4, 2 - palignr m0, m4, m3, 2 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - mova [dstq+stride3q ], m3 - mova [dstq+stride3q+16 ], m4 - lea dstq, [dstq+strideq*4] - - palignr m7, m4, m3, 14 - palignr m0, m3, m2, 14 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m4, m3, 12 - palignr m0, m3, m2, 12 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m4, m3, 10 - palignr m0, m3, m2, 10 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - palignr m7, m4, m3, 8 - palignr m0, m3, m2, 8 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q+16 ], m7 - lea dstq, [dstq+strideq*4] - palignr m7, m4, m3, 6 - palignr m0, m3, m2, 6 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m4, m3, 4 - palignr m0, m3, m2, 4 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m4, m3, 2 - palignr m0, m3, m2, 2 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - mova [dstq+stride3q ], m2 - mova [dstq+stride3q+16 ], m3 - - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset - GET_GOT goffsetq - movq m3, [leftq] ; abcdefgh [byte] - lea stride3q, [strideq*3] - - pshufb m1, m3, [GLOBAL(sh_b2345677777777777)] - pshufb m0, m3, [GLOBAL(sh_b0123456777777777)] - pshufb m2, m3, [GLOBAL(sh_b1234567777777777)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3 - pavgb m0, m2 - punpcklbw m0, m3 ; interleaved output - - movq [dstq ], m0 - psrldq m0, 2 - movq [dstq+strideq ], m0 - psrldq m0, 2 - movq [dstq+strideq*2], m0 - psrldq m0, 2 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - pshufhw m0, m0, q0000 ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh - psrldq m0, 2 - movq [dstq ], m0 - psrldq m0, 2 - movq [dstq+strideq ], m0 - psrldq m0, 2 - movq [dstq+strideq*2], m0 - psrldq m0, 2 - movq [dstq+stride3q ], m0 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset - GET_GOT goffsetq - lea stride3q, [strideq*3] - mova m0, [leftq] ; abcdefghijklmnop [byte] - pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; bcdefghijklmnopp - pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 - pavgb m1, m0 ; ab, bc, cd .. 
no, op, pp [byte] - - punpckhbw m4, m1, m3 ; interleaved input - punpcklbw m1, m3 ; interleaved output - mova [dstq ], m1 - palignr m3, m4, m1, 2 - mova [dstq+strideq ], m3 - palignr m3, m4, m1, 4 - mova [dstq+strideq*2], m3 - palignr m3, m4, m1, 6 - mova [dstq+stride3q ], m3 - lea dstq, [dstq+strideq*4] - palignr m3, m4, m1, 8 - mova [dstq ], m3 - palignr m3, m4, m1, 10 - mova [dstq+strideq ], m3 - palignr m3, m4, m1, 12 - mova [dstq+strideq*2], m3 - palignr m3, m4, m1, 14 - mova [dstq+stride3q ], m3 - DEFINE_ARGS dst, stride, stride3, line - mov lined, 2 - mova m0, [GLOBAL(sh_b23456789abcdefff)] -.loop: - lea dstq, [dstq+strideq*4] - mova [dstq ], m4 - pshufb m4, m0 - mova [dstq+strideq ], m4 - pshufb m4, m0 - mova [dstq+strideq*2], m4 - pshufb m4, m0 - mova [dstq+stride3q ], m4 - pshufb m4, m0 - dec lined - jnz .loop - RESTORE_GOT - REP_RET - -INIT_XMM ssse3 -cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset - GET_GOT goffsetq - lea stride3q, [strideq*3] - mova m1, [leftq] ; 0-15 [byte] - mova m2, [leftq+16] ; 16-31 [byte] - pshufb m0, m2, [GLOBAL(sh_b23456789abcdefff)] - pshufb m4, m2, [GLOBAL(sh_b123456789abcdeff)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3 - palignr m6, m2, m1, 1 - palignr m5, m2, m1, 2 - pavgb m2, m4 ; high 16px even lines - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0 - pavgb m1, m6 ; low 16px even lines - - punpckhbw m6, m1, m0 ; interleaved output 2 - punpcklbw m1, m0 ; interleaved output 1 - - punpckhbw m7, m2, m3 ; interleaved output 4 - punpcklbw m2, m3 ; interleaved output 3 - - ; output 1st 8 lines (and half of 2nd 8 lines) - DEFINE_ARGS dst, stride, stride3, dst8 - lea dst8q, [dstq+strideq*8] - mova [dstq ], m1 - mova [dstq +16], m6 - mova [dst8q ], m6 - palignr m0, m6, m1, 2 - palignr m4, m2, m6, 2 - mova [dstq +strideq ], m0 - mova [dstq +strideq +16], m4 - mova [dst8q+strideq ], m4 - palignr m0, m6, m1, 4 - palignr m4, m2, m6, 4 - mova [dstq +strideq*2 ], m0 - mova [dstq +strideq*2+16], m4 - mova [dst8q+strideq*2 ], m4 - palignr m0, m6, m1, 6 - palignr m4, m2, m6, 6 - mova [dstq +stride3q ], m0 - mova [dstq +stride3q +16], m4 - mova [dst8q+stride3q ], m4 - lea dstq, [dstq +strideq*4] - lea dst8q, [dst8q+strideq*4] - palignr m0, m6, m1, 8 - palignr m4, m2, m6, 8 - mova [dstq ], m0 - mova [dstq +16], m4 - mova [dst8q ], m4 - palignr m0, m6, m1, 10 - palignr m4, m2, m6, 10 - mova [dstq +strideq ], m0 - mova [dstq +strideq +16], m4 - mova [dst8q+strideq ], m4 - palignr m0, m6, m1, 12 - palignr m4, m2, m6, 12 - mova [dstq +strideq*2 ], m0 - mova [dstq +strideq*2+16], m4 - mova [dst8q+strideq*2 ], m4 - palignr m0, m6, m1, 14 - palignr m4, m2, m6, 14 - mova [dstq +stride3q ], m0 - mova [dstq +stride3q +16], m4 - mova [dst8q+stride3q ], m4 - lea dstq, [dstq+strideq*4] - lea dst8q, [dst8q+strideq*4] - - ; output 2nd half of 2nd 8 lines and half of 3rd 8 lines - mova [dstq +16], m2 - mova [dst8q ], m2 - palignr m4, m7, m2, 2 - mova [dstq +strideq +16], m4 - mova [dst8q+strideq ], m4 - palignr m4, m7, m2, 4 - mova [dstq +strideq*2+16], m4 - mova [dst8q+strideq*2 ], m4 - palignr m4, m7, m2, 6 - mova [dstq +stride3q +16], m4 - mova [dst8q+stride3q ], m4 - lea dstq, [dstq+strideq*4] - lea dst8q, [dst8q+strideq*4] - palignr m4, m7, m2, 8 - mova [dstq +16], m4 - mova [dst8q ], m4 - palignr m4, m7, m2, 10 - mova [dstq +strideq +16], m4 - mova [dst8q+strideq ], m4 - palignr m4, m7, m2, 12 - mova [dstq +strideq*2+16], m4 - mova [dst8q+strideq*2 ], m4 - palignr m4, m7, m2, 14 - mova [dstq +stride3q +16], m4 - mova [dst8q+stride3q ], m4 - lea 
dstq, [dstq+strideq*4] - lea dst8q, [dst8q+strideq*4] - - ; output 2nd half of 3rd 8 lines and half of 4th 8 lines - mova m0, [GLOBAL(sh_b23456789abcdefff)] - mova [dstq +16], m7 - mova [dst8q ], m7 - pshufb m7, m0 - mova [dstq +strideq +16], m7 - mova [dst8q+strideq ], m7 - pshufb m7, m0 - mova [dstq +strideq*2+16], m7 - mova [dst8q+strideq*2 ], m7 - pshufb m7, m0 - mova [dstq +stride3q +16], m7 - mova [dst8q+stride3q ], m7 - pshufb m7, m0 - lea dstq, [dstq+strideq*4] - lea dst8q, [dst8q+strideq*4] - mova [dstq +16], m7 - mova [dst8q ], m7 - pshufb m7, m0 - mova [dstq +strideq +16], m7 - mova [dst8q+strideq ], m7 - pshufb m7, m0 - mova [dstq +strideq*2+16], m7 - mova [dst8q+strideq*2 ], m7 - pshufb m7, m0 - mova [dstq +stride3q +16], m7 - mova [dst8q+stride3q ], m7 - pshufb m7, m0 - lea dstq, [dstq+strideq*4] - - ; output last half of 4th 8 lines - mova [dstq +16], m7 - mova [dstq +strideq +16], m7 - mova [dstq +strideq*2+16], m7 - mova [dstq +stride3q +16], m7 - lea dstq, [dstq+strideq*4] - mova [dstq +16], m7 - mova [dstq +strideq +16], m7 - mova [dstq +strideq*2+16], m7 - mova [dstq +stride3q +16], m7 - - ; done! - RESTORE_GOT - RET diff --git a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.c deleted file mode 100644 index df5068c624..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.c +++ /dev/null @@ -1,4046 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/x86/inv_txfm_sse2.h" -#include "vpx_dsp/x86/txfm_common_sse2.h" - -#define RECON_AND_STORE4X4(dest, in_x) \ -{ \ - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - *(int *)(dest) = _mm_cvtsi128_si32(d0); \ -} - -void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i eight = _mm_set1_epi16(8); - const __m128i cst = _mm_setr_epi16( - (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, - (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, - (int16_t)cospi_8_64, (int16_t)cospi_24_64); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i input0, input1, input2, input3; - - // Rows - input0 = load_input_data(input); - input2 = load_input_data(input + 8); - - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_shufflelo_epi16(input0, 0xd8); - input0 = _mm_shufflehi_epi16(input0, 0xd8); - input2 = _mm_shufflelo_epi16(input2, 0xd8); - input2 = _mm_shufflehi_epi16(input2, 0xd8); - - input1 = _mm_unpackhi_epi32(input0, input0); - input0 = _mm_unpacklo_epi32(input0, input0); - input3 = _mm_unpackhi_epi32(input2, input2); - input2 = _mm_unpacklo_epi32(input2, input2); - - // Stage 1 - input0 = _mm_madd_epi16(input0, cst); - input1 = _mm_madd_epi16(input1, cst); - input2 = _mm_madd_epi16(input2, cst); - input3 = _mm_madd_epi16(input3, cst); - - input0 = _mm_add_epi32(input0, rounding); - input1 = _mm_add_epi32(input1, rounding); - input2 = _mm_add_epi32(input2, rounding); - input3 = _mm_add_epi32(input3, rounding); - - input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); - input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); - input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); - input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); - - // Stage 2 - input0 = _mm_packs_epi32(input0, input1); - input1 = _mm_packs_epi32(input2, input3); - - // Transpose - input2 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpackhi_epi16(input0, input1); - input0 = _mm_unpacklo_epi32(input2, input3); - input1 = _mm_unpackhi_epi32(input2, input3); - - // Switch column2, column 3, and then, we got: - // input2: column1, column 0; input3: column2, column 3. 
- input1 = _mm_shuffle_epi32(input1, 0x4e); - input2 = _mm_add_epi16(input0, input1); - input3 = _mm_sub_epi16(input0, input1); - - // Columns - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_unpacklo_epi32(input2, input2); - input1 = _mm_unpackhi_epi32(input2, input2); - input2 = _mm_unpackhi_epi32(input3, input3); - input3 = _mm_unpacklo_epi32(input3, input3); - - // Stage 1 - input0 = _mm_madd_epi16(input0, cst); - input1 = _mm_madd_epi16(input1, cst); - input2 = _mm_madd_epi16(input2, cst); - input3 = _mm_madd_epi16(input3, cst); - - input0 = _mm_add_epi32(input0, rounding); - input1 = _mm_add_epi32(input1, rounding); - input2 = _mm_add_epi32(input2, rounding); - input3 = _mm_add_epi32(input3, rounding); - - input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); - input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); - input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); - input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); - - // Stage 2 - input0 = _mm_packs_epi32(input0, input2); - input1 = _mm_packs_epi32(input1, input3); - - // Transpose - input2 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpackhi_epi16(input0, input1); - input0 = _mm_unpacklo_epi32(input2, input3); - input1 = _mm_unpackhi_epi32(input2, input3); - - // Switch column2, column 3, and then, we got: - // input2: column1, column 0; input3: column2, column 3. - input1 = _mm_shuffle_epi32(input1, 0x4e); - input2 = _mm_add_epi16(input0, input1); - input3 = _mm_sub_epi16(input0, input1); - - // Final round and shift - input2 = _mm_add_epi16(input2, eight); - input3 = _mm_add_epi16(input3, eight); - - input2 = _mm_srai_epi16(input2, 4); - input3 = _mm_srai_epi16(input3, 4); - - // Reconstruction and Store - { - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); - __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi32(d0, - _mm_cvtsi32_si128(*(const int *)(dest + stride))); - d2 = _mm_unpacklo_epi32( - _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2); - d0 = _mm_unpacklo_epi8(d0, zero); - d2 = _mm_unpacklo_epi8(d2, zero); - d0 = _mm_add_epi16(d0, input2); - d2 = _mm_add_epi16(d2, input3); - d0 = _mm_packus_epi16(d0, d2); - // store input0 - *(int *)dest = _mm_cvtsi128_si32(d0); - // store input1 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); - // store input2 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); - // store input3 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); - } -} - -void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 4); - - dc_value = _mm_set1_epi16(a); - - RECON_AND_STORE4X4(dest + 0 * stride, dc_value); - RECON_AND_STORE4X4(dest + 1 * stride, dc_value); - RECON_AND_STORE4X4(dest + 2 * stride, dc_value); - RECON_AND_STORE4X4(dest + 3 * stride, dc_value); -} - -static INLINE void transpose_4x4(__m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); - const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); - - res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); - res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); -} - -void idct4_sse2(__m128i *in) { - const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - 
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8]; - - transpose_4x4(in); - // stage 1 - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpackhi_epi16(in[0], in[1]); - v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - - u[0] = _mm_packs_epi32(v[0], v[1]); - u[1] = _mm_packs_epi32(v[3], v[2]); - - // stage 2 - in[0] = _mm_add_epi16(u[0], u[1]); - in[1] = _mm_sub_epi16(u[0], u[1]); - in[1] = _mm_shuffle_epi32(in[1], 0x4E); -} - -void iadst4_sse2(__m128i *in) { - const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); - const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); - const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); - const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); - const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); - const __m128i kZero = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8], in7; - - transpose_4x4(in); - in7 = _mm_srli_si128(in[1], 8); - in7 = _mm_add_epi16(in7, in[0]); - in7 = _mm_sub_epi16(in7, in[1]); - - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpackhi_epi16(in[0], in[1]); - u[2] = _mm_unpacklo_epi16(in7, kZero); - u[3] = _mm_unpackhi_epi16(in[0], kZero); - - v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 - v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 - v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 - v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 - v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 - v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 - - u[0] = _mm_add_epi32(v[0], v[1]); - u[1] = _mm_add_epi32(v[3], v[4]); - u[2] = v[2]; - u[3] = _mm_add_epi32(u[0], u[1]); - u[4] = _mm_slli_epi32(v[5], 2); - u[5] = _mm_add_epi32(u[3], v[5]); - u[6] = _mm_sub_epi32(u[5], u[4]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[1]); - in[1] = _mm_packs_epi32(u[2], u[3]); -} - -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ - const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ - const __m128i tr0_5 = _mm_unpacklo_epi16(in6, 
in7); \ - const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ - const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ - out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ - out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ - out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ - } - -#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \ - out0, out1, out2, out3) \ - { \ - const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ - const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - } - -#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - } - -// Define Macro for multiplying elements by constants and adding them together. 
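In scalar terms, each output lane of the macro below is one butterfly rotation; a minimal sketch, reusing the file's DCT_CONST_ROUNDING and DCT_CONST_BITS constants (the name butterfly_lane is hypothetical, not a libvpx symbol):

    /* _mm_madd_epi16 on an interleaved (x, y) pair yields x * c0 + y * c1
       as a 32-bit value; adding DCT_CONST_ROUNDING and shifting right by
       DCT_CONST_BITS returns it to 16-bit scale. The SIMD code saturates
       through _mm_packs_epi32, where this sketch simply casts. */
    static int16_t butterfly_lane(int16_t x, int16_t y, int16_t c0, int16_t c1) {
      const int32_t t = x * c0 + y * c1 + DCT_CONST_ROUNDING;
      return (int16_t)(t >> DCT_CONST_BITS);
    }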
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \ - cst0, cst1, cst2, cst3, res0, res1, res2, res3) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - tmp4 = _mm_madd_epi16(lo_1, cst2); \ - tmp5 = _mm_madd_epi16(hi_1, cst2); \ - tmp6 = _mm_madd_epi16(lo_1, cst3); \ - tmp7 = _mm_madd_epi16(hi_1, cst3); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp4 = _mm_add_epi32(tmp4, rounding); \ - tmp5 = _mm_add_epi32(tmp5, rounding); \ - tmp6 = _mm_add_epi32(tmp6, rounding); \ - tmp7 = _mm_add_epi32(tmp7, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - res2 = _mm_packs_epi32(tmp4, tmp5); \ - res3 = _mm_packs_epi32(tmp6, tmp7); \ - } - -#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - } - -#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) \ - { \ - /* Stage1 */ \ - { \ - const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ - const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ - const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ - const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ - \ - MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \ - stg1_1, stg1_2, stg1_3, stp1_4, \ - stp1_7, stp1_5, stp1_6) \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ - const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ - const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ - const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ - \ - MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \ - stg2_1, stg2_2, stg2_3, stp2_0, \ - stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ - tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ - tmp2 = _mm_madd_epi16(lo_56, 
stg2_0); \ - tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - } \ - \ - /* Stage4 */ \ - out0 = _mm_adds_epi16(stp1_0, stp2_7); \ - out1 = _mm_adds_epi16(stp1_1, stp1_6); \ - out2 = _mm_adds_epi16(stp1_2, stp1_5); \ - out3 = _mm_adds_epi16(stp1_3, stp2_4); \ - out4 = _mm_subs_epi16(stp1_3, stp2_4); \ - out5 = _mm_subs_epi16(stp1_2, stp1_5); \ - out6 = _mm_subs_epi16(stp1_1, stp1_6); \ - out7 = _mm_subs_epi16(stp1_0, stp2_7); \ - } - -void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - - // Load input data. 
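The load_input_data() calls that follow are defined in inv_txfm_sse2.h rather than in this file; a sketch of what such a loader has to do, assuming tran_low_t widens to 32 bits only under CONFIG_VP9_HIGHBITDEPTH (its convention elsewhere in libvpx):

    static INLINE __m128i load_input_data_sketch(const tran_low_t *data) {
    #if CONFIG_VP9_HIGHBITDEPTH
      /* 32-bit coefficients: pack eight of them into eight 16-bit lanes. */
      return _mm_packs_epi32(_mm_load_si128((const __m128i *)data),
                             _mm_load_si128((const __m128i *)(data + 4)));
    #else
      /* 16-bit coefficients: one aligned 128-bit load covers a whole row. */
      return _mm_load_si128((const __m128i *)data);
    #endif
    }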
- in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - in4 = load_input_data(input + 8 * 4); - in5 = load_input_data(input + 8 * 5); - in6 = load_input_data(input + 8 * 6); - in7 = load_input_data(input + 8 * 7); - - // 2-D - for (i = 0; i < 2; i++) { - // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - - // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - } - - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); -} - -void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 5); - - dc_value = _mm_set1_epi16(a); - - RECON_AND_STORE(dest + 0 * stride, dc_value); - RECON_AND_STORE(dest + 1 * stride, dc_value); - RECON_AND_STORE(dest + 2 * stride, dc_value); - RECON_AND_STORE(dest + 3 * stride, dc_value); - RECON_AND_STORE(dest + 4 * stride, dc_value); - RECON_AND_STORE(dest + 5 * stride, dc_value); - RECON_AND_STORE(dest + 6 * stride, dc_value); - RECON_AND_STORE(dest + 7 * stride, dc_value); -} - -void idct8_sse2(__m128i *in) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - - // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], - in0, in1, in2, in3, in4, in5, in6, in7); - - // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, - in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]); -} - -void 
iadst8_sse2(__m128i *in) { - const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); - const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__const_0 = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; - __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; - __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - - // transpose - array_transpose_8x8(in, in); - - // properly aligned for butterfly input - in0 = in[7]; - in1 = in[0]; - in2 = in[5]; - in3 = in[2]; - in4 = in[3]; - in5 = in[4]; - in6 = in[1]; - in7 = in[6]; - - // column transformation - // stage 1 - // interleave and multiply/add into 32-bit integer - s0 = _mm_unpacklo_epi16(in0, in1); - s1 = _mm_unpackhi_epi16(in0, in1); - s2 = _mm_unpacklo_epi16(in2, in3); - s3 = _mm_unpackhi_epi16(in2, in3); - s4 = _mm_unpacklo_epi16(in4, in5); - s5 = _mm_unpackhi_epi16(in4, in5); - s6 = _mm_unpacklo_epi16(in6, in7); - s7 = _mm_unpackhi_epi16(in6, in7); - - u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); - u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); - u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); - u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); - u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); - u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); - u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); - u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); - u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); - u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); - u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); - u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); - u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); - u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); - u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); - u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); - - // addition - w0 = _mm_add_epi32(u0, u8); - w1 = _mm_add_epi32(u1, u9); - w2 = _mm_add_epi32(u2, u10); - w3 = _mm_add_epi32(u3, u11); - w4 = _mm_add_epi32(u4, u12); - w5 = _mm_add_epi32(u5, u13); - w6 = _mm_add_epi32(u6, u14); - w7 = _mm_add_epi32(u7, u15); - w8 = _mm_sub_epi32(u0, u8); - w9 = _mm_sub_epi32(u1, u9); - w10 = _mm_sub_epi32(u2, u10); - w11 = _mm_sub_epi32(u3, u11); - w12 = _mm_sub_epi32(u4, u12); - w13 = _mm_sub_epi32(u5, u13); - w14 = _mm_sub_epi32(u6, u14); - w15 = _mm_sub_epi32(u7, u15); - - // shift and rounding - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 
= _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
-  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
-  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
-  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
-  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
-  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
-  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
-  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
-  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
-  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
-  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
-  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
-  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
-  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
-  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
-
-  // back to 16-bit and pack 8 integers into __m128i
-  in[0] = _mm_packs_epi32(u0, u1);
-  in[1] = _mm_packs_epi32(u2, u3);
-  in[2] = _mm_packs_epi32(u4, u5);
-  in[3] = _mm_packs_epi32(u6, u7);
-  in[4] = _mm_packs_epi32(u8, u9);
-  in[5] = _mm_packs_epi32(u10, u11);
-  in[6] = _mm_packs_epi32(u12, u13);
-  in[7] = _mm_packs_epi32(u14, u15);
-
-  // stage 2
-  s0 = _mm_add_epi16(in[0], in[2]);
-  s1 = _mm_add_epi16(in[1], in[3]);
-  s2 = _mm_sub_epi16(in[0], in[2]);
-  s3 = _mm_sub_epi16(in[1], in[3]);
-  u0 = _mm_unpacklo_epi16(in[4], in[5]);
-  u1 = _mm_unpackhi_epi16(in[4], in[5]);
-  u2 = _mm_unpacklo_epi16(in[6], in[7]);
-  u3 = _mm_unpackhi_epi16(in[6], in[7]);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
-  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
-  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
-  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
-  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
-  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
-  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
-  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
-
-  w0 = _mm_add_epi32(v0, v4);
-  w1 = _mm_add_epi32(v1, v5);
-  w2 = _mm_add_epi32(v2, v6);
-  w3 = _mm_add_epi32(v3, v7);
-  w4 = _mm_sub_epi32(v0, v4);
-  w5 = _mm_sub_epi32(v1, v5);
-  w6 = _mm_sub_epi32(v2, v6);
-  w7 = _mm_sub_epi32(v3, v7);
-
-  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
-  // back to 16-bit integers
-  s4 = _mm_packs_epi32(u0, u1);
-  s5 = _mm_packs_epi32(u2, u3);
-  s6 = _mm_packs_epi32(u4, u5);
-  s7 = _mm_packs_epi32(u6, u7);
-
-  // stage 3
-  u0 = _mm_unpacklo_epi16(s2, s3);
-  u1 = _mm_unpackhi_epi16(s2, s3);
-  u2 = _mm_unpacklo_epi16(s6, s7);
-  u3 = _mm_unpackhi_epi16(s6, s7);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
-  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
-  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
-  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
-  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
-  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
-  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
-  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
-
-  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
-  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
-  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
-  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
-  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
-  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
-  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
-  s2 = _mm_packs_epi32(v0, v1);
-  s3 = _mm_packs_epi32(v2, v3);
-  s6 = _mm_packs_epi32(v4, v5);
-  s7 = _mm_packs_epi32(v6, v7);
-
-  in[0] = s0;
-  in[1] = _mm_sub_epi16(k__const_0, s4);
-  in[2] = s6;
-  in[3] = _mm_sub_epi16(k__const_0, s2);
-  in[4] = s3;
-  in[5] = _mm_sub_epi16(k__const_0, s7);
-  in[6] = s5;
-  in[7] = _mm_sub_epi16(k__const_0, s1);
-}
-
-void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
-                             int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-  // Rows. Load 4-row input data.
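Only rows 0..3 are loaded below, and TRANSPOSE_8X8_10 keeps just their low four lanes, so this _12 path relies on every nonzero coefficient lying in the top-left 4x4 quadrant of the 8x8 block. Stated as a scalar precondition (a sketch of the assumption, not code from the file):

    /* Assumed by vpx_idct8x8_12_add_sse2: with at most 12 nonzero
       coefficients, input[r * 8 + c] == 0 whenever r >= 4 || c >= 4. */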
- in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - - // 8x4 Transpose - TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); - // Stage1 - { - const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); - const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); - - tmp0 = _mm_madd_epi16(lo_17, stg1_0); - tmp2 = _mm_madd_epi16(lo_17, stg1_1); - tmp4 = _mm_madd_epi16(lo_35, stg1_2); - tmp6 = _mm_madd_epi16(lo_35, stg1_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_4 = _mm_packs_epi32(tmp0, tmp2); - stp1_5 = _mm_packs_epi32(tmp4, tmp6); - } - - // Stage2 - { - const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); - const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); - - tmp0 = _mm_madd_epi16(lo_04, stg2_0); - tmp2 = _mm_madd_epi16(lo_04, stg2_1); - tmp4 = _mm_madd_epi16(lo_26, stg2_2); - tmp6 = _mm_madd_epi16(lo_26, stg2_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp2_0 = _mm_packs_epi32(tmp0, tmp2); - stp2_2 = _mm_packs_epi32(tmp6, tmp4); - - tmp0 = _mm_adds_epi16(stp1_4, stp1_5); - tmp1 = _mm_subs_epi16(stp1_4, stp1_5); - - stp2_4 = tmp0; - stp2_5 = _mm_unpacklo_epi64(tmp1, zero); - stp2_6 = _mm_unpackhi_epi64(tmp1, zero); - } - - // Stage3 - { - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); - - tmp4 = _mm_adds_epi16(stp2_0, stp2_2); - tmp6 = _mm_subs_epi16(stp2_0, stp2_2); - - stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); - stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); - - tmp0 = _mm_madd_epi16(lo_56, stg3_0); - tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - - stp1_5 = _mm_packs_epi32(tmp0, tmp2); - } - - // Stage4 - tmp0 = _mm_adds_epi16(stp1_3, stp2_4); - tmp1 = _mm_adds_epi16(stp1_2, stp1_5); - tmp2 = _mm_subs_epi16(stp1_3, stp2_4); - tmp3 = _mm_subs_epi16(stp1_2, stp1_5); - - TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) - - IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, - in0, in1, in2, in3, in4, in5, in6, in7); - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); 
- RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); -} - -#define IDCT16 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ - const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ - const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ - const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ - const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \ - stg2_0, stg2_1, stg2_2, stg2_3, \ - stp2_8, stp2_15, stp2_9, stp2_14) \ - \ - MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \ - stg2_4, stg2_5, stg2_6, stg2_7, \ - stp2_10, stp2_13, stp2_11, stp2_12) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ - const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ - const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \ - stg3_0, stg3_1, stg3_2, stg3_3, \ - stp1_4, stp1_7, stp1_5, stp1_6) \ - \ - stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - \ - stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ - const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ - const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \ - stg4_0, stg4_1, stg4_2, stg4_3, \ - stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ - stg4_4, stg4_5, stg4_6, stg4_7, \ - stp2_9, stp2_14, stp2_10, stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - 
tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ - stg6_0, stg4_0, stg6_0, stg4_0, \ - stp2_10, stp2_13, stp2_11, stp2_12) \ - } - -#define IDCT16_10 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \ - stg2_0, stg2_1, stg2_6, stg2_7, \ - stp1_8_0, stp1_15, stp1_11, stp1_12_0) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \ - stg3_0, stg3_1, \ - stp2_4, stp2_7) \ - \ - stp1_9 = stp1_8_0; \ - stp1_10 = stp1_11; \ - \ - stp1_13 = stp1_12_0; \ - stp1_14 = stp1_15; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \ - stg4_0, stg4_1, \ - stp1_0, stp1_1) \ - stp2_5 = stp2_4; \ - stp2_6 = stp2_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ - stg4_4, stg4_5, stg4_6, stg4_7, \ - stp2_9, stp2_14, stp2_10, stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_2 = stp1_1; \ - stp1_3 = stp1_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, 
DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ - stg6_0, stg4_0, stg6_0, stg4_0, \ - stp2_10, stp2_13, stp2_11, stp2_12) \ - } - -void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[16], l[16], r[16], *curr1; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_8_0, stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - - curr1 = 
l; - for (i = 0; i < 2; i++) { - // 1-D idct - - // Load input data. - in[0] = load_input_data(input); - in[8] = load_input_data(input + 8 * 1); - in[1] = load_input_data(input + 8 * 2); - in[9] = load_input_data(input + 8 * 3); - in[2] = load_input_data(input + 8 * 4); - in[10] = load_input_data(input + 8 * 5); - in[3] = load_input_data(input + 8 * 6); - in[11] = load_input_data(input + 8 * 7); - in[4] = load_input_data(input + 8 * 8); - in[12] = load_input_data(input + 8 * 9); - in[5] = load_input_data(input + 8 * 10); - in[13] = load_input_data(input + 8 * 11); - in[6] = load_input_data(input + 8 * 12); - in[14] = load_input_data(input + 8 * 13); - in[7] = load_input_data(input + 8 * 14); - in[15] = load_input_data(input + 8 * 15); - - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - - IDCT16 - - // Stage7 - curr1[0] = _mm_add_epi16(stp2_0, stp1_15); - curr1[1] = _mm_add_epi16(stp2_1, stp1_14); - curr1[2] = _mm_add_epi16(stp2_2, stp2_13); - curr1[3] = _mm_add_epi16(stp2_3, stp2_12); - curr1[4] = _mm_add_epi16(stp2_4, stp2_11); - curr1[5] = _mm_add_epi16(stp2_5, stp2_10); - curr1[6] = _mm_add_epi16(stp2_6, stp1_9); - curr1[7] = _mm_add_epi16(stp2_7, stp1_8); - curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); - curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); - curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); - curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); - curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); - curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); - curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); - curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); - - curr1 = r; - input += 128; - } - for (i = 0; i < 2; i++) { - int j; - // 1-D idct - array_transpose_8x8(l + i * 8, in); - array_transpose_8x8(r + i * 8, in + 8); - - IDCT16 - - // 2-D - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); - - for (j = 0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a, i; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); - - dc_value = _mm_set1_epi16(a); - - for (i = 0; i < 16; ++i) { - RECON_AND_STORE(dest + 0, dc_value); - RECON_AND_STORE(dest + 8, dc_value); - dest += stride; - } -} - -static void iadst16_8col(__m128i *in) { - // perform 16x16 1-D ADST for 8 columns - __m128i s[16], x[16], u[32], v[32]; - const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); - 
const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); - const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); - const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); - - u[0] = _mm_unpacklo_epi16(in[15], in[0]); - u[1] = _mm_unpackhi_epi16(in[15], in[0]); - u[2] = _mm_unpacklo_epi16(in[13], in[2]); - u[3] = _mm_unpackhi_epi16(in[13], in[2]); - u[4] = _mm_unpacklo_epi16(in[11], in[4]); - u[5] = _mm_unpackhi_epi16(in[11], in[4]); - u[6] = _mm_unpacklo_epi16(in[9], in[6]); - u[7] = _mm_unpackhi_epi16(in[9], in[6]); - u[8] = _mm_unpacklo_epi16(in[7], in[8]); - u[9] = _mm_unpackhi_epi16(in[7], in[8]); - u[10] = _mm_unpacklo_epi16(in[5], in[10]); - u[11] = _mm_unpackhi_epi16(in[5], in[10]); - u[12] = _mm_unpacklo_epi16(in[3], in[12]); - u[13] = _mm_unpackhi_epi16(in[3], in[12]); - u[14] = _mm_unpacklo_epi16(in[1], in[14]); - u[15] = _mm_unpackhi_epi16(in[1], in[14]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); - v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); - v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); - v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); - v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); - v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); - v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); - v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); - v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); - v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); - v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); - v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); - v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); - v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); - v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); - v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); - v[16] = 
_mm_madd_epi16(u[8], k__cospi_p17_p15); - v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); - v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); - v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); - v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); - v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); - v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); - v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); - v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); - v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); - v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); - v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); - v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); - v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); - v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); - v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); - - u[0] = _mm_add_epi32(v[0], v[16]); - u[1] = _mm_add_epi32(v[1], v[17]); - u[2] = _mm_add_epi32(v[2], v[18]); - u[3] = _mm_add_epi32(v[3], v[19]); - u[4] = _mm_add_epi32(v[4], v[20]); - u[5] = _mm_add_epi32(v[5], v[21]); - u[6] = _mm_add_epi32(v[6], v[22]); - u[7] = _mm_add_epi32(v[7], v[23]); - u[8] = _mm_add_epi32(v[8], v[24]); - u[9] = _mm_add_epi32(v[9], v[25]); - u[10] = _mm_add_epi32(v[10], v[26]); - u[11] = _mm_add_epi32(v[11], v[27]); - u[12] = _mm_add_epi32(v[12], v[28]); - u[13] = _mm_add_epi32(v[13], v[29]); - u[14] = _mm_add_epi32(v[14], v[30]); - u[15] = _mm_add_epi32(v[15], v[31]); - u[16] = _mm_sub_epi32(v[0], v[16]); - u[17] = _mm_sub_epi32(v[1], v[17]); - u[18] = _mm_sub_epi32(v[2], v[18]); - u[19] = _mm_sub_epi32(v[3], v[19]); - u[20] = _mm_sub_epi32(v[4], v[20]); - u[21] = _mm_sub_epi32(v[5], v[21]); - u[22] = _mm_sub_epi32(v[6], v[22]); - u[23] = _mm_sub_epi32(v[7], v[23]); - u[24] = _mm_sub_epi32(v[8], v[24]); - u[25] = _mm_sub_epi32(v[9], v[25]); - u[26] = _mm_sub_epi32(v[10], v[26]); - u[27] = _mm_sub_epi32(v[11], v[27]); - u[28] = _mm_sub_epi32(v[12], v[28]); - u[29] = _mm_sub_epi32(v[13], v[29]); - u[30] = _mm_sub_epi32(v[14], v[30]); - u[31] = _mm_sub_epi32(v[15], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); - v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); - v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); - v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); - v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); - v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); - v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); - v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); - v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); - v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); - v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); - v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); - v[28] = 
_mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); - v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); - v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); - v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); - u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); - u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); - u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); - u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); - u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); - u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); - u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); - u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); - u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); - u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); - u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); - u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); - u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); - u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); - u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); - - s[0] = _mm_packs_epi32(u[0], u[1]); - s[1] = _mm_packs_epi32(u[2], u[3]); - s[2] = _mm_packs_epi32(u[4], u[5]); - s[3] = _mm_packs_epi32(u[6], u[7]); - s[4] = _mm_packs_epi32(u[8], u[9]); - s[5] = _mm_packs_epi32(u[10], u[11]); - s[6] = _mm_packs_epi32(u[12], u[13]); - s[7] = _mm_packs_epi32(u[14], u[15]); - s[8] = _mm_packs_epi32(u[16], u[17]); - s[9] = _mm_packs_epi32(u[18], u[19]); - s[10] = _mm_packs_epi32(u[20], u[21]); - s[11] = _mm_packs_epi32(u[22], u[23]); - s[12] = _mm_packs_epi32(u[24], u[25]); - s[13] = _mm_packs_epi32(u[26], u[27]); - s[14] = _mm_packs_epi32(u[28], u[29]); - s[15] = _mm_packs_epi32(u[30], u[31]); - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[8], s[9]); - u[1] = _mm_unpackhi_epi16(s[8], s[9]); - u[2] = _mm_unpacklo_epi16(s[10], s[11]); - u[3] = _mm_unpackhi_epi16(s[10], s[11]); - u[4] = _mm_unpacklo_epi16(s[12], s[13]); - u[5] = _mm_unpackhi_epi16(s[12], s[13]); - u[6] = _mm_unpacklo_epi16(s[14], s[15]); - u[7] = _mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); - v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); - v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); - v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); - v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); - v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); - v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); - v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); - v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); - v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); - v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); - v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); - v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); - v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); - v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); - v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); - - u[0] 
= _mm_add_epi32(v[0], v[8]); - u[1] = _mm_add_epi32(v[1], v[9]); - u[2] = _mm_add_epi32(v[2], v[10]); - u[3] = _mm_add_epi32(v[3], v[11]); - u[4] = _mm_add_epi32(v[4], v[12]); - u[5] = _mm_add_epi32(v[5], v[13]); - u[6] = _mm_add_epi32(v[6], v[14]); - u[7] = _mm_add_epi32(v[7], v[15]); - u[8] = _mm_sub_epi32(v[0], v[8]); - u[9] = _mm_sub_epi32(v[1], v[9]); - u[10] = _mm_sub_epi32(v[2], v[10]); - u[11] = _mm_sub_epi32(v[3], v[11]); - u[12] = _mm_sub_epi32(v[4], v[12]); - u[13] = _mm_sub_epi32(v[5], v[13]); - u[14] = _mm_sub_epi32(v[6], v[14]); - u[15] = _mm_sub_epi32(v[7], v[15]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - - x[0] = _mm_add_epi16(s[0], s[4]); - x[1] = _mm_add_epi16(s[1], s[5]); - x[2] = _mm_add_epi16(s[2], s[6]); - x[3] = _mm_add_epi16(s[3], s[7]); - x[4] = _mm_sub_epi16(s[0], s[4]); - x[5] = _mm_sub_epi16(s[1], s[5]); - x[6] = _mm_sub_epi16(s[2], s[6]); - x[7] = _mm_sub_epi16(s[3], s[7]); - x[8] = _mm_packs_epi32(u[0], u[1]); - x[9] = _mm_packs_epi32(u[2], u[3]); - x[10] = _mm_packs_epi32(u[4], u[5]); - x[11] = _mm_packs_epi32(u[6], u[7]); - x[12] = _mm_packs_epi32(u[8], u[9]); - x[13] = _mm_packs_epi32(u[10], u[11]); - x[14] = _mm_packs_epi32(u[12], u[13]); - x[15] = _mm_packs_epi32(u[14], u[15]); - - // stage 3 - u[0] = _mm_unpacklo_epi16(x[4], x[5]); - u[1] = _mm_unpackhi_epi16(x[4], x[5]); - u[2] = _mm_unpacklo_epi16(x[6], x[7]); - u[3] = _mm_unpackhi_epi16(x[6], x[7]); - u[4] = _mm_unpacklo_epi16(x[12], x[13]); - u[5] = _mm_unpackhi_epi16(x[12], x[13]); - u[6] = _mm_unpacklo_epi16(x[14], x[15]); - u[7] = _mm_unpackhi_epi16(x[14], x[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); - v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); - v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); - v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); - v[7] = _mm_madd_epi16(u[3], 
k__cospi_p08_p24); - v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); - v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); - v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); - v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); - v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); - v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); - v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); - v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], v[4]); - u[1] = _mm_add_epi32(v[1], v[5]); - u[2] = _mm_add_epi32(v[2], v[6]); - u[3] = _mm_add_epi32(v[3], v[7]); - u[4] = _mm_sub_epi32(v[0], v[4]); - u[5] = _mm_sub_epi32(v[1], v[5]); - u[6] = _mm_sub_epi32(v[2], v[6]); - u[7] = _mm_sub_epi32(v[3], v[7]); - u[8] = _mm_add_epi32(v[8], v[12]); - u[9] = _mm_add_epi32(v[9], v[13]); - u[10] = _mm_add_epi32(v[10], v[14]); - u[11] = _mm_add_epi32(v[11], v[15]); - u[12] = _mm_sub_epi32(v[8], v[12]); - u[13] = _mm_sub_epi32(v[9], v[13]); - u[14] = _mm_sub_epi32(v[10], v[14]); - u[15] = _mm_sub_epi32(v[11], v[15]); - - u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[0] = _mm_add_epi16(x[0], x[2]); - s[1] = _mm_add_epi16(x[1], x[3]); - s[2] = _mm_sub_epi16(x[0], x[2]); - s[3] = _mm_sub_epi16(x[1], x[3]); - s[4] = _mm_packs_epi32(v[0], v[1]); - s[5] = _mm_packs_epi32(v[2], v[3]); - s[6] = _mm_packs_epi32(v[4], v[5]); - s[7] = _mm_packs_epi32(v[6], v[7]); - s[8] = _mm_add_epi16(x[8], x[10]); - s[9] = _mm_add_epi16(x[9], x[11]); - s[10] = _mm_sub_epi16(x[8], x[10]); - s[11] = _mm_sub_epi16(x[9], x[11]); - s[12] = _mm_packs_epi32(v[8], v[9]); - s[13] = _mm_packs_epi32(v[10], v[11]); - s[14] = _mm_packs_epi32(v[12], v[13]); - s[15] = _mm_packs_epi32(v[14], v[15]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(s[2], s[3]); - u[1] = _mm_unpackhi_epi16(s[2], s[3]); - u[2] = _mm_unpacklo_epi16(s[6], s[7]); - u[3] = _mm_unpackhi_epi16(s[6], s[7]); - u[4] = _mm_unpacklo_epi16(s[10], s[11]); - u[5] = _mm_unpackhi_epi16(s[10], s[11]); - u[6] = _mm_unpacklo_epi16(s[14], 
s[15]); - u[7] = _mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); - v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); - v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); - v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); - v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); - v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); - v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); - v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - in[0] = s[0]; - in[1] = _mm_sub_epi16(kZero, s[8]); - in[2] = s[12]; - in[3] = _mm_sub_epi16(kZero, s[4]); - in[4] = _mm_packs_epi32(v[4], v[5]); - in[5] = _mm_packs_epi32(v[12], v[13]); - in[6] = _mm_packs_epi32(v[8], v[9]); - in[7] = _mm_packs_epi32(v[0], v[1]); - in[8] = _mm_packs_epi32(v[2], v[3]); - in[9] = _mm_packs_epi32(v[10], v[11]); - in[10] = _mm_packs_epi32(v[14], v[15]); - in[11] = _mm_packs_epi32(v[6], v[7]); - in[12] = s[5]; - in[13] = _mm_sub_epi16(kZero, s[13]); - in[14] = s[9]; - in[15] = _mm_sub_epi16(kZero, s[1]); -} - -static void idct16_8col(__m128i *in) { - const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i 
k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i v[16], u[16], s[16], t[16]; - - // stage 1 - s[0] = in[0]; - s[1] = in[8]; - s[2] = in[4]; - s[3] = in[12]; - s[4] = in[2]; - s[5] = in[10]; - s[6] = in[6]; - s[7] = in[14]; - s[8] = in[1]; - s[9] = in[9]; - s[10] = in[5]; - s[11] = in[13]; - s[12] = in[3]; - s[13] = in[11]; - s[14] = in[7]; - s[15] = in[15]; - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[8], s[15]); - u[1] = _mm_unpackhi_epi16(s[8], s[15]); - u[2] = _mm_unpacklo_epi16(s[9], s[14]); - u[3] = _mm_unpackhi_epi16(s[9], s[14]); - u[4] = _mm_unpacklo_epi16(s[10], s[13]); - u[5] = _mm_unpackhi_epi16(s[10], s[13]); - u[6] = _mm_unpacklo_epi16(s[11], s[12]); - u[7] = _mm_unpackhi_epi16(s[11], s[12]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); - v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); - v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); - v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); - v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); - v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); - v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); - v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); - v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); - v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); - v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); - v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); - v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); - v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); - v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); - v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], 
DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[8] = _mm_packs_epi32(u[0], u[1]); - s[15] = _mm_packs_epi32(u[2], u[3]); - s[9] = _mm_packs_epi32(u[4], u[5]); - s[14] = _mm_packs_epi32(u[6], u[7]); - s[10] = _mm_packs_epi32(u[8], u[9]); - s[13] = _mm_packs_epi32(u[10], u[11]); - s[11] = _mm_packs_epi32(u[12], u[13]); - s[12] = _mm_packs_epi32(u[14], u[15]); - - // stage 3 - t[0] = s[0]; - t[1] = s[1]; - t[2] = s[2]; - t[3] = s[3]; - u[0] = _mm_unpacklo_epi16(s[4], s[7]); - u[1] = _mm_unpackhi_epi16(s[4], s[7]); - u[2] = _mm_unpacklo_epi16(s[5], s[6]); - u[3] = _mm_unpackhi_epi16(s[5], s[6]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); - v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); - v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); - v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); - v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); - v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); - v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); - v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[4] = _mm_packs_epi32(u[0], u[1]); - t[7] = _mm_packs_epi32(u[2], u[3]); - t[5] = _mm_packs_epi32(u[4], u[5]); - t[6] = _mm_packs_epi32(u[6], u[7]); - t[8] = _mm_add_epi16(s[8], s[9]); - t[9] = _mm_sub_epi16(s[8], s[9]); - t[10] = _mm_sub_epi16(s[11], s[10]); - t[11] = _mm_add_epi16(s[10], s[11]); - t[12] = _mm_add_epi16(s[12], s[13]); - t[13] = _mm_sub_epi16(s[12], s[13]); - t[14] = _mm_sub_epi16(s[15], s[14]); - t[15] = _mm_add_epi16(s[14], s[15]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(t[0], t[1]); - u[1] = _mm_unpackhi_epi16(t[0], t[1]); - u[2] = _mm_unpacklo_epi16(t[2], t[3]); - u[3] = _mm_unpackhi_epi16(t[2], t[3]); - u[4] = _mm_unpacklo_epi16(t[9], t[14]); - u[5] = _mm_unpackhi_epi16(t[9], t[14]); - u[6] = _mm_unpacklo_epi16(t[10], t[13]); - u[7] = _mm_unpackhi_epi16(t[10], t[13]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); - v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); - v[6] 
= _mm_madd_epi16(u[2], k__cospi_p08_p24); - v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); - v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); - v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); - v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); - v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); - v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); - v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); - v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); - v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[0] = _mm_packs_epi32(u[0], u[1]); - s[1] = _mm_packs_epi32(u[2], u[3]); - s[2] = _mm_packs_epi32(u[4], u[5]); - s[3] = _mm_packs_epi32(u[6], u[7]); - s[4] = _mm_add_epi16(t[4], t[5]); - s[5] = _mm_sub_epi16(t[4], t[5]); - s[6] = _mm_sub_epi16(t[7], t[6]); - s[7] = _mm_add_epi16(t[6], t[7]); - s[8] = t[8]; - s[15] = t[15]; - s[9] = _mm_packs_epi32(u[8], u[9]); - s[14] = _mm_packs_epi32(u[10], u[11]); - s[10] = _mm_packs_epi32(u[12], u[13]); - s[13] = _mm_packs_epi32(u[14], u[15]); - s[11] = t[11]; - s[12] = t[12]; - - // stage 5 - t[0] = _mm_add_epi16(s[0], s[3]); - t[1] = _mm_add_epi16(s[1], s[2]); - t[2] = _mm_sub_epi16(s[1], s[2]); - t[3] = _mm_sub_epi16(s[0], s[3]); - t[4] = s[4]; - t[7] = s[7]; - - u[0] = _mm_unpacklo_epi16(s[5], s[6]); - u[1] = _mm_unpackhi_epi16(s[5], s[6]); - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - t[5] = _mm_packs_epi32(u[0], u[1]); - 
t[6] = _mm_packs_epi32(u[2], u[3]); - - t[8] = _mm_add_epi16(s[8], s[11]); - t[9] = _mm_add_epi16(s[9], s[10]); - t[10] = _mm_sub_epi16(s[9], s[10]); - t[11] = _mm_sub_epi16(s[8], s[11]); - t[12] = _mm_sub_epi16(s[15], s[12]); - t[13] = _mm_sub_epi16(s[14], s[13]); - t[14] = _mm_add_epi16(s[13], s[14]); - t[15] = _mm_add_epi16(s[12], s[15]); - - // stage 6 - s[0] = _mm_add_epi16(t[0], t[7]); - s[1] = _mm_add_epi16(t[1], t[6]); - s[2] = _mm_add_epi16(t[2], t[5]); - s[3] = _mm_add_epi16(t[3], t[4]); - s[4] = _mm_sub_epi16(t[3], t[4]); - s[5] = _mm_sub_epi16(t[2], t[5]); - s[6] = _mm_sub_epi16(t[1], t[6]); - s[7] = _mm_sub_epi16(t[0], t[7]); - s[8] = t[8]; - s[9] = t[9]; - - u[0] = _mm_unpacklo_epi16(t[10], t[13]); - u[1] = _mm_unpackhi_epi16(t[10], t[13]); - u[2] = _mm_unpacklo_epi16(t[11], t[12]); - u[3] = _mm_unpackhi_epi16(t[11], t[12]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - s[10] = _mm_packs_epi32(u[0], u[1]); - s[13] = _mm_packs_epi32(u[2], u[3]); - s[11] = _mm_packs_epi32(u[4], u[5]); - s[12] = _mm_packs_epi32(u[6], u[7]); - s[14] = t[14]; - s[15] = t[15]; - - // stage 7 - in[0] = _mm_add_epi16(s[0], s[15]); - in[1] = _mm_add_epi16(s[1], s[14]); - in[2] = _mm_add_epi16(s[2], s[13]); - in[3] = _mm_add_epi16(s[3], s[12]); - in[4] = _mm_add_epi16(s[4], s[11]); - in[5] = _mm_add_epi16(s[5], s[10]); - in[6] = _mm_add_epi16(s[6], s[9]); - in[7] = _mm_add_epi16(s[7], s[8]); - in[8] = _mm_sub_epi16(s[7], s[8]); - in[9] = _mm_sub_epi16(s[6], s[9]); - in[10] = _mm_sub_epi16(s[5], s[10]); - in[11] = _mm_sub_epi16(s[4], s[11]); - in[12] = _mm_sub_epi16(s[3], s[12]); - in[13] = _mm_sub_epi16(s[2], s[13]); - in[14] = _mm_sub_epi16(s[1], s[14]); - in[15] = _mm_sub_epi16(s[0], s[15]); -} - -void idct16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - idct16_8col(in0); - idct16_8col(in1); -} - -void iadst16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - iadst16_8col(in0); - iadst16_8col(in1); -} - -void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_6 = 
pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i in[16], l[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_8_0, stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - // First 1-D inverse DCT - // Load input data. - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8 * 2); - in[2] = load_input_data(input + 8 * 4); - in[3] = load_input_data(input + 8 * 6); - - TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); - - // Stage2 - { - const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); - const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); - - tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); - tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); - tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); - tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - - stp2_8 = _mm_packs_epi32(tmp0, tmp2); - stp2_11 = _mm_packs_epi32(tmp5, tmp7); - } - - // Stage3 - { - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero); - - tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); - tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - - stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); - stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); - - stp1_4 = _mm_packs_epi32(tmp0, tmp2); - } - - // Stage4 - { - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); - - tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); - tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); - tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); - tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); - tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); - tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - tmp7 = 
_mm_srai_epi32(tmp7, DCT_CONST_BITS); - - stp1_0 = _mm_packs_epi32(tmp0, tmp0); - stp1_1 = _mm_packs_epi32(tmp2, tmp2); - stp2_9 = _mm_packs_epi32(tmp1, tmp3); - stp2_10 = _mm_packs_epi32(tmp5, tmp7); - - stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); - } - - // Stage5 and Stage6 - { - tmp0 = _mm_add_epi16(stp2_8, stp2_11); - tmp1 = _mm_sub_epi16(stp2_8, stp2_11); - tmp2 = _mm_add_epi16(stp2_9, stp2_10); - tmp3 = _mm_sub_epi16(stp2_9, stp2_10); - - stp1_9 = _mm_unpacklo_epi64(tmp2, zero); - stp1_10 = _mm_unpacklo_epi64(tmp3, zero); - stp1_8 = _mm_unpacklo_epi64(tmp0, zero); - stp1_11 = _mm_unpacklo_epi64(tmp1, zero); - - stp1_13 = _mm_unpackhi_epi64(tmp3, zero); - stp1_14 = _mm_unpackhi_epi64(tmp2, zero); - stp1_12 = _mm_unpackhi_epi64(tmp1, zero); - stp1_15 = _mm_unpackhi_epi64(tmp0, zero); - } - - // Stage6 - { - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); - - tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); - tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); - tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); - tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); - tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); - tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); - - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_6 = _mm_packs_epi32(tmp3, tmp1); - - stp2_10 = _mm_packs_epi32(tmp0, zero); - stp2_13 = _mm_packs_epi32(tmp2, zero); - stp2_11 = _mm_packs_epi32(tmp4, zero); - stp2_12 = _mm_packs_epi32(tmp6, zero); - - tmp0 = _mm_add_epi16(stp1_0, stp1_4); - tmp1 = _mm_sub_epi16(stp1_0, stp1_4); - tmp2 = _mm_add_epi16(stp1_1, stp1_6); - tmp3 = _mm_sub_epi16(stp1_1, stp1_6); - - stp2_0 = _mm_unpackhi_epi64(tmp0, zero); - stp2_1 = _mm_unpacklo_epi64(tmp2, zero); - stp2_2 = _mm_unpackhi_epi64(tmp2, zero); - stp2_3 = _mm_unpacklo_epi64(tmp0, zero); - stp2_4 = _mm_unpacklo_epi64(tmp1, zero); - stp2_5 = _mm_unpackhi_epi64(tmp3, zero); - stp2_6 = _mm_unpacklo_epi64(tmp3, zero); - stp2_7 = _mm_unpackhi_epi64(tmp1, zero); - } - - // Stage7. Left 8x16 only. 
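- // Butterfly: l[0..7] below are sums of mirrored stage-6 terms and l[8..15] are the matching differences.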
- l[0] = _mm_add_epi16(stp2_0, stp1_15); - l[1] = _mm_add_epi16(stp2_1, stp1_14); - l[2] = _mm_add_epi16(stp2_2, stp2_13); - l[3] = _mm_add_epi16(stp2_3, stp2_12); - l[4] = _mm_add_epi16(stp2_4, stp2_11); - l[5] = _mm_add_epi16(stp2_5, stp2_10); - l[6] = _mm_add_epi16(stp2_6, stp1_9); - l[7] = _mm_add_epi16(stp2_7, stp1_8); - l[8] = _mm_sub_epi16(stp2_7, stp1_8); - l[9] = _mm_sub_epi16(stp2_6, stp1_9); - l[10] = _mm_sub_epi16(stp2_5, stp2_10); - l[11] = _mm_sub_epi16(stp2_4, stp2_11); - l[12] = _mm_sub_epi16(stp2_3, stp2_12); - l[13] = _mm_sub_epi16(stp2_2, stp2_13); - l[14] = _mm_sub_epi16(stp2_1, stp1_14); - l[15] = _mm_sub_epi16(stp2_0, stp1_15); - - // Second 1-D inverse transform, performed per 8x16 block - for (i = 0; i < 2; i++) { - int j; - array_transpose_4X8(l + 8 * i, in); - - IDCT16_10 - - // Stage7 - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); - - for (j = 0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -#define LOAD_DQCOEFF(reg, input) \ - { \ - reg = load_input_data(input); \ - input += 8; \ - } \ - -#define IDCT32_34 \ -/* Stage1 */ \ -{ \ - const __m128i zero = _mm_setzero_si128();\ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ - \ - const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ - \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \ - stg1_1, stp1_16, stp1_31); \ - MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \ - stg1_7, stp1_19, stp1_28); \ - MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \ - stg1_9, stp1_20, stp1_27); \ - MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \ - stg1_15, stp1_23, stp1_24); \ -} \ -\ -/* Stage2 */ \ -{ \ - const __m128i zero = _mm_setzero_si128();\ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ - \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \ - stg2_1, stp2_8, stp2_15); \ - MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \ - stg2_7, stp2_11, stp2_12); \ - \ - stp2_16 = stp1_16; \ - stp2_19 = stp1_19; \ - \ - stp2_20 = stp1_20; \ - stp2_23 = stp1_23; \ - \ - stp2_24 = stp1_24; \ - stp2_27 = stp1_27; \ - \ - stp2_28 = stp1_28; \ - stp2_31 = stp1_31; \ -} \ -\ -/* Stage3 */ \ -{ \ - const __m128i zero = _mm_setzero_si128();\ - const __m128i lo_4_28 = 
_mm_unpacklo_epi16(in[4], zero); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ - \ - MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \ - stg3_1, stp1_4, stp1_7); \ - \ - stp1_8 = stp2_8; \ - stp1_11 = stp2_11; \ - stp1_12 = stp2_12; \ - stp1_15 = stp2_15; \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ - stp1_18, stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ - stp1_22, stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ -} \ -\ -/* Stage4 */ \ -{ \ - const __m128i zero = _mm_setzero_si128();\ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \ - stg4_1, stp2_0, stp2_1); \ - \ - stp2_4 = stp1_4; \ - stp2_5 = stp1_4; \ - stp2_6 = stp1_7; \ - stp2_7 = stp1_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ - stp2_10, stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ -} \ -\ -/* Stage5 */ \ -{ \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = 
_mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = stp2_0; \ - stp1_1 = stp2_1; \ - stp1_2 = stp2_1; \ - stp1_3 = stp2_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ - stp1_19, stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ -} \ -\ -/* Stage6 */ \ -{ \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ - stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ - stp2_13, stp2_11, stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ -} \ -\ -/* Stage7 */ \ -{ \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - 
const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ -} - - -#define IDCT32 \ -/* Stage1 */ \ -{ \ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ - const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ - const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ - \ - const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \ - const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ - const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ - const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ - const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \ - \ - const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ - const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ - stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ - stp1_17, stp1_30) \ - MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ - stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ - stp1_19, stp1_28) \ - MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ - stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ - stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ -} \ -\ -/* Stage2 */ \ -{ \ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ - const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ - const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ - \ - 
const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ - const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ - stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ - stp2_14) \ - MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ - stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ - stp2_11, stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ - stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ - stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ - \ - stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ - stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ - stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ - stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ - \ - stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ - stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ - stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ -} \ -\ -/* Stage3 */ \ -{ \ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ - const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ - const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - \ - MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ - stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ - stp1_6) \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ - stp1_18, stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ - stp1_22, stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ -} \ -\ -/* Stage4 */ \ -{ \ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ - const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ - const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const 
__m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \ - stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \ - stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ - stp2_10, stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ -} \ -\ -/* Stage5 */ \ -{ \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ - 
stp1_19, stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ -} \ -\ -/* Stage6 */ \ -{ \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ - stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ - stp2_13, stp2_11, stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ -} \ -\ -/* Stage7 */ \ -{ \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, 
hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ -} - -// Only the upper-left 8x8 block has non-zero coefficients. -void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - - // idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[32], col[32]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, - stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, - stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, - stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, - stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - - // Load input data. Only need to load the top left 8x8 block. - in[0] = load_input_data(input); - in[1] = load_input_data(input + 32); - in[2] = load_input_data(input + 64); - in[3] = load_input_data(input + 96); - in[4] = load_input_data(input + 128); - in[5] = load_input_data(input + 160); - in[6] = load_input_data(input + 192); - in[7] = load_input_data(input + 224); - - for (i = 8; i < 32; ++i) { - in[i] = _mm_setzero_si128(); - } - - array_transpose_8x8(in, in); - // TODO(hkuang): The following transposes are unnecessary, but removing them - // leads to a performance drop on some devices.
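- // NOTE: in[8] .. in[31] were zeroed above, so the three transposes below only shuffle zero vectors.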
- array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); - - IDCT32_34 - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[0] = _mm_add_epi16(stp1_0, stp1_31); - col[1] = _mm_add_epi16(stp1_1, stp1_30); - col[2] = _mm_add_epi16(stp1_2, stp1_29); - col[3] = _mm_add_epi16(stp1_3, stp1_28); - col[4] = _mm_add_epi16(stp1_4, stp1_27); - col[5] = _mm_add_epi16(stp1_5, stp1_26); - col[6] = _mm_add_epi16(stp1_6, stp1_25); - col[7] = _mm_add_epi16(stp1_7, stp1_24); - col[8] = _mm_add_epi16(stp1_8, stp1_23); - col[9] = _mm_add_epi16(stp1_9, stp1_22); - col[10] = _mm_add_epi16(stp1_10, stp1_21); - col[11] = _mm_add_epi16(stp1_11, stp1_20); - col[12] = _mm_add_epi16(stp1_12, stp1_19); - col[13] = _mm_add_epi16(stp1_13, stp1_18); - col[14] = _mm_add_epi16(stp1_14, stp1_17); - col[15] = _mm_add_epi16(stp1_15, stp1_16); - col[16] = _mm_sub_epi16(stp1_15, stp1_16); - col[17] = _mm_sub_epi16(stp1_14, stp1_17); - col[18] = _mm_sub_epi16(stp1_13, stp1_18); - col[19] = _mm_sub_epi16(stp1_12, stp1_19); - col[20] = _mm_sub_epi16(stp1_11, stp1_20); - col[21] = _mm_sub_epi16(stp1_10, stp1_21); - col[22] = _mm_sub_epi16(stp1_9, stp1_22); - col[23] = _mm_sub_epi16(stp1_8, stp1_23); - col[24] = _mm_sub_epi16(stp1_7, stp1_24); - col[25] = _mm_sub_epi16(stp1_6, stp1_25); - col[26] = _mm_sub_epi16(stp1_5, stp1_26); - col[27] = _mm_sub_epi16(stp1_4, stp1_27); - col[28] = _mm_sub_epi16(stp1_3, stp1_28); - col[29] = _mm_sub_epi16(stp1_2, stp1_29); - col[30] = _mm_sub_epi16(stp1_1, stp1_30); - col[31] = _mm_sub_epi16(stp1_0, stp1_31); - for (i = 0; i < 4; i++) { - int j; - const __m128i zero = _mm_setzero_si128(); - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + i * 8, in); - IDCT32_34 - - // 2_D: Calculate the results and store them to destination. 
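- // Final butterfly: in[0..15] are sums and in[16..31] are differences of mirrored stp1 terms.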
- in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); - - for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - // idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = 
pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[32], col[128], zero_idx[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, - stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, - stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, - stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, - stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i, j, i32; - - for (i = 0; i < 4; i++) { - i32 = (i << 5); - // First 1-D idct - // Load input data. 
- LOAD_DQCOEFF(in[0], input); - LOAD_DQCOEFF(in[8], input); - LOAD_DQCOEFF(in[16], input); - LOAD_DQCOEFF(in[24], input); - LOAD_DQCOEFF(in[1], input); - LOAD_DQCOEFF(in[9], input); - LOAD_DQCOEFF(in[17], input); - LOAD_DQCOEFF(in[25], input); - LOAD_DQCOEFF(in[2], input); - LOAD_DQCOEFF(in[10], input); - LOAD_DQCOEFF(in[18], input); - LOAD_DQCOEFF(in[26], input); - LOAD_DQCOEFF(in[3], input); - LOAD_DQCOEFF(in[11], input); - LOAD_DQCOEFF(in[19], input); - LOAD_DQCOEFF(in[27], input); - - LOAD_DQCOEFF(in[4], input); - LOAD_DQCOEFF(in[12], input); - LOAD_DQCOEFF(in[20], input); - LOAD_DQCOEFF(in[28], input); - LOAD_DQCOEFF(in[5], input); - LOAD_DQCOEFF(in[13], input); - LOAD_DQCOEFF(in[21], input); - LOAD_DQCOEFF(in[29], input); - LOAD_DQCOEFF(in[6], input); - LOAD_DQCOEFF(in[14], input); - LOAD_DQCOEFF(in[22], input); - LOAD_DQCOEFF(in[30], input); - LOAD_DQCOEFF(in[7], input); - LOAD_DQCOEFF(in[15], input); - LOAD_DQCOEFF(in[23], input); - LOAD_DQCOEFF(in[31], input); - - // checking if all entries are zero - zero_idx[0] = _mm_or_si128(in[0], in[1]); - zero_idx[1] = _mm_or_si128(in[2], in[3]); - zero_idx[2] = _mm_or_si128(in[4], in[5]); - zero_idx[3] = _mm_or_si128(in[6], in[7]); - zero_idx[4] = _mm_or_si128(in[8], in[9]); - zero_idx[5] = _mm_or_si128(in[10], in[11]); - zero_idx[6] = _mm_or_si128(in[12], in[13]); - zero_idx[7] = _mm_or_si128(in[14], in[15]); - zero_idx[8] = _mm_or_si128(in[16], in[17]); - zero_idx[9] = _mm_or_si128(in[18], in[19]); - zero_idx[10] = _mm_or_si128(in[20], in[21]); - zero_idx[11] = _mm_or_si128(in[22], in[23]); - zero_idx[12] = _mm_or_si128(in[24], in[25]); - zero_idx[13] = _mm_or_si128(in[26], in[27]); - zero_idx[14] = _mm_or_si128(in[28], in[29]); - zero_idx[15] = _mm_or_si128(in[30], in[31]); - - zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); - zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); - - zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); - - if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { - col[i32 + 0] = _mm_setzero_si128(); - col[i32 + 1] = _mm_setzero_si128(); - col[i32 + 2] = _mm_setzero_si128(); - col[i32 + 3] = _mm_setzero_si128(); - col[i32 + 4] = _mm_setzero_si128(); - col[i32 + 5] = _mm_setzero_si128(); - col[i32 + 6] = _mm_setzero_si128(); - col[i32 + 7] = _mm_setzero_si128(); - col[i32 + 8] = _mm_setzero_si128(); - col[i32 + 9] = _mm_setzero_si128(); - col[i32 + 10] = _mm_setzero_si128(); - col[i32 + 11] = _mm_setzero_si128(); - col[i32 + 12] = _mm_setzero_si128(); - col[i32 + 13] = _mm_setzero_si128(); - col[i32 + 14] = _mm_setzero_si128(); - col[i32 + 15] = _mm_setzero_si128(); - col[i32 + 16] = _mm_setzero_si128(); - col[i32 + 17] = _mm_setzero_si128(); - col[i32 + 18] = _mm_setzero_si128(); - col[i32 + 19] = _mm_setzero_si128(); - col[i32 + 20] = _mm_setzero_si128(); - col[i32 + 21] = _mm_setzero_si128(); - col[i32 + 22] 
= _mm_setzero_si128(); - col[i32 + 23] = _mm_setzero_si128(); - col[i32 + 24] = _mm_setzero_si128(); - col[i32 + 25] = _mm_setzero_si128(); - col[i32 + 26] = _mm_setzero_si128(); - col[i32 + 27] = _mm_setzero_si128(); - col[i32 + 28] = _mm_setzero_si128(); - col[i32 + 29] = _mm_setzero_si128(); - col[i32 + 30] = _mm_setzero_si128(); - col[i32 + 31] = _mm_setzero_si128(); - continue; - } - - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); - - IDCT32 - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); - col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); - col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); - col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); - col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); - col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); - col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); - col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); - col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); - col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); - col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); - col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); - col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); - col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); - col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); - col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); - col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); - col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); - col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); - col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); - col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); - col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); - col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); - col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); - col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); - col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); - col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); - col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); - col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); - col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); - col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); - col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); - } - for (i = 0; i < 4; i++) { - // Second 1-D idct - j = i << 3; - - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + j, in); - array_transpose_8x8(col + j + 32, in + 8); - array_transpose_8x8(col + j + 64, in + 16); - array_transpose_8x8(col + j + 96, in + 24); - - IDCT32 - - // 2_D: Calculate the results and store them to destination. 
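(In the first pass above, the zero_idx OR-tree folds all 32 coefficient vectors of an 8x32 strip into a single register so that one compare-and-movemask decides whether the whole strip is zero and its IDCT can be skipped. An equivalent loop form of that test, as a sketch only; the helper name is not from libvpx:)

#include <emmintrin.h>

/* Returns non-zero iff every vector of the 8x32 strip is all zero. */
static int strip_is_all_zero(const __m128i in[32]) {
  __m128i acc = in[0];
  int k;
  for (k = 1; k < 32; ++k)
    acc = _mm_or_si128(acc, in[k]);
  /* All 16 bytes compare equal to zero -> movemask yields 0xFFFF. */
  return _mm_movemask_epi8(_mm_cmpeq_epi32(acc, _mm_setzero_si128())) ==
         0xFFFF;
}
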
- in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); - - for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a, j; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); - - dc_value = _mm_set1_epi16(a); - - for (j = 0; j < 32; ++j) { - RECON_AND_STORE(dest + 0 + j * stride, dc_value); - RECON_AND_STORE(dest + 8 + j * stride, dc_value); - RECON_AND_STORE(dest + 16 + j * stride, dc_value); - RECON_AND_STORE(dest + 24 + j * stride, dc_value); - } -} - -#if CONFIG_VP9_HIGHBITDEPTH -static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { - __m128i ubounded, retval; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); - ubounded = _mm_cmpgt_epi16(value, max); - retval = _mm_andnot_si128(ubounded, value); - ubounded = _mm_and_si128(ubounded, max); - retval = _mm_or_si128(retval, ubounded); - retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); - return retval; -} - -void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - __m128i inptr[4]; - __m128i sign_bits[2]; - __m128i temp_mm, min_input, max_input; - int test; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - int optimised_cols = 0; - const __m128i zero = _mm_set1_epi16(0); - const __m128i eight = _mm_set1_epi16(8); - const __m128i max = _mm_set1_epi16(12043); - const __m128i min = _mm_set1_epi16(-12043); - // Load input into __m128i - inptr[0] = _mm_loadu_si128((const __m128i *)input); - inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); - inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8)); - inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); - - // Pack to 16 bits - inptr[0] = 
_mm_packs_epi32(inptr[0], inptr[1]); - inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]); - - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (!test) { - // Do the row transform - idct4_sse2(inptr); - - // Check the min & max values - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (test) { - transpose_4x4(inptr); - sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero); - sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero); - inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); - inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); - inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); - inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); - _mm_storeu_si128((__m128i *)outptr, inptr[0]); - _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); - _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); - _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct4_c(input, outptr, bd); - input += 4; - outptr += 4; - } - } - - if (optimised_cols) { - idct4_sse2(inptr); - - // Final round and shift - inptr[0] = _mm_add_epi16(inptr[0], eight); - inptr[1] = _mm_add_epi16(inptr[1], eight); - - inptr[0] = _mm_srai_epi16(inptr[0], 4); - inptr[1] = _mm_srai_epi16(inptr[1], 4); - - // Reconstruction and Store - { - __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); - __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi64( - d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); - d2 = _mm_unpacklo_epi64( - d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); - d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); - d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); - // store input0 - _mm_storel_epi64((__m128i *)dest, d0); - // store input1 - d0 = _mm_srli_si128(d0, 8); - _mm_storel_epi64((__m128i *)(dest + stride), d0); - // store input2 - _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); - // store input3 - d2 = _mm_srli_si128(d2, 8); - _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[4], temp_out[4]; - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = out[j * 4 + i]; - vpx_highbd_idct4_c(temp_in, temp_out, bd); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); - } - } - } -} - -void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8]; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 
bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct8_sse2(inptr); - - // Find the min & max for the column transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - array_transpose_8x8(inptr, inptr); - for (i = 0; i < 8; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 8; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - } - - if (optimised_cols) { - idct8_sse2(inptr); - - // Final round & shift and Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } - } -} - -void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - // only first 4 row has non-zero coefs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], 
inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct8_sse2(inptr); - - // Find the min & max for the column transform - // N.B. Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_4X8(inptr, inptr); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - } - - if (optimised_cols) { - idct8_sse2(inptr); - - // Final round & shift and Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } - } -} - -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16]; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[32]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_set1_epi16(32); - const __m128i max = _mm_set1_epi16(3155); - const __m128i min = _mm_set1_epi16(-3155); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 16; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); - inptr[i + 16] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 32; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - 
min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct16_sse2(inptr, inptr + 16); - - // Find the min & max for the column transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 32; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - array_transpose_16x16(inptr, inptr + 16); - for (i = 0; i < 16; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); - sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); - temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 16; ++i) { - vpx_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; - } - } - - if (optimised_cols) { - idct16_sse2(inptr, inptr + 16); - - // Final round & shift and Reconstruction and Store - { - __m128i d[2]; - for (i = 0; i < 16; i++) { - inptr[i ] = _mm_add_epi16(inptr[i ], rounding); - inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8)); - inptr[i ] = _mm_srai_epi16(inptr[i ], 6); - inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[16], temp_out[16]; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j * 16 + i]; - vpx_highbd_idct16_c(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } - } -} - -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[32]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_set1_epi16(32); - const __m128i max = _mm_set1_epi16(3155); - const __m128i min = _mm_set1_epi16(-3155); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 16; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); - inptr[i] = 
_mm_packs_epi32(temp1, temp2); - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); - inptr[i + 16] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - // Since all non-zero dct coefficients are in upper-left 4x4 area, - // we only need to consider first 4 rows here. - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform (N.B. This transposes inptr) - idct16_sse2(inptr, inptr + 16); - - // Find the min & max for the column transform - // N.B. Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 16; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_8x8(inptr, inptr); - array_transpose_8x8(inptr + 8, inptr + 16); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); - sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); - temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; - } - } - - if (optimised_cols) { - idct16_sse2(inptr, inptr + 16); - - // Final round & shift and Reconstruction and Store - { - __m128i d[2]; - for (i = 0; i < 16; i++) { - inptr[i ] = _mm_add_epi16(inptr[i ], rounding); - inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8)); - inptr[i ] = _mm_srai_epi16(inptr[i ], 6); - inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[16], temp_out[16]; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j * 16 + i]; - vpx_highbd_idct16_c(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } - } -} 
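(Each highbd wrapper above follows one pattern: pack the 32-bit coefficients to 16 bits, bound-check every lane (+/-12043 for 4x4, +/-6201 for 8x8, +/-3155 for 16x16), and take the fast 16-bit SSE2 path only when no lane can overflow it, falling back to the C transform otherwise. A loop form of that range test, as a sketch; the helper name is illustrative:)

#include <emmintrin.h>

/* Non-zero when any 16-bit lane lies outside [-bound, +bound]. */
static int rows_exceed_bound(const __m128i *rows, int n, int16_t bound) {
  const __m128i hi_bound = _mm_set1_epi16(bound);
  const __m128i lo_bound = _mm_set1_epi16((int16_t)-bound);
  __m128i hi = rows[0], lo = rows[0];
  int k;
  for (k = 1; k < n; ++k) {
    hi = _mm_max_epi16(hi, rows[k]);
    lo = _mm_min_epi16(lo, rows[k]);
  }
  hi = _mm_cmpgt_epi16(hi, hi_bound);  /* lanes above +bound */
  lo = _mm_cmplt_epi16(lo, lo_bound);  /* lanes below -bound */
  return _mm_movemask_epi8(_mm_or_si128(hi, lo));
}
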
-#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.h b/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.h deleted file mode 100644 index bd520c18e5..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.h +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VPX_DSP_X86_INV_TXFM_SSE2_H_ -#define VPX_DSP_X86_INV_TXFM_SSE2_H_ - -#include <emmintrin.h> // SSE2 -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" -#include "vpx_dsp/inv_txfm.h" -#include "vpx_dsp/x86/txfm_common_sse2.h" - -// perform 8x8 transpose -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); -} - -#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - \ - in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ - in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ - } - -static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - - out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); - out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); - out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); - out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); -} - -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - 
array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; -} - -// Function to allow 8 bit optimisations to be used when profile 0 is used with -// highbitdepth enabled -static INLINE __m128i load_input_data(const tran_low_t *data) { -#if CONFIG_VP9_HIGHBITDEPTH - return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5], - data[6], data[7]); -#else - return _mm_load_si128((const __m128i *)data); -#endif -} - -static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) { - in[0] = load_input_data(input + 0 * 16); - in[1] = load_input_data(input + 1 * 16); - in[2] = load_input_data(input + 2 * 16); - in[3] = load_input_data(input + 3 * 16); - in[4] = load_input_data(input + 4 * 16); - in[5] = load_input_data(input + 5 * 16); - in[6] = load_input_data(input + 6 * 16); - in[7] = load_input_data(input + 7 * 16); - - in[8] = load_input_data(input + 8 * 16); - in[9] = load_input_data(input + 9 * 16); - in[10] = load_input_data(input + 10 * 16); - in[11] = load_input_data(input + 11 * 16); - in[12] = load_input_data(input + 12 * 16); - in[13] = load_input_data(input + 13 * 16); - in[14] = load_input_data(input + 14 * 16); - in[15] = load_input_data(input + 15 * 16); -} - -#define RECON_AND_STORE(dest, in_x) \ - { \ - __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - _mm_storel_epi64((__m128i *)(dest), d0); \ - } - -static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1<<5); - const __m128i zero = _mm_setzero_si128(); - // Final rounding and shift - in[0] = _mm_adds_epi16(in[0], final_rounding); - in[1] = _mm_adds_epi16(in[1], final_rounding); - in[2] = _mm_adds_epi16(in[2], final_rounding); - in[3] = _mm_adds_epi16(in[3], final_rounding); - in[4] = _mm_adds_epi16(in[4], final_rounding); - in[5] = _mm_adds_epi16(in[5], final_rounding); - in[6] = _mm_adds_epi16(in[6], final_rounding); - in[7] = _mm_adds_epi16(in[7], final_rounding); - in[8] = _mm_adds_epi16(in[8], final_rounding); - in[9] = _mm_adds_epi16(in[9], final_rounding); - in[10] = _mm_adds_epi16(in[10], final_rounding); - in[11] = _mm_adds_epi16(in[11], final_rounding); - in[12] = _mm_adds_epi16(in[12], final_rounding); - in[13] = _mm_adds_epi16(in[13], final_rounding); - in[14] = _mm_adds_epi16(in[14], final_rounding); - in[15] = _mm_adds_epi16(in[15], final_rounding); - - in[0] = _mm_srai_epi16(in[0], 6); - in[1] = _mm_srai_epi16(in[1], 6); - in[2] = _mm_srai_epi16(in[2], 6); - in[3] = _mm_srai_epi16(in[3], 6); - in[4] = _mm_srai_epi16(in[4], 6); - in[5] = _mm_srai_epi16(in[5], 6); - in[6] = _mm_srai_epi16(in[6], 6); - in[7] = _mm_srai_epi16(in[7], 6); - in[8] = _mm_srai_epi16(in[8], 6); - in[9] = _mm_srai_epi16(in[9], 6); - in[10] = _mm_srai_epi16(in[10], 6); - in[11] = _mm_srai_epi16(in[11], 6); - in[12] = _mm_srai_epi16(in[12], 6); - in[13] = _mm_srai_epi16(in[13], 6); - in[14] = _mm_srai_epi16(in[14], 6); - in[15] = _mm_srai_epi16(in[15], 6); - - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - 
RECON_AND_STORE(dest + 7 * stride, in[7]); - RECON_AND_STORE(dest + 8 * stride, in[8]); - RECON_AND_STORE(dest + 9 * stride, in[9]); - RECON_AND_STORE(dest + 10 * stride, in[10]); - RECON_AND_STORE(dest + 11 * stride, in[11]); - RECON_AND_STORE(dest + 12 * stride, in[12]); - RECON_AND_STORE(dest + 13 * stride, in[13]); - RECON_AND_STORE(dest + 14 * stride, in[14]); - RECON_AND_STORE(dest + 15 * stride, in[15]); -} - -void idct4_sse2(__m128i *in); -void idct8_sse2(__m128i *in); -void idct16_sse2(__m128i *in0, __m128i *in1); -void iadst4_sse2(__m128i *in); -void iadst8_sse2(__m128i *in); -void iadst16_sse2(__m128i *in0, __m128i *in1); - -#endif // VPX_DSP_X86_INV_TXFM_SSE2_H_ diff --git a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm deleted file mode 100644 index 20baf820f6..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm +++ /dev/null @@ -1,1793 +0,0 @@ -; -; Copyright (c) 2014 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -; This file provides SSSE3 version of the inverse transformation. Part -; of the functions are originally derived from the ffmpeg project. -; Note that the current version applies to x86 64-bit only. - -SECTION_RODATA - -pw_11585x2: times 8 dw 23170 - -pw_m2404x2: times 8 dw -2404*2 -pw_m4756x2: times 8 dw -4756*2 -pw_m5520x2: times 8 dw -5520*2 -pw_m8423x2: times 8 dw -8423*2 -pw_m9102x2: times 8 dw -9102*2 -pw_m10394x2: times 8 dw -10394*2 -pw_m11003x2: times 8 dw -11003*2 - -pw_16364x2: times 8 dw 16364*2 -pw_16305x2: times 8 dw 16305*2 -pw_16207x2: times 8 dw 16207*2 -pw_16069x2: times 8 dw 16069*2 -pw_15893x2: times 8 dw 15893*2 -pw_15679x2: times 8 dw 15679*2 -pw_15426x2: times 8 dw 15426*2 -pw_15137x2: times 8 dw 15137*2 -pw_14811x2: times 8 dw 14811*2 -pw_14449x2: times 8 dw 14449*2 -pw_14053x2: times 8 dw 14053*2 -pw_13623x2: times 8 dw 13623*2 -pw_13160x2: times 8 dw 13160*2 -pw_12665x2: times 8 dw 12665*2 -pw_12140x2: times 8 dw 12140*2 -pw__9760x2: times 8 dw 9760*2 -pw__7723x2: times 8 dw 7723*2 -pw__7005x2: times 8 dw 7005*2 -pw__6270x2: times 8 dw 6270*2 -pw__3981x2: times 8 dw 3981*2 -pw__3196x2: times 8 dw 3196*2 -pw__1606x2: times 8 dw 1606*2 -pw___804x2: times 8 dw 804*2 - -pd_8192: times 4 dd 8192 -pw_32: times 8 dw 32 -pw_16: times 8 dw 16 - -%macro TRANSFORM_COEFFS 2 -pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 -pw_m%2_%1: dw -%2, %1, -%2, %1, -%2, %1, -%2, %1 -pw_m%1_m%2: dw -%1, -%2, -%1, -%2, -%1, -%2, -%1, -%2 -%endmacro - -TRANSFORM_COEFFS 6270, 15137 -TRANSFORM_COEFFS 3196, 16069 -TRANSFORM_COEFFS 13623, 9102 - -; constants for 32x32_34 -TRANSFORM_COEFFS 804, 16364 -TRANSFORM_COEFFS 15426, 5520 -TRANSFORM_COEFFS 3981, 15893 -TRANSFORM_COEFFS 16207, 2404 -TRANSFORM_COEFFS 1606, 16305 -TRANSFORM_COEFFS 15679, 4756 -TRANSFORM_COEFFS 11585, 11585 - -; constants for 32x32_1024 -TRANSFORM_COEFFS 12140, 11003 -TRANSFORM_COEFFS 7005, 14811 -TRANSFORM_COEFFS 14053, 8423 -TRANSFORM_COEFFS 9760, 13160 -TRANSFORM_COEFFS 12665, 10394 -TRANSFORM_COEFFS 7723, 14449 - -%macro PAIR_PP_COEFFS 2 -dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2 -%endmacro - -%macro PAIR_MP_COEFFS 2 -dpw_m%1_%2: dw -%1, 
-%1, -%1, -%1, %2, %2, %2, %2 -%endmacro - -%macro PAIR_MM_COEFFS 2 -dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2 -%endmacro - -PAIR_PP_COEFFS 30274, 12540 -PAIR_PP_COEFFS 6392, 32138 -PAIR_MP_COEFFS 18204, 27246 - -PAIR_PP_COEFFS 12540, 12540 -PAIR_PP_COEFFS 30274, 30274 -PAIR_PP_COEFFS 6392, 6392 -PAIR_PP_COEFFS 32138, 32138 -PAIR_MM_COEFFS 18204, 18204 -PAIR_PP_COEFFS 27246, 27246 - -SECTION .text - -%if ARCH_X86_64 -%macro SUM_SUB 3 - psubw m%3, m%1, m%2 - paddw m%1, m%2 - SWAP %2, %3 -%endmacro - -; butterfly operation -%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2 - pmaddwd m%1, m%3, %5 - pmaddwd m%2, m%3, %6 - paddd m%1, %4 - paddd m%2, %4 - psrad m%1, 14 - psrad m%2, 14 -%endmacro - -%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 - punpckhwd m%6, m%2, m%1 - MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_%3_%4] - punpcklwd m%2, m%1 - MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_%3_%4] - packssdw m%1, m%7 - packssdw m%2, m%6 -%endmacro - -%macro BUTTERFLY_4Xmm 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 - punpckhwd m%6, m%2, m%1 - MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_m%3_m%4] - punpcklwd m%2, m%1 - MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_m%3_m%4] - packssdw m%1, m%7 - packssdw m%2, m%6 -%endmacro - -; matrix transpose -%macro INTERLEAVE_2X 4 - punpckh%1 m%4, m%2, m%3 - punpckl%1 m%2, m%3 - SWAP %3, %4 -%endmacro - -%macro TRANSPOSE8X8 9 - INTERLEAVE_2X wd, %1, %2, %9 - INTERLEAVE_2X wd, %3, %4, %9 - INTERLEAVE_2X wd, %5, %6, %9 - INTERLEAVE_2X wd, %7, %8, %9 - - INTERLEAVE_2X dq, %1, %3, %9 - INTERLEAVE_2X dq, %2, %4, %9 - INTERLEAVE_2X dq, %5, %7, %9 - INTERLEAVE_2X dq, %6, %8, %9 - - INTERLEAVE_2X qdq, %1, %5, %9 - INTERLEAVE_2X qdq, %3, %7, %9 - INTERLEAVE_2X qdq, %2, %6, %9 - INTERLEAVE_2X qdq, %4, %8, %9 - - SWAP %2, %5 - SWAP %4, %7 -%endmacro - -%macro IDCT8_1D 0 - SUM_SUB 0, 4, 9 - BUTTERFLY_4X 2, 6, 6270, 15137, m8, 9, 10 - pmulhrsw m0, m12 - pmulhrsw m4, m12 - BUTTERFLY_4X 1, 7, 3196, 16069, m8, 9, 10 - BUTTERFLY_4X 5, 3, 13623, 9102, m8, 9, 10 - - SUM_SUB 1, 5, 9 - SUM_SUB 7, 3, 9 - SUM_SUB 0, 6, 9 - SUM_SUB 4, 2, 9 - SUM_SUB 3, 5, 9 - pmulhrsw m3, m12 - pmulhrsw m5, m12 - - SUM_SUB 0, 7, 9 - SUM_SUB 4, 3, 9 - SUM_SUB 2, 5, 9 - SUM_SUB 6, 1, 9 - - SWAP 3, 6 - SWAP 1, 4 -%endmacro - -; This macro handles 8 pixels per line -%macro ADD_STORE_8P_2X 5; src1, src2, tmp1, tmp2, zero - paddw m%1, m11 - paddw m%2, m11 - psraw m%1, 5 - psraw m%2, 5 - - movh m%3, [outputq] - movh m%4, [outputq + strideq] - punpcklbw m%3, m%5 - punpcklbw m%4, m%5 - paddw m%3, m%1 - paddw m%4, m%2 - packuswb m%3, m%5 - packuswb m%4, m%5 - movh [outputq], m%3 - movh [outputq + strideq], m%4 -%endmacro - -INIT_XMM ssse3 -; full inverse 8x8 2D-DCT transform -cglobal idct8x8_64_add, 3, 5, 13, input, output, stride - mova m8, [pd_8192] - mova m11, [pw_16] - mova m12, [pw_11585x2] - - lea r3, [2 * strideq] -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [inputq + 0] - packssdw m0, [inputq + 16] - mova m1, [inputq + 32] - packssdw m1, [inputq + 48] - mova m2, [inputq + 64] - packssdw m2, [inputq + 80] - mova m3, [inputq + 96] - packssdw m3, [inputq + 112] - mova m4, [inputq + 128] - packssdw m4, [inputq + 144] - mova m5, [inputq + 160] - packssdw m5, [inputq + 176] - mova m6, [inputq + 192] - packssdw m6, [inputq + 208] - mova m7, [inputq + 224] - packssdw m7, [inputq + 240] -%else - mova m0, [inputq + 0] - mova m1, [inputq + 16] - mova m2, [inputq + 32] - mova m3, [inputq + 48] - mova m4, [inputq + 64] - mova m5, [inputq + 80] - mova m6, [inputq + 96] - mova 
m7, [inputq + 112] -%endif - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - IDCT8_1D - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - IDCT8_1D - - pxor m12, m12 - ADD_STORE_8P_2X 0, 1, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 2, 3, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 4, 5, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 6, 7, 9, 10, 12 - - RET - -; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero -cglobal idct8x8_12_add, 3, 5, 13, input, output, stride - mova m8, [pd_8192] - mova m11, [pw_16] - mova m12, [pw_11585x2] - - lea r3, [2 * strideq] - -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [inputq + 0] - packssdw m0, [inputq + 16] - mova m1, [inputq + 32] - packssdw m1, [inputq + 48] - mova m2, [inputq + 64] - packssdw m2, [inputq + 80] - mova m3, [inputq + 96] - packssdw m3, [inputq + 112] -%else - mova m0, [inputq + 0] - mova m1, [inputq + 16] - mova m2, [inputq + 32] - mova m3, [inputq + 48] -%endif - - punpcklwd m0, m1 - punpcklwd m2, m3 - punpckhdq m9, m0, m2 - punpckldq m0, m2 - SWAP 2, 9 - - ; m0 -> [0], [0] - ; m1 -> [1], [1] - ; m2 -> [2], [2] - ; m3 -> [3], [3] - punpckhqdq m10, m0, m0 - punpcklqdq m0, m0 - punpckhqdq m9, m2, m2 - punpcklqdq m2, m2 - SWAP 1, 10 - SWAP 3, 9 - - pmulhrsw m0, m12 - pmulhrsw m2, [dpw_30274_12540] - pmulhrsw m1, [dpw_6392_32138] - pmulhrsw m3, [dpw_m18204_27246] - - SUM_SUB 0, 2, 9 - SUM_SUB 1, 3, 9 - - punpcklqdq m9, m3, m3 - punpckhqdq m5, m3, m9 - - SUM_SUB 3, 5, 9 - punpckhqdq m5, m3 - pmulhrsw m5, m12 - - punpckhqdq m9, m1, m5 - punpcklqdq m1, m5 - SWAP 5, 9 - - SUM_SUB 0, 5, 9 - SUM_SUB 2, 1, 9 - - punpckhqdq m3, m0, m0 - punpckhqdq m4, m1, m1 - punpckhqdq m6, m5, m5 - punpckhqdq m7, m2, m2 - - punpcklwd m0, m3 - punpcklwd m7, m2 - punpcklwd m1, m4 - punpcklwd m6, m5 - - punpckhdq m4, m0, m7 - punpckldq m0, m7 - punpckhdq m10, m1, m6 - punpckldq m5, m1, m6 - - punpckhqdq m1, m0, m5 - punpcklqdq m0, m5 - punpckhqdq m3, m4, m10 - punpcklqdq m2, m4, m10 - - - pmulhrsw m0, m12 - pmulhrsw m6, m2, [dpw_30274_30274] - pmulhrsw m4, m2, [dpw_12540_12540] - - pmulhrsw m7, m1, [dpw_32138_32138] - pmulhrsw m1, [dpw_6392_6392] - pmulhrsw m5, m3, [dpw_m18204_m18204] - pmulhrsw m3, [dpw_27246_27246] - - mova m2, m0 - SUM_SUB 0, 6, 9 - SUM_SUB 2, 4, 9 - SUM_SUB 1, 5, 9 - SUM_SUB 7, 3, 9 - - SUM_SUB 3, 5, 9 - pmulhrsw m3, m12 - pmulhrsw m5, m12 - - SUM_SUB 0, 7, 9 - SUM_SUB 2, 3, 9 - SUM_SUB 4, 5, 9 - SUM_SUB 6, 1, 9 - - SWAP 3, 6 - SWAP 1, 2 - SWAP 2, 4 - - - pxor m12, m12 - ADD_STORE_8P_2X 0, 1, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 2, 3, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 4, 5, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 6, 7, 9, 10, 12 - - RET - -%define idx0 16 * 0 -%define idx1 16 * 1 -%define idx2 16 * 2 -%define idx3 16 * 3 -%define idx4 16 * 4 -%define idx5 16 * 5 -%define idx6 16 * 6 -%define idx7 16 * 7 -%define idx8 16 * 0 -%define idx9 16 * 1 -%define idx10 16 * 2 -%define idx11 16 * 3 -%define idx12 16 * 4 -%define idx13 16 * 5 -%define idx14 16 * 6 -%define idx15 16 * 7 -%define idx16 16 * 0 -%define idx17 16 * 1 -%define idx18 16 * 2 -%define idx19 16 * 3 -%define idx20 16 * 4 -%define idx21 16 * 5 -%define idx22 16 * 6 -%define idx23 16 * 7 -%define idx24 16 * 0 -%define idx25 16 * 1 -%define idx26 16 * 2 -%define idx27 16 * 3 -%define idx28 16 * 4 -%define idx29 16 * 5 -%define idx30 16 * 6 -%define idx31 16 * 7 - -; FROM idct32x32_add_neon.asm -; -; Instead of doing the transforms stage by stage, it is done by loading -; some input values and 
doing as many stages as possible to minimize the -; storing/loading of intermediate results. To fit within registers, the -; final coefficients are cut into four blocks: -; BLOCK A: 16-19,28-31 -; BLOCK B: 20-23,24-27 -; BLOCK C: 8-11,12-15 -; BLOCK D: 0-3,4-7 -; Blocks A and C are straight calculation through the various stages. In -; block B, further calculations are performed using the results from -; block A. In block D, further calculations are performed using the results -; from block C and then the final calculations are done using results from -; block A and B which have been combined at the end of block B. -; - -%macro IDCT32X32_34 4 - ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, m1 - pmulhrsw m1, [pw___804x2] ; stp1_16 - mova [r4 + 0], m0 - pmulhrsw m11, [pw_16364x2] ; stp2_31 - mova [r4 + 16 * 2], m2 - mova m12, m7 - pmulhrsw m7, [pw_15426x2] ; stp1_28 - mova [r4 + 16 * 4], m4 - pmulhrsw m12, [pw_m5520x2] ; stp2_19 - mova [r4 + 16 * 6], m6 - - ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m2, m1 ; stp1_16 - mova m0, m11 ; stp1_31 - mova m4, m7 ; stp1_28 - mova m15, m12 ; stp1_19 - - ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30 - BUTTERFLY_4Xmm 4, 15, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18 - - ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 - SUM_SUB 0, 15, 9 ; stp2_17, stp2_18 - SUM_SUB 11, 7, 9 ; stp2_31, stp2_28 - SUM_SUB 2, 4, 9 ; stp2_30, stp2_29 - - ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 4, 15, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 - BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 - - ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m6, m5 - pmulhrsw m5, [pw__3981x2] ; stp1_20 - mova [stp + %4 + idx28], m12 - mova [stp + %4 + idx29], m15 - pmulhrsw m6, [pw_15893x2] ; stp2_27 - mova [stp + %4 + idx30], m2 - mova m2, m3 - pmulhrsw m3, [pw_m2404x2] ; stp1_23 - mova [stp + %4 + idx31], m11 - pmulhrsw m2, [pw_16207x2] ; stp2_24 - - ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m13, m5 ; stp1_20 - mova m14, m6 ; stp1_27 - mova m15, m3 ; stp1_23 - mova m11, m2 ; stp1_24 - - ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 - BUTTERFLY_4Xmm 11, 15, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 - - ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 - SUM_SUB 15, 14, 9 ; stp2_22, stp2_21 - SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 - SUM_SUB 11, 13, 9 ; stp2_25, stp2_26 - - ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 - BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 - - ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 3, 9 ; stp2_16, stp2_23 - SUM_SUB 0, 15, 9 ; stp2_17, stp2_22 - SUM_SUB 4, 14, 9 ; stp2_18, stp2_21 - SUM_SUB 7, 5, 9 ; stp2_19, stp2_20 - mova [stp + %3 + idx16], m1 - mova [stp + %3 + idx17], m0 - mova [stp + %3 + idx18], m4 - mova [stp + %3 + idx19], m7 - - mova m4, [stp + %4 + idx28] - mova m7, [stp + %4 + idx29] - mova m10, [stp + %4 + idx30] - mova m12, [stp + %4 + idx31] - 
SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 - SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 - SUM_SUB 10, 11, 9 ; stp2_30, stp2_25 - SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 - mova [stp + %4 + idx28], m4 - mova [stp + %4 + idx29], m7 - mova [stp + %4 + idx30], m10 - mova [stp + %4 + idx31], m12 - - ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 6, 5, 9 - pmulhrsw m6, m10 ; stp1_27 - pmulhrsw m5, m10 ; stp1_20 - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_26 - pmulhrsw m14, m10 ; stp1_21 - SUM_SUB 11, 15, 9 - pmulhrsw m11, m10 ; stp1_25 - pmulhrsw m15, m10 ; stp1_22 - SUM_SUB 2, 3, 9 - pmulhrsw m2, m10 ; stp1_24 - pmulhrsw m3, m10 ; stp1_23 -%else - BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27 - SWAP 6, 5 - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26 - SWAP 13, 14 - BUTTERFLY_4X 11, 15, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25 - SWAP 11, 15 - BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24 - SWAP 2, 3 -%endif - - mova [stp + %4 + idx24], m2 - mova [stp + %4 + idx25], m11 - mova [stp + %4 + idx26], m13 - mova [stp + %4 + idx27], m6 - - ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 2] - mova m6, [rsp + transposed_in + 16 * 6] - - mova m1, m0 - pmulhrsw m0, [pw__1606x2] ; stp1_8 - mova [stp + %3 + idx20], m5 - mova [stp + %3 + idx21], m14 - pmulhrsw m1, [pw_16305x2] ; stp2_15 - mova [stp + %3 + idx22], m15 - mova m7, m6 - pmulhrsw m7, [pw_m4756x2] ; stp2_11 - mova [stp + %3 + idx23], m3 - pmulhrsw m6, [pw_15679x2] ; stp1_12 - - ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m3, m0 ; stp1_8 - mova m2, m1 ; stp1_15 - - ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14 - mova m4, m7 ; stp1_11 - mova m5, m6 ; stp1_12 - BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10 - - ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 7, 9 ; stp1_8, stp1_11 - SUM_SUB 2, 4, 9 ; stp1_9, stp1_10 - SUM_SUB 1, 6, 9 ; stp1_15, stp1_12 - SUM_SUB 3, 5, 9 ; stp1_14, stp1_13 - - ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 5, 4, 9 - pmulhrsw m5, m10 ; stp1_13 - pmulhrsw m4, m10 ; stp1_10 - SUM_SUB 6, 7, 9 - pmulhrsw m6, m10 ; stp1_12 - pmulhrsw m7, m10 ; stp1_11 -%else - BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13 - SWAP 5, 4 - BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12 - SWAP 6, 7 -%endif - - ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova [stp + %2 + idx8], m0 - mova [stp + %2 + idx9], m2 - mova [stp + %2 + idx10], m4 - mova [stp + %2 + idx11], m7 - - ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, [rsp + transposed_in + 16 * 4] - mova m12, m11 - pmulhrsw m11, [pw__3196x2] ; stp1_4 - pmulhrsw m12, [pw_16069x2] ; stp1_7 - - ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in 
+ 16 * 0] - mova m10, [pw_11585x2] - pmulhrsw m0, m10 ; stp1_1 - - mova m14, m11 ; stp1_4 - mova m13, m12 ; stp1_7 - - ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_6 - pmulhrsw m14, m10 ; stp1_5 -%else - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6 - SWAP 13, 14 -%endif - mova m7, m0 ; stp1_0 = stp1_1 - mova m4, m0 ; stp1_1 - mova m2, m7 ; stp1_0 - - ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 12, 9 ; stp1_0, stp1_7 - SUM_SUB 7, 13, 9 ; stp1_1, stp1_6 - SUM_SUB 2, 14, 9 ; stp1_2, stp1_5 - SUM_SUB 4, 11, 9 ; stp1_3, stp1_4 - - ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 1, 9 ; stp1_0, stp1_15 - SUM_SUB 7, 3, 9 ; stp1_1, stp1_14 - SUM_SUB 2, 5, 9 ; stp1_2, stp1_13 - SUM_SUB 4, 6, 9 ; stp1_3, stp1_12 - - ; 0-3, 28-31 final stage - mova m15, [stp + %4 + idx30] - mova m10, [stp + %4 + idx31] - SUM_SUB 0, 10, 9 ; stp1_0, stp1_31 - SUM_SUB 7, 15, 9 ; stp1_1, stp1_30 - mova [stp + %1 + idx0], m0 - mova [stp + %1 + idx1], m7 - mova [stp + %4 + idx30], m15 - mova [stp + %4 + idx31], m10 - mova m7, [stp + %4 + idx28] - mova m0, [stp + %4 + idx29] - SUM_SUB 2, 0, 9 ; stp1_2, stp1_29 - SUM_SUB 4, 7, 9 ; stp1_3, stp1_28 - mova [stp + %1 + idx2], m2 - mova [stp + %1 + idx3], m4 - mova [stp + %4 + idx28], m7 - mova [stp + %4 + idx29], m0 - - ; 12-15, 16-19 final stage - mova m0, [stp + %3 + idx16] - mova m7, [stp + %3 + idx17] - mova m2, [stp + %3 + idx18] - mova m4, [stp + %3 + idx19] - SUM_SUB 1, 0, 9 ; stp1_15, stp1_16 - SUM_SUB 3, 7, 9 ; stp1_14, stp1_17 - SUM_SUB 5, 2, 9 ; stp1_13, stp1_18 - SUM_SUB 6, 4, 9 ; stp1_12, stp1_19 - mova [stp + %2 + idx12], m6 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m3 - mova [stp + %2 + idx15], m1 - mova [stp + %3 + idx16], m0 - mova [stp + %3 + idx17], m7 - mova [stp + %3 + idx18], m2 - mova [stp + %3 + idx19], m4 - - mova m4, [stp + %2 + idx8] - mova m5, [stp + %2 + idx9] - mova m6, [stp + %2 + idx10] - mova m7, [stp + %2 + idx11] - SUM_SUB 11, 7, 9 ; stp1_4, stp1_11 - SUM_SUB 14, 6, 9 ; stp1_5, stp1_10 - SUM_SUB 13, 5, 9 ; stp1_6, stp1_9 - SUM_SUB 12, 4, 9 ; stp1_7, stp1_8 - - ; 4-7, 24-27 final stage - mova m0, [stp + %4 + idx27] - mova m1, [stp + %4 + idx26] - mova m2, [stp + %4 + idx25] - mova m3, [stp + %4 + idx24] - SUM_SUB 11, 0, 9 ; stp1_4, stp1_27 - SUM_SUB 14, 1, 9 ; stp1_5, stp1_26 - SUM_SUB 13, 2, 9 ; stp1_6, stp1_25 - SUM_SUB 12, 3, 9 ; stp1_7, stp1_24 - mova [stp + %4 + idx27], m0 - mova [stp + %4 + idx26], m1 - mova [stp + %4 + idx25], m2 - mova [stp + %4 + idx24], m3 - mova [stp + %1 + idx4], m11 - mova [stp + %1 + idx5], m14 - mova [stp + %1 + idx6], m13 - mova [stp + %1 + idx7], m12 - - ; 8-11, 20-23 final stage - mova m0, [stp + %3 + idx20] - mova m1, [stp + %3 + idx21] - mova m2, [stp + %3 + idx22] - mova m3, [stp + %3 + idx23] - SUM_SUB 7, 0, 9 ; stp1_11, stp_20 - SUM_SUB 6, 1, 9 ; stp1_10, stp_21 - SUM_SUB 5, 2, 9 ; stp1_9, stp_22 - SUM_SUB 4, 3, 9 ; stp1_8, stp_23 - mova [stp + %2 + idx8], m4 - mova [stp + %2 + idx9], m5 - mova [stp + %2 + idx10], m6 - mova [stp + %2 + idx11], m7 - mova [stp + %3 + idx20], m0 - mova [stp + %3 + idx21], m1 - mova [stp + %3 + idx22], m2 - mova [stp + %3 + idx23], m3 -%endmacro - -%macro RECON_AND_STORE 1 - mova m11, [pw_32] - lea stp, [rsp + %1] - mov r6, 32 - pxor m8, m8 -%%recon_and_store: - mova m0, [stp + 16 * 32 * 0] - mova m1, [stp + 16 * 32 * 1] 
- mova m2, [stp + 16 * 32 * 2] - mova m3, [stp + 16 * 32 * 3] - add stp, 16 - - paddw m0, m11 - paddw m1, m11 - paddw m2, m11 - paddw m3, m11 - psraw m0, 6 - psraw m1, 6 - psraw m2, 6 - psraw m3, 6 - movh m4, [outputq + 0] - movh m5, [outputq + 8] - movh m6, [outputq + 16] - movh m7, [outputq + 24] - punpcklbw m4, m8 - punpcklbw m5, m8 - punpcklbw m6, m8 - punpcklbw m7, m8 - paddw m0, m4 - paddw m1, m5 - paddw m2, m6 - paddw m3, m7 - packuswb m0, m1 - packuswb m2, m3 - mova [outputq + 0], m0 - mova [outputq + 16], m2 - lea outputq, [outputq + strideq] - dec r6 - jnz %%recon_and_store -%endmacro - -%define i32x32_size 16*32*5 -%define pass_two_start 16*32*0 -%define transposed_in 16*32*4 -%define pass_one_start 16*32*0 -%define stp r8 - -INIT_XMM ssse3 -cglobal idct32x32_34_add, 3, 11, 16, i32x32_size, input, output, stride - mova m8, [pd_8192] - lea stp, [rsp + pass_one_start] - -idct32x32_34: - mov r3, inputq - lea r4, [rsp + transposed_in] - -idct32x32_34_transpose: -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [r3 + 0] - packssdw m0, [r3 + 16] - mova m1, [r3 + 32 * 4] - packssdw m1, [r3 + 32 * 4 + 16] - mova m2, [r3 + 32 * 8] - packssdw m2, [r3 + 32 * 8 + 16] - mova m3, [r3 + 32 * 12] - packssdw m3, [r3 + 32 * 12 + 16] - mova m4, [r3 + 32 * 16] - packssdw m4, [r3 + 32 * 16 + 16] - mova m5, [r3 + 32 * 20] - packssdw m5, [r3 + 32 * 20 + 16] - mova m6, [r3 + 32 * 24] - packssdw m6, [r3 + 32 * 24 + 16] - mova m7, [r3 + 32 * 28] - packssdw m7, [r3 + 32 * 28 + 16] -%else - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 4] - mova m2, [r3 + 16 * 8] - mova m3, [r3 + 16 * 12] - mova m4, [r3 + 16 * 16] - mova m5, [r3 + 16 * 20] - mova m6, [r3 + 16 * 24] - mova m7, [r3 + 16 * 28] -%endif - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - IDCT32X32_34 16*0, 16*32, 16*64, 16*96 - lea stp, [stp + 16 * 8] - mov r6, 4 - lea stp, [rsp + pass_one_start] - lea r9, [rsp + pass_one_start] - -idct32x32_34_2: - lea r4, [rsp + transposed_in] - mov r3, r9 - -idct32x32_34_transpose_2: - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 1] - mova m2, [r3 + 16 * 2] - mova m3, [r3 + 16 * 3] - mova m4, [r3 + 16 * 4] - mova m5, [r3 + 16 * 5] - mova m6, [r3 + 16 * 6] - mova m7, [r3 + 16 * 7] - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - IDCT32X32_34 16*0, 16*8, 16*16, 16*24 - - lea stp, [stp + 16 * 32] - add r9, 16 * 32 - dec r6 - jnz idct32x32_34_2 - - RECON_AND_STORE pass_two_start - - RET - -%macro IDCT32X32_135 4 - ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m1, [rsp + transposed_in + 16 * 1] - mova m11, m1 - pmulhrsw m1, [pw___804x2] ; stp1_16 - pmulhrsw m11, [pw_16364x2] ; stp2_31 - - mova m7, [rsp + transposed_in + 16 * 7] - mova m12, m7 - pmulhrsw m7, [pw_15426x2] ; stp1_28 - pmulhrsw m12, [pw_m5520x2] ; stp2_19 - - mova m3, [rsp + transposed_in + 16 * 9] - mova m4, m3 - pmulhrsw m3, [pw__7005x2] ; stp1_18 - pmulhrsw m4, [pw_14811x2] ; stp2_29 - - mova m0, [rsp + transposed_in + 16 * 15] - mova m2, m0 - pmulhrsw m0, [pw_12140x2] ; stp1_30 - pmulhrsw m2, [pw_m11003x2] ; stp2_17 - - ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 2, 9 ; stp2_16, stp2_17 - SUM_SUB 12, 3, 9 ; stp2_19, stp2_18 - SUM_SUB 7, 4, 9 ; stp2_28, stp2_29 - SUM_SUB 11, 0, 9 ; stp2_31, stp2_30 - - ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30 - BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18 - - ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - 
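Before the macro body continues, it is worth pinning down what the RECON_AND_STORE loop defined above computes per pixel: the inverse-transform output is rounded by the second-pass shift (add 32, arithmetic shift right by 6), added to the predictor already sitting in the destination, and saturated to 8 bits by packuswb. A scalar sketch, with clip_pixel being my name for the clamp:

    #include <stddef.h>
    #include <stdint.h>

    static uint8_t clip_pixel(int v) {
      return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    /* Scalar equivalent of RECON_AND_STORE over a 32x32 block. */
    static void recon_and_store(uint8_t *dst, ptrdiff_t stride,
                                const int16_t *resid) {
      for (int r = 0; r < 32; ++r) {
        for (int c = 0; c < 32; ++c)                 /* pw_32, psraw 6 */
          dst[c] = clip_pixel(dst[c] + ((resid[c] + 32) >> 6));
        dst += stride;
        resid += 32;
      }
    }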
SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 - SUM_SUB 0, 3, 9 ; stp2_17, stp2_18 - SUM_SUB 11, 7, 9 ; stp2_31, stp2_28 - SUM_SUB 2, 4, 9 ; stp2_30, stp2_29 - - ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 - BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 - - mova [stp + %3 + idx16], m1 - mova [stp + %3 + idx17], m0 - mova [stp + %3 + idx18], m4 - mova [stp + %3 + idx19], m7 - mova [stp + %4 + idx28], m12 - mova [stp + %4 + idx29], m3 - mova [stp + %4 + idx30], m2 - mova [stp + %4 + idx31], m11 - - ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m2, [rsp + transposed_in + 16 * 3] - mova m3, m2 - pmulhrsw m3, [pw_m2404x2] ; stp1_23 - pmulhrsw m2, [pw_16207x2] ; stp2_24 - - mova m5, [rsp + transposed_in + 16 * 5] - mova m6, m5 - pmulhrsw m5, [pw__3981x2] ; stp1_20 - pmulhrsw m6, [pw_15893x2] ; stp2_27 - - mova m14, [rsp + transposed_in + 16 * 11] - mova m13, m14 - pmulhrsw m13, [pw_m8423x2] ; stp1_21 - pmulhrsw m14, [pw_14053x2] ; stp2_26 - - mova m0, [rsp + transposed_in + 16 * 13] - mova m1, m0 - pmulhrsw m0, [pw__9760x2] ; stp1_22 - pmulhrsw m1, [pw_13160x2] ; stp2_25 - - ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 5, 13, 9 ; stp2_20, stp2_21 - SUM_SUB 3, 0, 9 ; stp2_23, stp2_22 - SUM_SUB 2, 1, 9 ; stp2_24, stp2_25 - SUM_SUB 6, 14, 9 ; stp2_27, stp2_26 - - ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 - BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 - - ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 - SUM_SUB 0, 14, 9 ; stp2_22, stp2_21 - SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 - SUM_SUB 1, 13, 9 ; stp2_25, stp2_26 - - ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 - BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 - - ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m4, [stp + %3 + idx16] - mova m7, [stp + %3 + idx17] - mova m11, [stp + %3 + idx18] - mova m12, [stp + %3 + idx19] - SUM_SUB 4, 3, 9 ; stp2_16, stp2_23 - SUM_SUB 7, 0, 9 ; stp2_17, stp2_22 - SUM_SUB 11, 14, 9 ; stp2_18, stp2_21 - SUM_SUB 12, 5, 9 ; stp2_19, stp2_20 - mova [stp + %3 + idx16], m4 - mova [stp + %3 + idx17], m7 - mova [stp + %3 + idx18], m11 - mova [stp + %3 + idx19], m12 - - mova m4, [stp + %4 + idx28] - mova m7, [stp + %4 + idx29] - mova m11, [stp + %4 + idx30] - mova m12, [stp + %4 + idx31] - SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 - SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 - SUM_SUB 11, 1, 9 ; stp2_30, stp2_25 - SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 - mova [stp + %4 + idx28], m4 - mova [stp + %4 + idx29], m7 - mova [stp + %4 + idx30], m11 - mova [stp + %4 + idx31], m12 - - ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 6, 5, 9 - pmulhrsw m6, m10 ; stp1_27 - pmulhrsw m5, m10 ; stp1_20 - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_26 - pmulhrsw m14, m10 ; stp1_21 - SUM_SUB 1, 0, 9 - pmulhrsw m1, m10 ; stp1_25 - pmulhrsw m0, m10 ; stp1_22 - SUM_SUB 2, 3, 9 - pmulhrsw m2, m10 ; stp1_25 - pmulhrsw m3, m10 ; stp1_22 -%else - BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27 - SWAP 6, 5 - 
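The stage-1 blocks of this 135-coefficient macro (BLOCK A above and BLOCK B here) exploit the sparsity of the coefficient block: the partner input of each opening butterfly is known to be zero, so the rotation degenerates into two independent multiplies, done with pmulhrsw against tables that store the coefficient doubled. A scalar sketch of the identity, with my own helper name:

    #include <stdint.h>

    /* pmulhrsw x, [pw_cx2]  ==  ROUND(x * c / 2^14), because
     * (x * 2c + 2^14) >> 15  ==  (x * c + 2^13) >> 14.           */
    static int16_t mulhrs_x2(int16_t x, int16_t c_times_2) {
      return (int16_t)(((int32_t)x * c_times_2 + (1 << 14)) >> 15);
    }

Read that way, pmulhrsw m1, [pw___804x2] above is mulhrs_x2(in1, 2 * 804), its companion uses 2 * 16364, and the pw_m*x2 tables are the same trick with negated coefficients.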
BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26 - SWAP 13, 14 - BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25 - SWAP 1, 0 - BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24 - SWAP 2, 3 -%endif - mova [stp + %3 + idx20], m5 - mova [stp + %3 + idx21], m14 - mova [stp + %3 + idx22], m0 - mova [stp + %3 + idx23], m3 - mova [stp + %4 + idx24], m2 - mova [stp + %4 + idx25], m1 - mova [stp + %4 + idx26], m13 - mova [stp + %4 + idx27], m6 - - ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 2] - mova m1, m0 - pmulhrsw m0, [pw__1606x2] ; stp1_8 - pmulhrsw m1, [pw_16305x2] ; stp2_15 - - mova m6, [rsp + transposed_in + 16 * 6] - mova m7, m6 - pmulhrsw m7, [pw_m4756x2] ; stp2_11 - pmulhrsw m6, [pw_15679x2] ; stp1_12 - - mova m4, [rsp + transposed_in + 16 * 10] - mova m5, m4 - pmulhrsw m4, [pw__7723x2] ; stp1_10 - pmulhrsw m5, [pw_14449x2] ; stp2_13 - - mova m2, [rsp + transposed_in + 16 * 14] - mova m3, m2 - pmulhrsw m3, [pw_m10394x2] ; stp1_9 - pmulhrsw m2, [pw_12665x2] ; stp2_14 - - ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 3, 9 ; stp1_8, stp1_9 - SUM_SUB 7, 4, 9 ; stp1_11, stp1_10 - SUM_SUB 6, 5, 9 ; stp1_12, stp1_13 - SUM_SUB 1, 2, 9 ; stp1_15, stp1_14 - - ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14 - BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10 - - ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 7, 9 ; stp1_8, stp1_11 - SUM_SUB 2, 4, 9 ; stp1_9, stp1_10 - SUM_SUB 1, 6, 9 ; stp1_15, stp1_12 - SUM_SUB 3, 5, 9 ; stp1_14, stp1_13 - - ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 5, 4, 9 - pmulhrsw m5, m10 ; stp1_13 - pmulhrsw m4, m10 ; stp1_10 - SUM_SUB 6, 7, 9 - pmulhrsw m6, m10 ; stp1_12 - pmulhrsw m7, m10 ; stp1_11 -%else - BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13 - SWAP 5, 4 - BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12 - SWAP 6, 7 -%endif - ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova [stp + %2 + idx8], m0 - mova [stp + %2 + idx9], m2 - mova [stp + %2 + idx10], m4 - mova [stp + %2 + idx11], m7 - mova [stp + %2 + idx12], m6 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m3 - mova [stp + %2 + idx15], m1 - - ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, [rsp + transposed_in + 16 * 4] - mova m12, m11 - pmulhrsw m11, [pw__3196x2] ; stp1_4 - pmulhrsw m12, [pw_16069x2] ; stp1_7 - - mova m13, [rsp + transposed_in + 16 * 12] - mova m14, m13 - pmulhrsw m13, [pw_13623x2] ; stp1_6 - pmulhrsw m14, [pw_m9102x2] ; stp1_5 - - ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 0] - mova m2, [rsp + transposed_in + 16 * 8] - pmulhrsw m0, [pw_11585x2] ; stp1_1 - mova m3, m2 - pmulhrsw m2, [pw__6270x2] ; stp1_2 - pmulhrsw m3, [pw_15137x2] ; stp1_3 - - SUM_SUB 11, 14, 9 ; stp1_4, stp1_5 - SUM_SUB 12, 13, 9 ; stp1_7, stp1_6 - - ; BLOCK D 
STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_6 - pmulhrsw m14, m10 ; stp1_5 -%else - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6 - SWAP 13, 14 -%endif - mova m1, m0 ; stp1_0 = stp1_1 - SUM_SUB 0, 3, 9 ; stp1_0, stp1_3 - SUM_SUB 1, 2, 9 ; stp1_1, stp1_2 - - ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 12, 9 ; stp1_0, stp1_7 - SUM_SUB 1, 13, 9 ; stp1_1, stp1_6 - SUM_SUB 2, 14, 9 ; stp1_2, stp1_5 - SUM_SUB 3, 11, 9 ; stp1_3, stp1_4 - - ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m4, [stp + %2 + idx12] - mova m5, [stp + %2 + idx13] - mova m6, [stp + %2 + idx14] - mova m7, [stp + %2 + idx15] - SUM_SUB 0, 7, 9 ; stp1_0, stp1_15 - SUM_SUB 1, 6, 9 ; stp1_1, stp1_14 - SUM_SUB 2, 5, 9 ; stp1_2, stp1_13 - SUM_SUB 3, 4, 9 ; stp1_3, stp1_12 - - ; 0-3, 28-31 final stage - mova m10, [stp + %4 + idx31] - mova m15, [stp + %4 + idx30] - SUM_SUB 0, 10, 9 ; stp1_0, stp1_31 - SUM_SUB 1, 15, 9 ; stp1_1, stp1_30 - mova [stp + %1 + idx0], m0 - mova [stp + %1 + idx1], m1 - mova [stp + %4 + idx31], m10 - mova [stp + %4 + idx30], m15 - mova m0, [stp + %4 + idx29] - mova m1, [stp + %4 + idx28] - SUM_SUB 2, 0, 9 ; stp1_2, stp1_29 - SUM_SUB 3, 1, 9 ; stp1_3, stp1_28 - mova [stp + %1 + idx2], m2 - mova [stp + %1 + idx3], m3 - mova [stp + %4 + idx29], m0 - mova [stp + %4 + idx28], m1 - - ; 12-15, 16-19 final stage - mova m0, [stp + %3 + idx16] - mova m1, [stp + %3 + idx17] - mova m2, [stp + %3 + idx18] - mova m3, [stp + %3 + idx19] - SUM_SUB 7, 0, 9 ; stp1_15, stp1_16 - SUM_SUB 6, 1, 9 ; stp1_14, stp1_17 - SUM_SUB 5, 2, 9 ; stp1_13, stp1_18 - SUM_SUB 4, 3, 9 ; stp1_12, stp1_19 - mova [stp + %2 + idx12], m4 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m6 - mova [stp + %2 + idx15], m7 - mova [stp + %3 + idx16], m0 - mova [stp + %3 + idx17], m1 - mova [stp + %3 + idx18], m2 - mova [stp + %3 + idx19], m3 - - mova m4, [stp + %2 + idx8] - mova m5, [stp + %2 + idx9] - mova m6, [stp + %2 + idx10] - mova m7, [stp + %2 + idx11] - SUM_SUB 11, 7, 9 ; stp1_4, stp1_11 - SUM_SUB 14, 6, 9 ; stp1_5, stp1_10 - SUM_SUB 13, 5, 9 ; stp1_6, stp1_9 - SUM_SUB 12, 4, 9 ; stp1_7, stp1_8 - - ; 4-7, 24-27 final stage - mova m3, [stp + %4 + idx24] - mova m2, [stp + %4 + idx25] - mova m1, [stp + %4 + idx26] - mova m0, [stp + %4 + idx27] - SUM_SUB 12, 3, 9 ; stp1_7, stp1_24 - SUM_SUB 13, 2, 9 ; stp1_6, stp1_25 - SUM_SUB 14, 1, 9 ; stp1_5, stp1_26 - SUM_SUB 11, 0, 9 ; stp1_4, stp1_27 - mova [stp + %4 + idx24], m3 - mova [stp + %4 + idx25], m2 - mova [stp + %4 + idx26], m1 - mova [stp + %4 + idx27], m0 - mova [stp + %1 + idx4], m11 - mova [stp + %1 + idx5], m14 - mova [stp + %1 + idx6], m13 - mova [stp + %1 + idx7], m12 - - ; 8-11, 20-23 final stage - mova m0, [stp + %3 + idx20] - mova m1, [stp + %3 + idx21] - mova m2, [stp + %3 + idx22] - mova m3, [stp + %3 + idx23] - SUM_SUB 7, 0, 9 ; stp1_11, stp_20 - SUM_SUB 6, 1, 9 ; stp1_10, stp_21 - SUM_SUB 5, 2, 9 ; stp1_9, stp_22 - SUM_SUB 4, 3, 9 ; stp1_8, stp_23 - mova [stp + %2 + idx8], m4 - mova [stp + %2 + idx9], m5 - mova [stp + %2 + idx10], m6 - mova [stp + %2 + idx11], m7 - mova [stp + %3 + idx20], m0 - mova [stp + %3 + idx21], m1 - mova [stp + %3 + idx22], m2 - mova [stp + %3 + idx23], m3 -%endmacro - -INIT_XMM ssse3 -cglobal idct32x32_135_add, 3, 11, 16, i32x32_size, input, output, stride - mova m8, [pd_8192] - mov r6, 2 - lea stp, 
[rsp + pass_one_start] - -idct32x32_135: - mov r3, inputq - lea r4, [rsp + transposed_in] - mov r7, 2 - -idct32x32_135_transpose: -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [r3 + 0] - packssdw m0, [r3 + 16] - mova m1, [r3 + 32 * 4] - packssdw m1, [r3 + 32 * 4 + 16] - mova m2, [r3 + 32 * 8] - packssdw m2, [r3 + 32 * 8 + 16] - mova m3, [r3 + 32 * 12] - packssdw m3, [r3 + 32 * 12 + 16] - mova m4, [r3 + 32 * 16] - packssdw m4, [r3 + 32 * 16 + 16] - mova m5, [r3 + 32 * 20] - packssdw m5, [r3 + 32 * 20 + 16] - mova m6, [r3 + 32 * 24] - packssdw m6, [r3 + 32 * 24 + 16] - mova m7, [r3 + 32 * 28] - packssdw m7, [r3 + 32 * 28 + 16] -%else - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 4] - mova m2, [r3 + 16 * 8] - mova m3, [r3 + 16 * 12] - mova m4, [r3 + 16 * 16] - mova m5, [r3 + 16 * 20] - mova m6, [r3 + 16 * 24] - mova m7, [r3 + 16 * 28] -%endif - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - mova [r4 + 0], m0 - mova [r4 + 16 * 1], m1 - mova [r4 + 16 * 2], m2 - mova [r4 + 16 * 3], m3 - mova [r4 + 16 * 4], m4 - mova [r4 + 16 * 5], m5 - mova [r4 + 16 * 6], m6 - mova [r4 + 16 * 7], m7 - -%if CONFIG_VP9_HIGHBITDEPTH - add r3, 32 -%else - add r3, 16 -%endif - add r4, 16 * 8 - dec r7 - jne idct32x32_135_transpose - - IDCT32X32_135 16*0, 16*32, 16*64, 16*96 - lea stp, [stp + 16 * 8] -%if CONFIG_VP9_HIGHBITDEPTH - lea inputq, [inputq + 32 * 32] -%else - lea inputq, [inputq + 16 * 32] -%endif - dec r6 - jnz idct32x32_135 - - mov r6, 4 - lea stp, [rsp + pass_one_start] - lea r9, [rsp + pass_one_start] - -idct32x32_135_2: - lea r4, [rsp + transposed_in] - mov r3, r9 - mov r7, 2 - -idct32x32_135_transpose_2: - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 1] - mova m2, [r3 + 16 * 2] - mova m3, [r3 + 16 * 3] - mova m4, [r3 + 16 * 4] - mova m5, [r3 + 16 * 5] - mova m6, [r3 + 16 * 6] - mova m7, [r3 + 16 * 7] - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - mova [r4 + 0], m0 - mova [r4 + 16 * 1], m1 - mova [r4 + 16 * 2], m2 - mova [r4 + 16 * 3], m3 - mova [r4 + 16 * 4], m4 - mova [r4 + 16 * 5], m5 - mova [r4 + 16 * 6], m6 - mova [r4 + 16 * 7], m7 - - add r3, 16 * 8 - add r4, 16 * 8 - dec r7 - jne idct32x32_135_transpose_2 - - IDCT32X32_135 16*0, 16*8, 16*16, 16*24 - - lea stp, [stp + 16 * 32] - add r9, 16 * 32 - dec r6 - jnz idct32x32_135_2 - - RECON_AND_STORE pass_two_start - - RET - -%macro IDCT32X32_1024 4 - ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m1, [rsp + transposed_in + 16 * 1] - mova m11, [rsp + transposed_in + 16 * 31] - BUTTERFLY_4X 1, 11, 804, 16364, m8, 9, 10 ; stp1_16, stp1_31 - - mova m0, [rsp + transposed_in + 16 * 15] - mova m2, [rsp + transposed_in + 16 * 17] - BUTTERFLY_4X 2, 0, 12140, 11003, m8, 9, 10 ; stp1_17, stp1_30 - - mova m7, [rsp + transposed_in + 16 * 7] - mova m12, [rsp + transposed_in + 16 * 25] - BUTTERFLY_4X 12, 7, 15426, 5520, m8, 9, 10 ; stp1_19, stp1_28 - - mova m3, [rsp + transposed_in + 16 * 9] - mova m4, [rsp + transposed_in + 16 * 23] - BUTTERFLY_4X 3, 4, 7005, 14811, m8, 9, 10 ; stp1_18, stp1_29 - - ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 2, 9 ; stp2_16, stp2_17 - SUM_SUB 12, 3, 9 ; stp2_19, stp2_18 - SUM_SUB 7, 4, 9 ; stp2_28, stp2_29 - SUM_SUB 11, 0, 9 ; stp2_31, stp2_30 - - ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30 - BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18 - - ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 - 
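The driver code in this stretch is the shape shared by all three variants: pass one walks the transposed input in strips of eight rows (two strips here, four in the full 1024-coefficient version below), writes the 1-D result to the stack, then pass two transposes 8x8 tiles of that intermediate and runs the same macro again before RECON_AND_STORE. Under CONFIG_VP9_HIGHBITDEPTH the coefficients arrive as 32-bit values and are narrowed with signed saturation (packssdw) during the load/transpose. Scalar counterparts of those two building blocks, as a sketch:

    #include <stdint.h>

    /* packssdw, one lane: int32 -> int16 with signed saturation. */
    static int16_t narrow_sat(int32_t v) {
      return (int16_t)(v > 32767 ? 32767 : v < -32768 ? -32768 : v);
    }

    /* Scalar counterpart of TRANSPOSE8X8 (in place, square tile). */
    static void transpose_8x8(int16_t m[8][8]) {
      for (int i = 0; i < 8; ++i)
        for (int j = i + 1; j < 8; ++j) {
          int16_t t = m[i][j];
          m[i][j] = m[j][i];
          m[j][i] = t;
        }
    }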
SUM_SUB 0, 3, 9 ; stp2_17, stp2_18 - SUM_SUB 11, 7, 9 ; stp2_31, stp2_28 - SUM_SUB 2, 4, 9 ; stp2_30, stp2_29 - - ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 - BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 - - mova [stp + %3 + idx16], m1 - mova [stp + %3 + idx17], m0 - mova [stp + %3 + idx18], m4 - mova [stp + %3 + idx19], m7 - mova [stp + %4 + idx28], m12 - mova [stp + %4 + idx29], m3 - mova [stp + %4 + idx30], m2 - mova [stp + %4 + idx31], m11 - - ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m5, [rsp + transposed_in + 16 * 5] - mova m6, [rsp + transposed_in + 16 * 27] - BUTTERFLY_4X 5, 6, 3981, 15893, m8, 9, 10 ; stp1_20, stp1_27 - - mova m13, [rsp + transposed_in + 16 * 21] - mova m14, [rsp + transposed_in + 16 * 11] - BUTTERFLY_4X 13, 14, 14053, 8423, m8, 9, 10 ; stp1_21, stp1_26 - - mova m0, [rsp + transposed_in + 16 * 13] - mova m1, [rsp + transposed_in + 16 * 19] - BUTTERFLY_4X 0, 1, 9760, 13160, m8, 9, 10 ; stp1_22, stp1_25 - - mova m2, [rsp + transposed_in + 16 * 3] - mova m3, [rsp + transposed_in + 16 * 29] - BUTTERFLY_4X 3, 2, 16207, 2404, m8, 9, 10 ; stp1_23, stp1_24 - - ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 5, 13, 9 ; stp2_20, stp2_21 - SUM_SUB 3, 0, 9 ; stp2_23, stp2_22 - SUM_SUB 2, 1, 9 ; stp2_24, stp2_25 - SUM_SUB 6, 14, 9 ; stp2_27, stp2_26 - - ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 - BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 - - ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 - SUM_SUB 0, 14, 9 ; stp2_22, stp2_21 - SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 - SUM_SUB 1, 13, 9 ; stp2_25, stp2_26 - - ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 - BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 - - ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m4, [stp + %3 + idx16] - mova m7, [stp + %3 + idx17] - mova m11, [stp + %3 + idx18] - mova m12, [stp + %3 + idx19] - SUM_SUB 4, 3, 9 ; stp2_16, stp2_23 - SUM_SUB 7, 0, 9 ; stp2_17, stp2_22 - SUM_SUB 11, 14, 9 ; stp2_18, stp2_21 - SUM_SUB 12, 5, 9 ; stp2_19, stp2_20 - mova [stp + %3 + idx16], m4 - mova [stp + %3 + idx17], m7 - mova [stp + %3 + idx18], m11 - mova [stp + %3 + idx19], m12 - - mova m4, [stp + %4 + idx28] - mova m7, [stp + %4 + idx29] - mova m11, [stp + %4 + idx30] - mova m12, [stp + %4 + idx31] - SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 - SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 - SUM_SUB 11, 1, 9 ; stp2_30, stp2_25 - SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 - mova [stp + %4 + idx28], m4 - mova [stp + %4 + idx29], m7 - mova [stp + %4 + idx30], m11 - mova [stp + %4 + idx31], m12 - - ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 6, 5, 9 - pmulhrsw m6, m10 ; stp1_27 - pmulhrsw m5, m10 ; stp1_20 - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_26 - pmulhrsw m14, m10 ; stp1_21 - SUM_SUB 1, 0, 9 - pmulhrsw m1, m10 ; stp1_25 - pmulhrsw m0, m10 ; stp1_22 - SUM_SUB 2, 3, 9 - pmulhrsw m2, m10 ; stp1_25 - pmulhrsw m3, m10 ; stp1_22 -%else - BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, 
stp1_27 - SWAP 6, 5 - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26 - SWAP 13, 14 - BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25 - SWAP 1, 0 - BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24 - SWAP 2, 3 -%endif - mova [stp + %3 + idx20], m5 - mova [stp + %3 + idx21], m14 - mova [stp + %3 + idx22], m0 - mova [stp + %3 + idx23], m3 - mova [stp + %4 + idx24], m2 - mova [stp + %4 + idx25], m1 - mova [stp + %4 + idx26], m13 - mova [stp + %4 + idx27], m6 - - ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 2] - mova m1, [rsp + transposed_in + 16 * 30] - BUTTERFLY_4X 0, 1, 1606, 16305, m8, 9, 10 ; stp1_8, stp1_15 - - mova m2, [rsp + transposed_in + 16 * 14] - mova m3, [rsp + transposed_in + 16 * 18] - BUTTERFLY_4X 3, 2, 12665, 10394, m8, 9, 10 ; stp1_9, stp1_14 - - mova m4, [rsp + transposed_in + 16 * 10] - mova m5, [rsp + transposed_in + 16 * 22] - BUTTERFLY_4X 4, 5, 7723, 14449, m8, 9, 10 ; stp1_10, stp1_13 - - mova m6, [rsp + transposed_in + 16 * 6] - mova m7, [rsp + transposed_in + 16 * 26] - BUTTERFLY_4X 7, 6, 15679, 4756, m8, 9, 10 ; stp1_11, stp1_12 - - ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 3, 9 ; stp1_8, stp1_9 - SUM_SUB 7, 4, 9 ; stp1_11, stp1_10 - SUM_SUB 6, 5, 9 ; stp1_12, stp1_13 - SUM_SUB 1, 2, 9 ; stp1_15, stp1_14 - - ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14 - BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10 - - ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 7, 9 ; stp1_8, stp1_11 - SUM_SUB 2, 4, 9 ; stp1_9, stp1_10 - SUM_SUB 1, 6, 9 ; stp1_15, stp1_12 - SUM_SUB 3, 5, 9 ; stp1_14, stp1_13 - - ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 5, 4, 9 - pmulhrsw m5, m10 ; stp1_13 - pmulhrsw m4, m10 ; stp1_10 - SUM_SUB 6, 7, 9 - pmulhrsw m6, m10 ; stp1_12 - pmulhrsw m7, m10 ; stp1_11 -%else - BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13 - SWAP 5, 4 - BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12 - SWAP 6, 7 -%endif - ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova [stp + %2 + idx8], m0 - mova [stp + %2 + idx9], m2 - mova [stp + %2 + idx10], m4 - mova [stp + %2 + idx11], m7 - mova [stp + %2 + idx12], m6 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m3 - mova [stp + %2 + idx15], m1 - - ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, [rsp + transposed_in + 16 * 4] - mova m12, [rsp + transposed_in + 16 * 28] - BUTTERFLY_4X 11, 12, 3196, 16069, m8, 9, 10 ; stp1_4, stp1_7 - - mova m13, [rsp + transposed_in + 16 * 12] - mova m14, [rsp + transposed_in + 16 * 20] - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_5, stp1_6 - - ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 0] - mova m1, [rsp + transposed_in + 16 * 16] - -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 
0, 1, 9 - pmulhrsw m0, m10 ; stp1_1 - pmulhrsw m1, m10 ; stp1_0 -%else - BUTTERFLY_4X 0, 1, 11585, 11585, m8, 9, 10 ; stp1_1, stp1_0 - SWAP 0, 1 -%endif - mova m2, [rsp + transposed_in + 16 * 8] - mova m3, [rsp + transposed_in + 16 * 24] - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_2, stp1_3 - - mova m10, [pw_11585x2] - SUM_SUB 11, 14, 9 ; stp1_4, stp1_5 - SUM_SUB 12, 13, 9 ; stp1_7, stp1_6 - - ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_6 - pmulhrsw m14, m10 ; stp1_5 -%else - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6 - SWAP 13, 14 -%endif - SUM_SUB 0, 3, 9 ; stp1_0, stp1_3 - SUM_SUB 1, 2, 9 ; stp1_1, stp1_2 - - ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 12, 9 ; stp1_0, stp1_7 - SUM_SUB 1, 13, 9 ; stp1_1, stp1_6 - SUM_SUB 2, 14, 9 ; stp1_2, stp1_5 - SUM_SUB 3, 11, 9 ; stp1_3, stp1_4 - - ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m4, [stp + %2 + idx12] - mova m5, [stp + %2 + idx13] - mova m6, [stp + %2 + idx14] - mova m7, [stp + %2 + idx15] - SUM_SUB 0, 7, 9 ; stp1_0, stp1_15 - SUM_SUB 1, 6, 9 ; stp1_1, stp1_14 - SUM_SUB 2, 5, 9 ; stp1_2, stp1_13 - SUM_SUB 3, 4, 9 ; stp1_3, stp1_12 - - ; 0-3, 28-31 final stage - mova m10, [stp + %4 + idx31] - mova m15, [stp + %4 + idx30] - SUM_SUB 0, 10, 9 ; stp1_0, stp1_31 - SUM_SUB 1, 15, 9 ; stp1_1, stp1_30 - mova [stp + %1 + idx0], m0 - mova [stp + %1 + idx1], m1 - mova [stp + %4 + idx31], m10 - mova [stp + %4 + idx30], m15 - mova m0, [stp + %4 + idx29] - mova m1, [stp + %4 + idx28] - SUM_SUB 2, 0, 9 ; stp1_2, stp1_29 - SUM_SUB 3, 1, 9 ; stp1_3, stp1_28 - mova [stp + %1 + idx2], m2 - mova [stp + %1 + idx3], m3 - mova [stp + %4 + idx29], m0 - mova [stp + %4 + idx28], m1 - - ; 12-15, 16-19 final stage - mova m0, [stp + %3 + idx16] - mova m1, [stp + %3 + idx17] - mova m2, [stp + %3 + idx18] - mova m3, [stp + %3 + idx19] - SUM_SUB 7, 0, 9 ; stp1_15, stp1_16 - SUM_SUB 6, 1, 9 ; stp1_14, stp1_17 - SUM_SUB 5, 2, 9 ; stp1_13, stp1_18 - SUM_SUB 4, 3, 9 ; stp1_12, stp1_19 - mova [stp + %2 + idx12], m4 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m6 - mova [stp + %2 + idx15], m7 - mova [stp + %3 + idx16], m0 - mova [stp + %3 + idx17], m1 - mova [stp + %3 + idx18], m2 - mova [stp + %3 + idx19], m3 - - mova m4, [stp + %2 + idx8] - mova m5, [stp + %2 + idx9] - mova m6, [stp + %2 + idx10] - mova m7, [stp + %2 + idx11] - SUM_SUB 11, 7, 9 ; stp1_4, stp1_11 - SUM_SUB 14, 6, 9 ; stp1_5, stp1_10 - SUM_SUB 13, 5, 9 ; stp1_6, stp1_9 - SUM_SUB 12, 4, 9 ; stp1_7, stp1_8 - - ; 4-7, 24-27 final stage - mova m3, [stp + %4 + idx24] - mova m2, [stp + %4 + idx25] - mova m1, [stp + %4 + idx26] - mova m0, [stp + %4 + idx27] - SUM_SUB 12, 3, 9 ; stp1_7, stp1_24 - SUM_SUB 13, 2, 9 ; stp1_6, stp1_25 - SUM_SUB 14, 1, 9 ; stp1_5, stp1_26 - SUM_SUB 11, 0, 9 ; stp1_4, stp1_27 - mova [stp + %4 + idx24], m3 - mova [stp + %4 + idx25], m2 - mova [stp + %4 + idx26], m1 - mova [stp + %4 + idx27], m0 - mova [stp + %1 + idx4], m11 - mova [stp + %1 + idx5], m14 - mova [stp + %1 + idx6], m13 - mova [stp + %1 + idx7], m12 - - ; 8-11, 20-23 final stage - mova m0, [stp + %3 + idx20] - mova m1, [stp + %3 + idx21] - mova m2, [stp + %3 + idx22] - mova m3, [stp + %3 + idx23] - SUM_SUB 7, 0, 9 ; stp1_11, stp_20 - SUM_SUB 6, 1, 9 ; stp1_10, stp_21 - SUM_SUB 5, 2, 9 ; stp1_9, stp_22 - SUM_SUB 4, 3, 9 ; stp1_8, stp_23 - mova [stp + %2 + idx8], m4 - 
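Where two coefficients are live, the workhorse of this 1024-coefficient version is BUTTERFLY_4X r0, r1, c0, c1 with m8 holding pd_8192: a fixed-point planar rotation with round-to-nearest at 14 bits. A scalar model, the argument mapping being my reading of the call sites:

    #include <stdint.h>

    static int16_t dct_round_shift(int32_t x) {
      return (int16_t)((x + 8192) >> 14);       /* the pd_8192 in m8 */
    }

    static void butterfly(int16_t *x, int16_t *y, int c0, int c1) {
      int32_t t0 = (int32_t)*x * c0 - (int32_t)*y * c1;
      int32_t t1 = (int32_t)*x * c1 + (int32_t)*y * c0;
      *x = dct_round_shift(t0);
      *y = dct_round_shift(t1);
    }

Read that way, BUTTERFLY_4X 1, 11, 804, 16364 earlier in this macro is butterfly(&step[16], &step[31], 804, 16364), 804 and 16364 being the usual cospi_31_64 and cospi_1_64 table values; BUTTERFLY_4Xmm is, judging by its call sites, a mirrored variant of the same rotation.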
mova [stp + %2 + idx9], m5 - mova [stp + %2 + idx10], m6 - mova [stp + %2 + idx11], m7 - mova [stp + %3 + idx20], m0 - mova [stp + %3 + idx21], m1 - mova [stp + %3 + idx22], m2 - mova [stp + %3 + idx23], m3 -%endmacro - -INIT_XMM ssse3 -cglobal idct32x32_1024_add, 3, 11, 16, i32x32_size, input, output, stride - mova m8, [pd_8192] - mov r6, 4 - lea stp, [rsp + pass_one_start] - -idct32x32_1024: - mov r3, inputq - lea r4, [rsp + transposed_in] - mov r7, 4 - -idct32x32_1024_transpose: -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [r3 + 0] - packssdw m0, [r3 + 16] - mova m1, [r3 + 32 * 4] - packssdw m1, [r3 + 32 * 4 + 16] - mova m2, [r3 + 32 * 8] - packssdw m2, [r3 + 32 * 8 + 16] - mova m3, [r3 + 32 * 12] - packssdw m3, [r3 + 32 * 12 + 16] - mova m4, [r3 + 32 * 16] - packssdw m4, [r3 + 32 * 16 + 16] - mova m5, [r3 + 32 * 20] - packssdw m5, [r3 + 32 * 20 + 16] - mova m6, [r3 + 32 * 24] - packssdw m6, [r3 + 32 * 24 + 16] - mova m7, [r3 + 32 * 28] - packssdw m7, [r3 + 32 * 28 + 16] -%else - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 4] - mova m2, [r3 + 16 * 8] - mova m3, [r3 + 16 * 12] - mova m4, [r3 + 16 * 16] - mova m5, [r3 + 16 * 20] - mova m6, [r3 + 16 * 24] - mova m7, [r3 + 16 * 28] -%endif - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - mova [r4 + 0], m0 - mova [r4 + 16 * 1], m1 - mova [r4 + 16 * 2], m2 - mova [r4 + 16 * 3], m3 - mova [r4 + 16 * 4], m4 - mova [r4 + 16 * 5], m5 - mova [r4 + 16 * 6], m6 - mova [r4 + 16 * 7], m7 -%if CONFIG_VP9_HIGHBITDEPTH - add r3, 32 -%else - add r3, 16 -%endif - add r4, 16 * 8 - dec r7 - jne idct32x32_1024_transpose - - IDCT32X32_1024 16*0, 16*32, 16*64, 16*96 - - lea stp, [stp + 16 * 8] -%if CONFIG_VP9_HIGHBITDEPTH - lea inputq, [inputq + 32 * 32] -%else - lea inputq, [inputq + 16 * 32] -%endif - dec r6 - jnz idct32x32_1024 - - mov r6, 4 - lea stp, [rsp + pass_one_start] - lea r9, [rsp + pass_one_start] - -idct32x32_1024_2: - lea r4, [rsp + transposed_in] - mov r3, r9 - mov r7, 4 - -idct32x32_1024_transpose_2: - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 1] - mova m2, [r3 + 16 * 2] - mova m3, [r3 + 16 * 3] - mova m4, [r3 + 16 * 4] - mova m5, [r3 + 16 * 5] - mova m6, [r3 + 16 * 6] - mova m7, [r3 + 16 * 7] - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - mova [r4 + 0], m0 - mova [r4 + 16 * 1], m1 - mova [r4 + 16 * 2], m2 - mova [r4 + 16 * 3], m3 - mova [r4 + 16 * 4], m4 - mova [r4 + 16 * 5], m5 - mova [r4 + 16 * 6], m6 - mova [r4 + 16 * 7], m7 - - add r3, 16 * 8 - add r4, 16 * 8 - dec r7 - jne idct32x32_1024_transpose_2 - - IDCT32X32_1024 16*0, 16*8, 16*16, 16*24 - - lea stp, [stp + 16 * 32] - add r9, 16 * 32 - dec r6 - jnz idct32x32_1024_2 - - RECON_AND_STORE pass_two_start - - RET -%endif diff --git a/thirdparty/libvpx/vpx_dsp/x86/inv_wht_sse2.asm b/thirdparty/libvpx/vpx_dsp/x86/inv_wht_sse2.asm deleted file mode 100644 index fbbcd76bd7..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/inv_wht_sse2.asm +++ /dev/null @@ -1,109 +0,0 @@ -; -; Copyright (c) 2015 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
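The file removed next, inv_wht_sse2.asm, is the 4x4 inverse Walsh-Hadamard transform used for lossless coding. Its TRANSFORM_COLS macro is a lifting scheme applied twice (columns, transpose, columns again), bracketed by the psraw ..., 2 pre-scale and the ADD_STORE_4P_2X reconstruction. One column in scalar form, following the reference C routine (iwht4x4_16_add) as I understand it:

    #include <stdint.h>

    /* v = {a, b, c, d}, i.e. after REORDER_INPUTS fixes the order. */
    static void iwht4_col(int16_t v[4]) {
      int a = v[0], b = v[1], c = v[2], d = v[3], e;
      a += c;               /* paddw m0, m2                     */
      d -= b;               /* psubw m3, m1                     */
      e = (a - d) >> 1;     /* the "wide subtract" path         */
      b = e - b;
      c = e - c;
      a -= b;
      d += c;
      v[0] = (int16_t)a; v[1] = (int16_t)b;
      v[2] = (int16_t)c; v[3] = (int16_t)d;
    }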
-; - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -%macro REORDER_INPUTS 0 - ; a c d b to a b c d - SWAP 1, 3, 2 -%endmacro - -%macro TRANSFORM_COLS 0 - ; input: - ; m0 a - ; m1 b - ; m2 c - ; m3 d - paddw m0, m2 - psubw m3, m1 - - ; wide subtract - punpcklwd m4, m0 - punpcklwd m5, m3 - psrad m4, 16 - psrad m5, 16 - psubd m4, m5 - psrad m4, 1 - packssdw m4, m4 ; e - - psubw m5, m4, m1 ; b - psubw m4, m2 ; c - psubw m0, m5 - paddw m3, m4 - ; m0 a - SWAP 1, 5 ; m1 b - SWAP 2, 4 ; m2 c - ; m3 d -%endmacro - -%macro TRANSPOSE_4X4 0 - punpcklwd m0, m2 - punpcklwd m1, m3 - mova m2, m0 - punpcklwd m0, m1 - punpckhwd m2, m1 - pshufd m1, m0, 0x0e - pshufd m3, m2, 0x0e -%endmacro - -; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3 -%macro TRANSPOSE_4X4_WIDE 0 - mova m3, m0 - punpcklwd m0, m1 - punpckhwd m3, m1 - mova m2, m0 - punpcklwd m0, m3 - punpckhwd m2, m3 - pshufd m1, m0, 0x0e - pshufd m3, m2, 0x0e -%endmacro - -%macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero - movd m%3, [outputq] - movd m%4, [outputq + strideq] - punpcklbw m%3, m%5 - punpcklbw m%4, m%5 - paddw m%1, m%3 - paddw m%2, m%4 - packuswb m%1, m%5 - packuswb m%2, m%5 - movd [outputq], m%1 - movd [outputq + strideq], m%2 -%endmacro - -INIT_XMM sse2 -cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [inputq + 0] - packssdw m0, [inputq + 16] - mova m1, [inputq + 32] - packssdw m1, [inputq + 48] -%else - mova m0, [inputq + 0] - mova m1, [inputq + 16] -%endif - psraw m0, 2 - psraw m1, 2 - - TRANSPOSE_4X4_WIDE - REORDER_INPUTS - TRANSFORM_COLS - TRANSPOSE_4X4 - REORDER_INPUTS - TRANSFORM_COLS - - pxor m4, m4 - ADD_STORE_4P_2X 0, 1, 5, 6, 4 - lea outputq, [outputq + 2 * strideq] - ADD_STORE_4P_2X 2, 3, 5, 6, 4 - - RET diff --git a/thirdparty/libvpx/vpx_dsp/x86/loopfilter_avx2.c b/thirdparty/libvpx/vpx_dsp/x86/loopfilter_avx2.c deleted file mode 100644 index be1087c1e9..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/loopfilter_avx2.c +++ /dev/null @@ -1,979 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
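The AVX2 loop-filter file removed below builds its per-edge activity mask entirely out of saturating byte operations: |a - b| is formed as subs_epu8(a, b) | subs_epu8(b, a), and every comparison is folded into a running max that is finally tested against limit. Spelled out as scalar logic (this mirrors the conditions stated in the in-code comments and matches the C reference mask in vpx_dsp, to the best of my knowledge):

    #include <stdint.h>
    #include <stdlib.h>

    static int8_t filter_mask(uint8_t limit, uint8_t blimit,
                              uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                              uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3) {
      int8_t mask = 0;
      mask |= (abs(p3 - p2) > limit) * -1;
      mask |= (abs(p2 - p1) > limit) * -1;
      mask |= (abs(p1 - p0) > limit) * -1;
      mask |= (abs(q1 - q0) > limit) * -1;
      mask |= (abs(q2 - q1) > limit) * -1;
      mask |= (abs(q3 - q2) > limit) * -1;
      mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
      return (int8_t)~mask;   /* all-ones where the edge may be filtered */
    }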
- */ - -#include <immintrin.h> /* AVX2 */ - -#include "./vpx_dsp_rtcd.h" -#include "vpx_ports/mem.h" - -void vpx_lpf_horizontal_edge_8_avx2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - __m128i mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi8(1); - __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; - __m128i abs_p1p0; - - const __m128i thresh = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _thresh[0])); - const __m128i limit = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _limit[0])); - const __m128i blimit = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _blimit[0])); - - q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p)); - q4p4 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p))); - q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p)); - q3p3 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p))); - q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p)); - q2p2 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p))); - q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p)); - q1p1 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p))); - p1q1 = _mm_shuffle_epi32(q1p1, 78); - q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p)); - q0p0 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p))); - p0q0 = _mm_shuffle_epi32(q0p0, 78); - - { - __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; - abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), - _mm_subs_epu8(q0p0, q1p1)); - abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - fe = _mm_set1_epi8(0xfe); - ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), - _mm_subs_epu8(p0q0, q0p0)); - abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), - _mm_subs_epu8(p1q1, q1p1)); - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(abs_p1p0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), - _mm_subs_epu8(q1p1, q2p2)), - _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), - _mm_subs_epu8(q2p2, q3p3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i t1 = _mm_set1_epi16(0x1); - __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); - __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); - __m128i qs0 = _mm_xor_si128(p0q0, t80); - __m128i qs1 = _mm_xor_si128(p1q1, t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; - __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; - - filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, qs0ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, 
work_a); - filt = _mm_adds_epi8(filt, work_a); - /* (vpx_filter + 3 * (qs0 - ps0)) & mask */ - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - filter1 = _mm_unpacklo_epi8(zero, filter1); - filter1 = _mm_srai_epi16(filter1, 0xB); - filter2 = _mm_unpacklo_epi8(zero, filter2); - filter2 = _mm_srai_epi16(filter2, 0xB); - - /* Filter1 >> 3 */ - filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); - qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); - - /* filt >> 1 */ - filt = _mm_adds_epi16(filter1, t1); - filt = _mm_srai_epi16(filt, 1); - filt = _mm_andnot_si128( - _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt); - filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); - qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); - // loopfilter done - - { - __m128i work; - flat = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), - _mm_subs_epu8(q0p0, q2p2)), - _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), - _mm_subs_epu8(q0p0, q3p3))); - flat = _mm_max_epu8(abs_p1p0, flat); - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - - q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p)); - q5p5 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q5p5), - (__m64 *) (s + 5 * p))); - - q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p)); - q6p6 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q6p6), - (__m64 *) (s + 6 * p))); - - flat2 = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), - _mm_subs_epu8(q0p0, q4p4)), - _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), - _mm_subs_epu8(q0p0, q5p5))); - - q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p)); - q7p7 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q7p7), - (__m64 *) (s + 7 * p))); - - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), - _mm_subs_epu8(q0p0, q6p6)), - _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), - _mm_subs_epu8(q0p0, q7p7))); - - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - { - const __m128i eight = _mm_set1_epi16(8); - const __m128i four = _mm_set1_epi16(4); - __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; - __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; - __m128i pixelFilter_p, pixelFilter_q; - __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; - __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; - - p7_16 = _mm_unpacklo_epi8(q7p7, zero); - p6_16 = _mm_unpacklo_epi8(q6p6, zero); - p5_16 = _mm_unpacklo_epi8(q5p5, zero); - p4_16 = _mm_unpacklo_epi8(q4p4, zero); - p3_16 = _mm_unpacklo_epi8(q3p3, zero); - p2_16 = _mm_unpacklo_epi8(q2p2, zero); - p1_16 = _mm_unpacklo_epi8(q1p1, zero); - p0_16 = _mm_unpacklo_epi8(q0p0, zero); - q0_16 = _mm_unpackhi_epi8(q0p0, zero); - q1_16 = _mm_unpackhi_epi8(q1p1, zero); - q2_16 = _mm_unpackhi_epi8(q2p2, zero); - q3_16 = _mm_unpackhi_epi8(q3p3, zero); - q4_16 = _mm_unpackhi_epi8(q4p4, zero); - q5_16 = _mm_unpackhi_epi8(q5p5, zero); - q6_16 = _mm_unpackhi_epi8(q6p6, zero); - q7_16 = _mm_unpackhi_epi8(q7p7, zero); - - pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), - _mm_add_epi16(p4_16, p3_16)); - pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), - 
_mm_add_epi16(q4_16, q3_16)); - - pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, - _mm_add_epi16(p2_16, p1_16)); - pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, - _mm_add_epi16(q2_16, q1_16)); - pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - pixelFilter_p = _mm_add_epi16(eight, - _mm_add_epi16(pixelFilter_p, pixelFilter_q)); - pixetFilter_p2p1p0 = _mm_add_epi16(four, - _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), - 4); - flat2_q0p0 = _mm_packus_epi16(res_p, res_q); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(p3_16, p0_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(q3_16, q0_16)), 3); - - flat_q0p0 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(p7_16, p7_16); - sum_q7 = _mm_add_epi16(q7_16, q7_16); - sum_p3 = _mm_add_epi16(p3_16, p3_16); - sum_q3 = _mm_add_epi16(q3_16, q3_16); - - pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), - 4); - flat2_q1p1 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(sum_p3, p1_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, - _mm_add_epi16(sum_q3, q1_16)), 3); - flat_q1p1 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - sum_p3 = _mm_add_epi16(sum_p3, p3_16); - sum_q3 = _mm_add_epi16(sum_q3, q3_16); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), - 4); - flat2_q2p2 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); - - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(sum_p3, p2_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, - _mm_add_epi16(sum_q3, q2_16)), 3); - flat_q2p2 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), - 4); - flat2_q3p3 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, 
_mm_add_epi16(sum_q7, q4_16)), - 4); - flat2_q4p4 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), - 4); - flat2_q5p5 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), - 4); - flat2_q6p6 = _mm_packus_epi16(res_p, res_q); - } - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - flat = _mm_shuffle_epi32(flat, 68); - flat2 = _mm_shuffle_epi32(flat2, 68); - - q2p2 = _mm_andnot_si128(flat, q2p2); - flat_q2p2 = _mm_and_si128(flat, flat_q2p2); - q2p2 = _mm_or_si128(q2p2, flat_q2p2); - - qs1ps1 = _mm_andnot_si128(flat, qs1ps1); - flat_q1p1 = _mm_and_si128(flat, flat_q1p1); - q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); - - qs0ps0 = _mm_andnot_si128(flat, qs0ps0); - flat_q0p0 = _mm_and_si128(flat, flat_q0p0); - q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); - - q6p6 = _mm_andnot_si128(flat2, q6p6); - flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); - q6p6 = _mm_or_si128(q6p6, flat2_q6p6); - _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6); - _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6)); - - q5p5 = _mm_andnot_si128(flat2, q5p5); - flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); - q5p5 = _mm_or_si128(q5p5, flat2_q5p5); - _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5); - _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5)); - - q4p4 = _mm_andnot_si128(flat2, q4p4); - flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); - q4p4 = _mm_or_si128(q4p4, flat2_q4p4); - _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4); - _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4)); - - q3p3 = _mm_andnot_si128(flat2, q3p3); - flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); - q3p3 = _mm_or_si128(q3p3, flat2_q3p3); - _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3); - _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3)); - - q2p2 = _mm_andnot_si128(flat2, q2p2); - flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); - q2p2 = _mm_or_si128(q2p2, flat2_q2p2); - _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2); - _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2)); - - q1p1 = _mm_andnot_si128(flat2, q1p1); - flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); - q1p1 = _mm_or_si128(q1p1, flat2_q1p1); - _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1); - _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1)); - - q0p0 = _mm_andnot_si128(flat2, q0p0); - flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); - q0p0 = _mm_or_si128(q0p0, flat2_q0p0); - _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0); - _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0)); - } -} - -DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { - 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128, - 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 -}; - -void vpx_lpf_horizontal_edge_16_avx2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned 
char *_limit, - const unsigned char *_thresh) { - __m128i mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi8(1); - __m128i p7, p6, p5; - __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; - __m128i q5, q6, q7; - __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, - q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, - p256_0, q256_0; - - const __m128i thresh = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _thresh[0])); - const __m128i limit = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _limit[0])); - const __m128i blimit = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _blimit[0])); - - p256_4 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 5 * p))); - p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 4 * p))); - p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 3 * p))); - p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 2 * p))); - p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 1 * p))); - q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 0 * p))); - q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 1 * p))); - q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 2 * p))); - q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 3 * p))); - q256_4 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 4 * p))); - - p4 = _mm256_castsi256_si128(p256_4); - p3 = _mm256_castsi256_si128(p256_3); - p2 = _mm256_castsi256_si128(p256_2); - p1 = _mm256_castsi256_si128(p256_1); - p0 = _mm256_castsi256_si128(p256_0); - q0 = _mm256_castsi256_si128(q256_0); - q1 = _mm256_castsi256_si128(q256_1); - q2 = _mm256_castsi256_si128(q256_2); - q3 = _mm256_castsi256_si128(q256_3); - q4 = _mm256_castsi256_si128(q256_4); - - { - const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), - _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), - _mm_subs_epu8(q0, q1)); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), - _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), - _mm_subs_epu8(q1, p1)); - __m128i work; - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(flat, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = 
_mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i t7f = _mm_set1_epi8(0x7f); - - __m128i ps1 = _mm_xor_si128(p1, t80); - __m128i ps0 = _mm_xor_si128(p0, t80); - __m128i qs0 = _mm_xor_si128(q0, t80); - __m128i qs1 = _mm_xor_si128(q1, t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1, - flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, - flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, - flat_q2; - - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - /* (vpx_filter + 3 * (qs0 - ps0)) & mask */ - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - /* Filter1 >> 3 */ - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - - /* Filter2 >> 3 */ - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - - /* filt >> 1 */ - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - filt = _mm_andnot_si128(hev, filt); - ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - // loopfilter done - - { - __m128i work; - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)), - _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4))); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - - p256_5 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 6 * p))); - q256_5 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 5 * p))); - p5 = _mm256_castsi256_si128(p256_5); - q5 = _mm256_castsi256_si128(q256_5); - flat2 = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)), - _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5))); - - flat2 = _mm_max_epu8(work, flat2); - p256_6 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 7 * p))); - q256_6 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 6 * p))); - p6 = _mm256_castsi256_si128(p256_6); - q6 = _mm256_castsi256_si128(q256_6); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)), - _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6))); - - flat2 = _mm_max_epu8(work, flat2); - - p256_7 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 8 * p))); - q256_7 = 
_mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 7 * p))); - p7 = _mm256_castsi256_si128(p256_7); - q7 = _mm256_castsi256_si128(q256_7); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)), - _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7))); - - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - { - const __m256i eight = _mm256_set1_epi16(8); - const __m256i four = _mm256_set1_epi16(4); - __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0, - pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, - res_q; - - const __m256i filter = _mm256_load_si256( - (__m256i const *)filt_loopfilter_avx2); - p256_7 = _mm256_shuffle_epi8(p256_7, filter); - p256_6 = _mm256_shuffle_epi8(p256_6, filter); - p256_5 = _mm256_shuffle_epi8(p256_5, filter); - p256_4 = _mm256_shuffle_epi8(p256_4, filter); - p256_3 = _mm256_shuffle_epi8(p256_3, filter); - p256_2 = _mm256_shuffle_epi8(p256_2, filter); - p256_1 = _mm256_shuffle_epi8(p256_1, filter); - p256_0 = _mm256_shuffle_epi8(p256_0, filter); - q256_0 = _mm256_shuffle_epi8(q256_0, filter); - q256_1 = _mm256_shuffle_epi8(q256_1, filter); - q256_2 = _mm256_shuffle_epi8(q256_2, filter); - q256_3 = _mm256_shuffle_epi8(q256_3, filter); - q256_4 = _mm256_shuffle_epi8(q256_4, filter); - q256_5 = _mm256_shuffle_epi8(q256_5, filter); - q256_6 = _mm256_shuffle_epi8(q256_6, filter); - q256_7 = _mm256_shuffle_epi8(q256_7, filter); - - pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5), - _mm256_add_epi16(p256_4, p256_3)); - pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5), - _mm256_add_epi16(q256_4, q256_3)); - - pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0, - _mm256_add_epi16(p256_2, p256_1)); - pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0, - _mm256_add_epi16(q256_2, q256_1)); - pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - - pixelFilter_p = _mm256_add_epi16(eight, - _mm256_add_epi16(pixelFilter_p, pixelFilter_q)); - - pixetFilter_p2p1p0 = _mm256_add_epi16(four, - _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(p256_7, p256_0)), 4); - - flat2_p0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(q256_7, q256_0)), 4); - - flat2_q0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(p256_3, p256_0)), 3); - - flat_p0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(q256_3, q256_0)), 3); - - flat_q0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - sum_p7 = _mm256_add_epi16(p256_7, p256_7); - - sum_q7 = _mm256_add_epi16(q256_7, q256_7); - - sum_p3 = _mm256_add_epi16(p256_3, p256_3); - - sum_q3 = _mm256_add_epi16(q256_3, q256_3); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6); - - pixelFilter_p = 
_mm256_sub_epi16(pixelFilter_p, q256_6); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_1)), 4); - - flat2_p1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_1)), 4); - - flat2_q1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2); - - pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(sum_p3, p256_1)), 3); - - flat_p1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_q2q1q0, - _mm256_add_epi16(sum_q3, q256_1)), 3); - - flat_q1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - sum_p3 = _mm256_add_epi16(sum_p3, p256_3); - - sum_q3 = _mm256_add_epi16(sum_q3, q256_3); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_2)), 4); - - flat2_p2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_2)), 4); - - flat2_q2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1); - - pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(sum_p3, p256_2)), 3); - - flat_p2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_q2q1q0, - _mm256_add_epi16(sum_q3, q256_2)), 3); - - flat_q2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_3)), 4); - - flat2_p3 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_3)), 4); - - flat2_q3 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_4)), 4); - - flat2_p4 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, 
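[note] Each successive tap reuses the running sum instead of re-adding fifteen samples: one trailing q value is subtracted per step while sum_p7 keeps growing the p7 weight, so every output costs only a few adds. The first two slides, written out in the same scalar form as above:

flat2_p1 = ROUND_POWER_OF_TWO(2 * p7 + p6 + p5 + p4 + p3 + p2 + 2 * p1 +
                              p0 + q0 + q1 + q2 + q3 + q4 + q5, 4);
flat2_p2 = ROUND_POWER_OF_TWO(3 * p7 + p6 + p5 + p4 + p3 + 2 * p2 + p1 +
                              p0 + q0 + q1 + q2 + q3 + q4, 4);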
q256_4)), 4); - - flat2_q4 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_5)), 4); - - flat2_p5 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_5)), 4); - - flat2_q5 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_6)), 4); - - flat2_p6 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_6)), 4); - - flat2_q6 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - } - - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - p2 = _mm_andnot_si128(flat, p2); - flat_p2 = _mm_and_si128(flat, flat_p2); - p2 = _mm_or_si128(flat_p2, p2); - - p1 = _mm_andnot_si128(flat, ps1); - flat_p1 = _mm_and_si128(flat, flat_p1); - p1 = _mm_or_si128(flat_p1, p1); - - p0 = _mm_andnot_si128(flat, ps0); - flat_p0 = _mm_and_si128(flat, flat_p0); - p0 = _mm_or_si128(flat_p0, p0); - - q0 = _mm_andnot_si128(flat, qs0); - flat_q0 = _mm_and_si128(flat, flat_q0); - q0 = _mm_or_si128(flat_q0, q0); - - q1 = _mm_andnot_si128(flat, qs1); - flat_q1 = _mm_and_si128(flat, flat_q1); - q1 = _mm_or_si128(flat_q1, q1); - - q2 = _mm_andnot_si128(flat, q2); - flat_q2 = _mm_and_si128(flat, flat_q2); - q2 = _mm_or_si128(flat_q2, q2); - - p6 = _mm_andnot_si128(flat2, p6); - flat2_p6 = _mm_and_si128(flat2, flat2_p6); - p6 = _mm_or_si128(flat2_p6, p6); - _mm_storeu_si128((__m128i *) (s - 7 * p), p6); - - p5 = _mm_andnot_si128(flat2, p5); - flat2_p5 = _mm_and_si128(flat2, flat2_p5); - p5 = _mm_or_si128(flat2_p5, p5); - _mm_storeu_si128((__m128i *) (s - 6 * p), p5); - - p4 = _mm_andnot_si128(flat2, p4); - flat2_p4 = _mm_and_si128(flat2, flat2_p4); - p4 = _mm_or_si128(flat2_p4, p4); - _mm_storeu_si128((__m128i *) (s - 5 * p), p4); - - p3 = _mm_andnot_si128(flat2, p3); - flat2_p3 = _mm_and_si128(flat2, flat2_p3); - p3 = _mm_or_si128(flat2_p3, p3); - _mm_storeu_si128((__m128i *) (s - 4 * p), p3); - - p2 = _mm_andnot_si128(flat2, p2); - flat2_p2 = _mm_and_si128(flat2, flat2_p2); - p2 = _mm_or_si128(flat2_p2, p2); - _mm_storeu_si128((__m128i *) (s - 3 * p), p2); - - p1 = _mm_andnot_si128(flat2, p1); - flat2_p1 = _mm_and_si128(flat2, flat2_p1); - p1 = _mm_or_si128(flat2_p1, p1); - _mm_storeu_si128((__m128i *) (s - 2 * p), p1); - - p0 = _mm_andnot_si128(flat2, p0); - flat2_p0 = _mm_and_si128(flat2, flat2_p0); - p0 = _mm_or_si128(flat2_p0, p0); - _mm_storeu_si128((__m128i *) (s - 1 * p), p0); - - q0 = _mm_andnot_si128(flat2, q0); - flat2_q0 = _mm_and_si128(flat2, flat2_q0); - q0 = _mm_or_si128(flat2_q0, q0); - _mm_storeu_si128((__m128i *) (s - 0 * p), q0); - - q1 = _mm_andnot_si128(flat2, q1); - 
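[note] The andnot/and/or store sequence around this point is the classic pre-SSE4.1 select: with a lane mask of 0x00/0xFF, (mask & a) | (~mask & b) picks the filtered value only where the flat test passed. As a hypothetical standalone helper (select_epi8 is an illustrative name):

#include <emmintrin.h>
/* Byte-wise select: a where mask is 0xff, b where mask is 0x00.
   SSE2 has no blend instruction, so and/andnot/or stands in for it. */
static __m128i select_epi8(__m128i mask, __m128i a, __m128i b) {
  return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
}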
flat2_q1 = _mm_and_si128(flat2, flat2_q1); - q1 = _mm_or_si128(flat2_q1, q1); - _mm_storeu_si128((__m128i *) (s + 1 * p), q1); - - q2 = _mm_andnot_si128(flat2, q2); - flat2_q2 = _mm_and_si128(flat2, flat2_q2); - q2 = _mm_or_si128(flat2_q2, q2); - _mm_storeu_si128((__m128i *) (s + 2 * p), q2); - - q3 = _mm_andnot_si128(flat2, q3); - flat2_q3 = _mm_and_si128(flat2, flat2_q3); - q3 = _mm_or_si128(flat2_q3, q3); - _mm_storeu_si128((__m128i *) (s + 3 * p), q3); - - q4 = _mm_andnot_si128(flat2, q4); - flat2_q4 = _mm_and_si128(flat2, flat2_q4); - q4 = _mm_or_si128(flat2_q4, q4); - _mm_storeu_si128((__m128i *) (s + 4 * p), q4); - - q5 = _mm_andnot_si128(flat2, q5); - flat2_q5 = _mm_and_si128(flat2, flat2_q5); - q5 = _mm_or_si128(flat2_q5, q5); - _mm_storeu_si128((__m128i *) (s + 5 * p), q5); - - q6 = _mm_andnot_si128(flat2, q6); - flat2_q6 = _mm_and_si128(flat2, flat2_q6); - q6 = _mm_or_si128(flat2_q6, q6); - _mm_storeu_si128((__m128i *) (s + 6 * p), q6); - } -} diff --git a/thirdparty/libvpx/vpx_dsp/x86/loopfilter_sse2.c b/thirdparty/libvpx/vpx_dsp/x86/loopfilter_sse2.c deleted file mode 100644 index 739adf31d0..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/loopfilter_sse2.c +++ /dev/null @@ -1,1776 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <emmintrin.h> // SSE2 - -#include "./vpx_dsp_rtcd.h" -#include "vpx_ports/mem.h" -#include "vpx_ports/emmintrin_compat.h" - -static INLINE __m128i abs_diff(__m128i a, __m128i b) { - return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); -} - -// filter_mask and hev_mask -#define FILTER_HEV_MASK do { \ - /* (abs(q1 - q0), abs(p1 - p0) */ \ - __m128i flat = abs_diff(q1p1, q0p0); \ - /* abs(p1 - q1), abs(p0 - q0) */ \ - const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \ - __m128i abs_p0q0, abs_p1q1, work; \ - \ - /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \ - hev = _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \ - hev = _mm_cmpgt_epi16(hev, thresh); \ - hev = _mm_packs_epi16(hev, hev); \ - \ - /* const int8_t mask = filter_mask(*limit, *blimit, */ \ - /* p3, p2, p1, p0, q0, q1, q2, q3); */ \ - abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */\ - abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */\ - abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \ - abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \ - /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \ - mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \ - /* abs(p3 - p2), abs(p2 - p1) */ \ - work = abs_diff(p3p2, p2p1); \ - flat = _mm_max_epu8(work, flat); \ - /* abs(q3 - q2), abs(q2 - q1) */ \ - work = abs_diff(q3q2, q2q1); \ - flat = _mm_max_epu8(work, flat); \ - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \ - mask = _mm_unpacklo_epi64(mask, flat); \ - mask = _mm_subs_epu8(mask, limit); \ - mask = _mm_cmpeq_epi8(mask, zero); \ - mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \ -} while (0) - -#define FILTER4 do { \ - const __m128i t3t4 = _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, \ - 4, 4, 4, 4, 4, 4, 4, 4); \ - const __m128i t80 = _mm_set1_epi8(0x80); \ - __m128i filter, filter2filter1, work; \ - \ - ps1ps0 
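[note] abs_diff above relies on unsigned saturation: one of the two _mm_subs_epu8 results is always zero, so OR-ing them yields |a - b| without widening. FILTER_HEV_MASK is the vector form of the scalar masks quoted in its own comments; per pixel they amount to roughly the following (a sketch matching those comments):

#include <stdint.h>
#include <stdlib.h>
static int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
                          uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
                          uint8_t q1, uint8_t q2, uint8_t q3) {
  int8_t mask = 0;
  mask |= (abs(p3 - p2) > limit) * -1;
  mask |= (abs(p2 - p1) > limit) * -1;
  mask |= (abs(p1 - p0) > limit) * -1;
  mask |= (abs(q1 - q0) > limit) * -1;
  mask |= (abs(q2 - q1) > limit) * -1;
  mask |= (abs(q3 - q2) > limit) * -1;
  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
  return ~mask;  /* 0xff = filter this pixel */
}
static int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, uint8_t q0,
                       uint8_t q1) {
  int8_t hev = 0;
  hev |= (abs(p1 - p0) > thresh) * -1;
  hev |= (abs(q1 - q0) > thresh) * -1;
  return hev;
}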
= _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \ - qs1qs0 = _mm_xor_si128(q1q0, t80); \ - \ - /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ \ - work = _mm_subs_epi8(ps1ps0, qs1qs0); \ - filter = _mm_and_si128(_mm_srli_si128(work, 8), hev); \ - /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ \ - filter = _mm_subs_epi8(filter, work); \ - filter = _mm_subs_epi8(filter, work); \ - filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ \ - filter = _mm_and_si128(filter, mask); /* & mask */ \ - filter = _mm_unpacklo_epi64(filter, filter); \ - \ - /* filter1 = signed_char_clamp(filter + 4) >> 3; */ \ - /* filter2 = signed_char_clamp(filter + 3) >> 3; */ \ - filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ \ - filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); \ - filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); \ - filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ \ - filter = _mm_srai_epi16(filter, 11); /* >> 3 */ \ - filter2filter1 = _mm_packs_epi16(filter2filter1, filter); \ - \ - /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ \ - filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ \ - filter = _mm_unpacklo_epi8(filter, filter); \ - filter = _mm_srai_epi16(filter, 9); /* round */ \ - filter = _mm_packs_epi16(filter, filter); \ - filter = _mm_andnot_si128(hev, filter); \ - \ - hev = _mm_unpackhi_epi64(filter2filter1, filter); \ - filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); \ - \ - /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \ - qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1); \ - /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \ - ps1ps0 = _mm_adds_epi8(ps1ps0, hev); \ - qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */ \ - ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \ -} while (0) - -void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, - const uint8_t *_blimit, const uint8_t *_limit, - const uint8_t *_thresh) { - const __m128i zero = _mm_set1_epi16(0); - const __m128i limit = - _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), - _mm_loadl_epi64((const __m128i *)_limit)); - const __m128i thresh = - _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); - const __m128i ff = _mm_cmpeq_epi8(zero, zero); - __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0; - __m128i mask, hev; - - p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), - _mm_loadl_epi64((__m128i *)(s - 4 * p))); - q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 1 * p))); - q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), - _mm_loadl_epi64((__m128i *)(s + 0 * p))); - q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 3 * p))); - p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); - p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); - q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); - q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2); - - FILTER_HEV_MASK; - FILTER4; - - _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1 - _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0 - _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0 - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1 -} - -void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, - const uint8_t *_blimit, const uint8_t *_limit, - const uint8_t *_thresh) { - const __m128i zero = 
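[note] FILTER4 carries two rows per register (p1p0 in one half, q1q0 in the other); per pixel it applies the standard 4-tap adjustment its comments describe. A scalar sketch under that reading, using the signed_char_clamp helper the comments reference:

static int8_t signed_char_clamp(int t) {
  return (int8_t)(t < -128 ? -128 : t > 127 ? 127 : t);
}
/* op1/op0/oq0/oq1 are the two pixels on each side of the edge. */
static void filter4(int8_t mask, int8_t hev, uint8_t *op1, uint8_t *op0,
                    uint8_t *oq0, uint8_t *oq1) {
  const int8_t ps1 = (int8_t)(*op1 ^ 0x80), ps0 = (int8_t)(*op0 ^ 0x80);
  const int8_t qs0 = (int8_t)(*oq0 ^ 0x80), qs1 = (int8_t)(*oq1 ^ 0x80);
  int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
  int8_t filter1, filter2;
  filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
  filter1 = signed_char_clamp(filter + 4) >> 3;  /* rounds one side up */
  filter2 = signed_char_clamp(filter + 3) >> 3;
  *oq0 = (uint8_t)(signed_char_clamp(qs0 - filter1) ^ 0x80);
  *op0 = (uint8_t)(signed_char_clamp(ps0 + filter2) ^ 0x80);
  filter = (int8_t)(((filter1 + 1) >> 1) & ~hev);  /* outer taps, no hev */
  *oq1 = (uint8_t)(signed_char_clamp(qs1 - filter) ^ 0x80);
  *op1 = (uint8_t)(signed_char_clamp(ps1 + filter) ^ 0x80);
}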
_mm_set1_epi16(0); - const __m128i limit = - _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), - _mm_loadl_epi64((const __m128i *)_limit)); - const __m128i thresh = - _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); - const __m128i ff = _mm_cmpeq_epi8(zero, zero); - __m128i x0, x1, x2, x3; - __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0; - __m128i mask, hev; - - // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 1 * p - 4))); - - // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 3 * p - 4))); - - // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 5 * p - 4))); - - // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 7 * p - 4))); - - // Transpose 8x8 - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - p1p0 = _mm_unpacklo_epi16(q1q0, x1); - // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - x0 = _mm_unpacklo_epi16(x2, x3); - // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - p3p2 = _mm_unpacklo_epi32(p1p0, x0); - // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - p1p0 = _mm_unpackhi_epi32(p1p0, x0); - p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8)); // swap lo and high - p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8)); // swap lo and high - - // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - q1q0 = _mm_unpackhi_epi16(q1q0, x1); - // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 - x2 = _mm_unpackhi_epi16(x2, x3); - // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 - q3q2 = _mm_unpackhi_epi32(q1q0, x2); - // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 - q1q0 = _mm_unpacklo_epi32(q1q0, x2); - - q0p0 = _mm_unpacklo_epi64(p1p0, q1q0); - q1p1 = _mm_unpackhi_epi64(p1p0, q1q0); - p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); - p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); - q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2); - - FILTER_HEV_MASK; - FILTER4; - - // Transpose 8x4 to 4x8 - // qs1qs0: 20 21 22 23 24 25 26 27 30 31 32 33 34 34 36 37 - // ps1ps0: 10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07 - // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 - ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8)); - // 10 30 11 31 12 32 13 33 14 34 15 35 16 36 17 37 - x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0); - // 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27 - ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0); - // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0); - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0); - - *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0); - ps1ps0 = _mm_srli_si128(ps1ps0, 4); - *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0); - ps1ps0 = _mm_srli_si128(ps1ps0, 4); - *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0); - ps1ps0 = _mm_srli_si128(ps1ps0, 4); - *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0); - - *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0); - qs1qs0 = _mm_srli_si128(qs1qs0, 4); - *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0); - qs1qs0 = _mm_srli_si128(qs1qs0, 4); - *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0); - qs1qs0 = _mm_srli_si128(qs1qs0, 4); - 
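[note] The vertical variants transpose with interleaves alone, since SSE2 offers no byte-level gather. The core step, shown here for a single 4x4 byte tile (illustrative fragment, not from the source):

/* Two interleave stages turn four rows (4 bytes each, in the low lanes)
   into columns: each 32-bit lane of the result is one output column. */
static __m128i transpose_4x4_epi8(__m128i r0, __m128i r1, __m128i r2,
                                  __m128i r3) {
  const __m128i t0 = _mm_unpacklo_epi8(r0, r1);  /* a0 b0 a1 b1 ... */
  const __m128i t1 = _mm_unpacklo_epi8(r2, r3);  /* c0 d0 c1 d1 ... */
  return _mm_unpacklo_epi16(t0, t1);             /* a0 b0 c0 d0 | ... */
}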
*(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0); -} - -void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi8(1); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); - __m128i mask, hev, flat, flat2; - __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; - __m128i abs_p1p0; - - q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); - q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4), - (__m64 *)(s + 4 * p))); - q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); - q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3), - (__m64 *)(s + 3 * p))); - q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); - q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2), - (__m64 *)(s + 2 * p))); - q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); - q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1), - (__m64 *)(s + 1 * p))); - p1q1 = _mm_shuffle_epi32(q1p1, 78); - q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); - q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0), - (__m64 *)(s - 0 * p))); - p0q0 = _mm_shuffle_epi32(q0p0, 78); - - { - __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; - abs_p1p0 = abs_diff(q1p1, q0p0); - abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - fe = _mm_set1_epi8(0xfe); - ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - abs_p0q0 = abs_diff(q0p0, p0q0); - abs_p1q1 = abs_diff(q1p1, p1q1); - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(abs_p1p0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - - work = _mm_max_epu8(abs_diff(q2p2, q1p1), - abs_diff(q3p3, q2p2)); - mask = _mm_max_epu8(work, mask); - mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i t1 = _mm_set1_epi16(0x1); - __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); - __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); - __m128i qs0 = _mm_xor_si128(p0q0, t80); - __m128i qs1 = _mm_xor_si128(p1q1, t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; - __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; - - filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, qs0ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (vpx_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - filter1 = _mm_unpacklo_epi8(zero, filter1); - filter1 = _mm_srai_epi16(filter1, 0xB); - filter2 = _mm_unpacklo_epi8(zero, filter2); - filter2 = 
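[note] Both the hev test and the blimit test here synthesize an unsigned byte greater-than, which SSE2 lacks: _mm_subs_epu8(a, b) is zero exactly when a <= b, and the cmpeq/xor pair inverts that result. As a hypothetical helper (the name cmpgt_epu8 is illustrative):

static __m128i cmpgt_epu8(__m128i a, __m128i b) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i ones = _mm_cmpeq_epi8(zero, zero);  /* all 0xff */
  /* subs_epu8 saturates at 0, so (a - b) == 0  <=>  a <= b */
  return _mm_xor_si128(_mm_cmpeq_epi8(_mm_subs_epu8(a, b), zero), ones);
}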
_mm_srai_epi16(filter2, 0xB); - - // Filter1 >> 3 - filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); - qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); - - // filt >> 1 - filt = _mm_adds_epi16(filter1, t1); - filt = _mm_srai_epi16(filt, 1); - filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), - filt); - filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); - qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); - // loopfilter done - - { - __m128i work; - flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); - flat = _mm_max_epu8(abs_p1p0, flat); - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - - q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); - q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5), - (__m64 *)(s + 5 * p))); - - q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); - q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6), - (__m64 *)(s + 6 * p))); - flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0)); - - q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); - q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7), - (__m64 *)(s + 7 * p))); - work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0)); - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - { - const __m128i eight = _mm_set1_epi16(8); - const __m128i four = _mm_set1_epi16(4); - __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; - __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; - __m128i pixelFilter_p, pixelFilter_q; - __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; - __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; - - p7_16 = _mm_unpacklo_epi8(q7p7, zero);; - p6_16 = _mm_unpacklo_epi8(q6p6, zero); - p5_16 = _mm_unpacklo_epi8(q5p5, zero); - p4_16 = _mm_unpacklo_epi8(q4p4, zero); - p3_16 = _mm_unpacklo_epi8(q3p3, zero); - p2_16 = _mm_unpacklo_epi8(q2p2, zero); - p1_16 = _mm_unpacklo_epi8(q1p1, zero); - p0_16 = _mm_unpacklo_epi8(q0p0, zero); - q0_16 = _mm_unpackhi_epi8(q0p0, zero); - q1_16 = _mm_unpackhi_epi8(q1p1, zero); - q2_16 = _mm_unpackhi_epi8(q2p2, zero); - q3_16 = _mm_unpackhi_epi8(q3p3, zero); - q4_16 = _mm_unpackhi_epi8(q4p4, zero); - q5_16 = _mm_unpackhi_epi8(q5p5, zero); - q6_16 = _mm_unpackhi_epi8(q6p6, zero); - q7_16 = _mm_unpackhi_epi8(q7p7, zero); - - pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), - _mm_add_epi16(p4_16, p3_16)); - pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), - _mm_add_epi16(q4_16, q3_16)); - - pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); - pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); - pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, - pixelFilter_q)); - pixetFilter_p2p1p0 = _mm_add_epi16(four, - _mm_add_epi16(pixetFilter_p2p1p0, - pixetFilter_q2q1q0)); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(p7_16, p0_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(q7_16, q0_16)), 4); - flat2_q0p0 
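[note] This path shifts bytes differently from the mask-based variant: _mm_unpacklo_epi8(zero, x) places each byte in the high half of a 16-bit lane (value x << 8), so an arithmetic shift by 11 (0xB) is exactly x >> 3 with the sign preserved, and _mm_packs_epi16 saturates back to bytes. Packing filter2 together with -filter1 then lets a single _mm_adds_epi8 add filter2 on the p half and subtract filter1 on the q half. Scalar view of the shift (fragment):

/* (x << 8) >> 11 == x >> 3: the 16-bit arithmetic shift supplies the
   sign bits an 8-bit logical shift would drop. */
int16_t wide    = (int16_t)((uint16_t)x << 8);
int8_t  shifted = (int8_t)(wide >> 11);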
= _mm_packus_epi16(res_p, res_q); - res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(p3_16, p0_16)), 3); - res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(q3_16, q0_16)), 3); - - flat_q0p0 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(p7_16, p7_16); - sum_q7 = _mm_add_epi16(q7_16, q7_16); - sum_p3 = _mm_add_epi16(p3_16, p3_16); - sum_q3 = _mm_add_epi16(q3_16, q3_16); - - pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p1_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q1_16)), 4); - flat2_q1p1 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(sum_p3, p1_16)), 3); - res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, - _mm_add_epi16(sum_q3, q1_16)), 3); - flat_q1p1 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - sum_p3 = _mm_add_epi16(sum_p3, p3_16); - sum_q3 = _mm_add_epi16(sum_q3, q3_16); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p2_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q2_16)), 4); - flat2_q2p2 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); - - res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(sum_p3, p2_16)), 3); - res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, - _mm_add_epi16(sum_q3, q2_16)), 3); - flat_q2p2 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p3_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q3_16)), 4); - flat2_q3p3 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p4_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q4_16)), 4); - flat2_q4p4 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p5_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q5_16)), 4); - flat2_q5p5 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - 
_mm_add_epi16(sum_p7, p6_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q6_16)), 4); - flat2_q6p6 = _mm_packus_epi16(res_p, res_q); - } - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - flat = _mm_shuffle_epi32(flat, 68); - flat2 = _mm_shuffle_epi32(flat2, 68); - - q2p2 = _mm_andnot_si128(flat, q2p2); - flat_q2p2 = _mm_and_si128(flat, flat_q2p2); - q2p2 = _mm_or_si128(q2p2, flat_q2p2); - - qs1ps1 = _mm_andnot_si128(flat, qs1ps1); - flat_q1p1 = _mm_and_si128(flat, flat_q1p1); - q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); - - qs0ps0 = _mm_andnot_si128(flat, qs0ps0); - flat_q0p0 = _mm_and_si128(flat, flat_q0p0); - q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); - - q6p6 = _mm_andnot_si128(flat2, q6p6); - flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); - q6p6 = _mm_or_si128(q6p6, flat2_q6p6); - _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); - _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); - - q5p5 = _mm_andnot_si128(flat2, q5p5); - flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); - q5p5 = _mm_or_si128(q5p5, flat2_q5p5); - _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); - _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); - - q4p4 = _mm_andnot_si128(flat2, q4p4); - flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); - q4p4 = _mm_or_si128(q4p4, flat2_q4p4); - _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); - _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); - - q3p3 = _mm_andnot_si128(flat2, q3p3); - flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); - q3p3 = _mm_or_si128(q3p3, flat2_q3p3); - _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); - _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); - - q2p2 = _mm_andnot_si128(flat2, q2p2); - flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); - q2p2 = _mm_or_si128(q2p2, flat2_q2p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); - _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); - - q1p1 = _mm_andnot_si128(flat2, q1p1); - flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); - q1p1 = _mm_or_si128(q1p1, flat2_q1p1); - _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); - - q0p0 = _mm_andnot_si128(flat2, q0p0); - flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); - q0p0 = _mm_or_si128(q0p0, flat2_q0p0); - _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); - _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); - } -} - -static INLINE __m128i filter_add2_sub2(const __m128i *const total, - const __m128i *const a1, - const __m128i *const a2, - const __m128i *const s1, - const __m128i *const s2) { - __m128i x = _mm_add_epi16(*a1, *total); - x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2); - return x; -} - -static INLINE __m128i filter8_mask(const __m128i *const flat, - const __m128i *const other_filt, - const __m128i *const f8_lo, - const __m128i *const f8_hi) { - const __m128i f8 = _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), - _mm_srli_epi16(*f8_hi, 3)); - const __m128i result = _mm_and_si128(*flat, f8); - return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); -} - -static INLINE __m128i filter16_mask(const __m128i *const flat, - const __m128i *const other_filt, - const __m128i *const f_lo, - const __m128i *const f_hi) { - const __m128i f = _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), - _mm_srli_epi16(*f_hi, 4)); - const __m128i result = _mm_and_si128(*flat, f); - return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); -} - -void vpx_lpf_horizontal_edge_16_sse2(unsigned 
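[note] The three helpers defined above turn the wide filter into a constant-cost slide: filter_add2_sub2 moves the tap window by adding the incoming pair and subtracting the outgoing pair, and filter8_mask / filter16_mask finish the average (>>3 or >>4; the rounding constant is already folded into the sum) and blend it in only where the flat mask is set. Per pixel, treating the mask as a boolean, filter16_mask reduces to:

/* f carries the 15-tap sum plus the rounding constant 8 */
uint8_t out = flat2 ? (uint8_t)(f >> 4) : other_filt;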
char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi8(1); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); - __m128i mask, hev, flat, flat2; - __m128i p7, p6, p5; - __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; - __m128i q5, q6, q7; - - __m128i op2, op1, op0, oq0, oq1, oq2; - - __m128i max_abs_p1p0q1q0; - - p7 = _mm_loadu_si128((__m128i *)(s - 8 * p)); - p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); - p5 = _mm_loadu_si128((__m128i *)(s - 6 * p)); - p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); - q5 = _mm_loadu_si128((__m128i *)(s + 5 * p)); - q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); - q7 = _mm_loadu_si128((__m128i *)(s + 7 * p)); - - { - const __m128i abs_p1p0 = abs_diff(p1, p0); - const __m128i abs_q1q0 = abs_diff(q1, q0); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(zero, zero); - __m128i abs_p0q0 = abs_diff(p0, q0); - __m128i abs_p1q1 = abs_diff(p1, q1); - __m128i work; - max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0); - - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(max_abs_p1p0q1q0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2)); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2)); - mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - - { - __m128i work; - work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0)); - flat = _mm_max_epu8(work, max_abs_p1p0q1q0); - work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0)); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0)); - flat2 = _mm_max_epu8(work, flat2); - work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0)); - flat2 = _mm_max_epu8(work, flat2); - work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0)); - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // filter4 - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i 
t7f = _mm_set1_epi8(0x7f); - const __m128i ff = _mm_cmpeq_epi8(t4, t4); - - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - - op1 = _mm_xor_si128(p1, t80); - op0 = _mm_xor_si128(p0, t80); - oq0 = _mm_xor_si128(q0, t80); - oq1 = _mm_xor_si128(q1, t80); - - hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev); - - work_a = _mm_subs_epi8(oq0, op0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (vpx_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - // Filter1 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80); - - // Filter2 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80); - - // filt >> 1 - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - filt = _mm_andnot_si128(hev, filt); - op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80); - oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80); - // loopfilter done - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // filter8 - { - const __m128i four = _mm_set1_epi16(4); - const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); - const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); - const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); - const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); - const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); - const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); - const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); - const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); - - const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); - const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); - const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); - const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); - const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); - const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); - const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); - const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); - __m128i f8_lo, f8_hi; - - f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four), - _mm_add_epi16(p3_lo, p2_lo)); - f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo), - _mm_add_epi16(p2_lo, p1_lo)); - f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo); - - f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four), - _mm_add_epi16(p3_hi, p2_hi)); - f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi), - _mm_add_epi16(p2_hi, p1_hi)); - f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi); - - op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi); - op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi); - op0 = 
filter8_mask(&flat, &op0, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi); - oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi); - oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi); - oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi); - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // wide flat calculations - { - const __m128i eight = _mm_set1_epi16(8); - const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero); - const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero); - const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero); - const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero); - const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); - const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); - const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); - const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); - const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); - const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); - const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); - const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); - const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero); - const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero); - const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero); - const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero); - - const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero); - const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero); - const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero); - const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero); - const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); - const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); - const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); - const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); - const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); - const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); - const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); - const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); - const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero); - const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero); - const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero); - const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero); - - __m128i f_lo; - __m128i f_hi; - - f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7 - f_lo = _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), - _mm_add_epi16(p4_lo, f_lo)); - f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo), - _mm_add_epi16(p2_lo, p1_lo)); - f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo); - f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo); - - f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7 - f_hi = _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), - _mm_add_epi16(p4_hi, f_hi)); - f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi), - _mm_add_epi16(p2_hi, p1_hi)); - f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi); - f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi); - - p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 7 * p), p6); - - f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi); - p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 6 * p), p5); - - f_lo = filter_add2_sub2(&f_lo, &q2_lo, 
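[note] The wide-filter seed sum above is built without a multiply: (p7 << 3) - p7 is 7 * p7, and after the adds the register holds exactly the first output's window, 7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0, plus the rounding 8. The shift-subtract identity in scalar form (illustrative helper name):

/* Multiply by 7 with a shift and a subtract: 7 * x == (x << 3) - x. */
static int16_t times7(int16_t x) { return (int16_t)((x << 3) - x); }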
&p4_lo, &p5_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi); - p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 5 * p), p4); - - f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi); - p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 4 * p), p3); - - f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi); - op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 3 * p), op2); - - f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi); - op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 2 * p), op1); - - f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi); - op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 1 * p), op0); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi); - oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi); - oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi); - oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 2 * p), oq2); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi); - q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 3 * p), q3); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi); - q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 4 * p), q4); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi); - q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 5 * p), q5); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi); - q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 6 * p), q6); - } - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - } -} - -void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); - __m128i mask, 
hev, flat; - __m128i p3, p2, p1, p0, q0, q1, q2, q3; - __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0; - - q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), - _mm_loadl_epi64((__m128i *)(s + 3 * p))); - q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), - _mm_loadl_epi64((__m128i *)(s + 2 * p))); - q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 1 * p))); - q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), - _mm_loadl_epi64((__m128i *)(s - 0 * p))); - p1q1 = _mm_shuffle_epi32(q1p1, 78); - p0q0 = _mm_shuffle_epi32(q0p0, 78); - - { - // filter_mask and hev_mask - const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(fe, fe); - __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; - abs_p1p0 = abs_diff(q1p1, q0p0); - abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - - abs_p0q0 = abs_diff(q0p0, p0q0); - abs_p1q1 = abs_diff(q1p1, p1q1); - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(abs_p1p0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - - work = _mm_max_epu8(abs_diff(q2p2, q1p1), - abs_diff(q3p3, q2p2)); - mask = _mm_max_epu8(work, mask); - mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - - // flat_mask4 - - flat = _mm_max_epu8(abs_diff(q2p2, q0p0), - abs_diff(q3p3, q0p0)); - flat = _mm_max_epu8(abs_p1p0, flat); - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - } - - { - const __m128i four = _mm_set1_epi16(4); - unsigned char *src = s; - { - __m128i workp_a, workp_b, workp_shft; - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); - - workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); - workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); - workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op2[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op1[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - 
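[note] The workp_a/workp_b sequence here computes the six flat (filter8) outputs incrementally, updating a 7-tap sum instead of rebuilding it for each tap. The first two outputs written out, with ROUND_POWER_OF_TWO as defined in the earlier note (names mirror the stack buffers):

flat_op2 = ROUND_POWER_OF_TWO(3 * p3 + 2 * p2 + p1 + p0 + q0, 3);
flat_op1 = ROUND_POWER_OF_TWO(2 * p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);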
_mm_storel_epi64((__m128i *)&flat_op0[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq0[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq1[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq2[0], - _mm_packus_epi16(workp_shft, workp_shft)); - } - } - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), - t80); - const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), - t80); - const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), - t80); - const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), - t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (vpx_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - // Filter1 >> 3 - filter1 = _mm_unpacklo_epi8(zero, filter1); - filter1 = _mm_srai_epi16(filter1, 11); - filter1 = _mm_packs_epi16(filter1, filter1); - - // Filter2 >> 3 - filter2 = _mm_unpacklo_epi8(zero, filter2); - filter2 = _mm_srai_epi16(filter2, 11); - filter2 = _mm_packs_epi16(filter2, zero); - - // filt >> 1 - filt = _mm_adds_epi8(filter1, t1); - filt = _mm_unpacklo_epi8(zero, filt); - filt = _mm_srai_epi16(filt, 9); - filt = _mm_packs_epi16(filt, zero); - - filt = _mm_andnot_si128(hev, filt); - - work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - q0 = _mm_loadl_epi64((__m128i *)flat_oq0); - work_a = _mm_andnot_si128(flat, work_a); - q0 = _mm_and_si128(flat, q0); - q0 = _mm_or_si128(work_a, q0); - - work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - q1 = _mm_loadl_epi64((__m128i *)flat_oq1); - work_a = _mm_andnot_si128(flat, work_a); - q1 = _mm_and_si128(flat, q1); - q1 = _mm_or_si128(work_a, q1); - - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q2 = _mm_loadl_epi64((__m128i *)flat_oq2); - work_a = _mm_andnot_si128(flat, work_a); - q2 = _mm_and_si128(flat, q2); - q2 = _mm_or_si128(work_a, q2); - - work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - p0 = _mm_loadl_epi64((__m128i *)flat_op0); - work_a = _mm_andnot_si128(flat, work_a); - p0 = _mm_and_si128(flat, p0); - p0 = _mm_or_si128(work_a, p0); - - work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - p1 = _mm_loadl_epi64((__m128i *)flat_op1); - work_a = _mm_andnot_si128(flat, work_a); - p1 = _mm_and_si128(flat, p1); - p1 = _mm_or_si128(work_a, p1); - - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p2 = _mm_loadl_epi64((__m128i *)flat_op2); - work_a = 
_mm_andnot_si128(flat, work_a); - p2 = _mm_and_si128(flat, p2); - p2 = _mm_or_si128(work_a, p2); - - _mm_storel_epi64((__m128i *)(s - 3 * p), p2); - _mm_storel_epi64((__m128i *)(s - 2 * p), p1); - _mm_storel_epi64((__m128i *)(s - 1 * p), p0); - _mm_storel_epi64((__m128i *)(s + 0 * p), q0); - _mm_storel_epi64((__m128i *)(s + 1 * p), q1); - _mm_storel_epi64((__m128i *)(s + 2 * p), q2); - } -} - -void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, - const uint8_t *_blimit0, - const uint8_t *_limit0, - const uint8_t *_thresh0, - const uint8_t *_blimit1, - const uint8_t *_limit1, - const uint8_t *_thresh1) { - DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); - const __m128i blimit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), - _mm_load_si128((const __m128i *)_blimit1)); - const __m128i limit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), - _mm_load_si128((const __m128i *)_limit1)); - const __m128i thresh = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), - _mm_load_si128((const __m128i *)_thresh1)); - - __m128i mask, hev, flat; - __m128i p3, p2, p1, p0, q0, q1, q2, q3; - - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - { - const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), - _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), - _mm_subs_epu8(q0, q1)); - const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), - _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), - _mm_subs_epu8(q1, p1)); - __m128i work; - - // filter_mask and hev_mask - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(flat, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), - _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), - _mm_subs_epu8(p2, p3))); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), - _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), - _mm_subs_epu8(q2, q3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - - // flat_mask4 - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), - _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), - _mm_subs_epu8(q0, q2))); - flat = _mm_max_epu8(work, 
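[note] The _dual variants filter two adjacent 8-pixel edges in one 16-lane pass: each threshold register is spliced from two per-edge values with _mm_unpacklo_epi64, so the low eight bytes apply the first edge's limits and the high eight the second's. Sketch of the splice, as in the function above:

/* low 8 lanes take limits from edge 0, high 8 lanes from edge 1 */
const __m128i blimit =
    _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
                       _mm_load_si128((const __m128i *)_blimit1));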
flat); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), - _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), - _mm_subs_epu8(q0, q3))); - flat = _mm_max_epu8(work, flat); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - } - { - const __m128i four = _mm_set1_epi16(4); - unsigned char *src = s; - int i = 0; - - do { - __m128i workp_a, workp_b, workp_shft; - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); - - workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); - workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); - workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op2[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op1[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op0[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq0[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq1[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq2[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - src += 8; - } while (++i < 2); - } - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i t7f = _mm_set1_epi8(0x7f); - - const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), - t80); - const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), - t80); - const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), - t80); - const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), - t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = 
_mm_subs_epi8(qs0, ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (vpx_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - // Filter1 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - - // Filter2 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - - // filt >> 1 - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - - filt = _mm_andnot_si128(hev, filt); - - work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - q0 = _mm_load_si128((__m128i *)flat_oq0); - work_a = _mm_andnot_si128(flat, work_a); - q0 = _mm_and_si128(flat, q0); - q0 = _mm_or_si128(work_a, q0); - - work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - q1 = _mm_load_si128((__m128i *)flat_oq1); - work_a = _mm_andnot_si128(flat, work_a); - q1 = _mm_and_si128(flat, q1); - q1 = _mm_or_si128(work_a, q1); - - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q2 = _mm_load_si128((__m128i *)flat_oq2); - work_a = _mm_andnot_si128(flat, work_a); - q2 = _mm_and_si128(flat, q2); - q2 = _mm_or_si128(work_a, q2); - - work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - p0 = _mm_load_si128((__m128i *)flat_op0); - work_a = _mm_andnot_si128(flat, work_a); - p0 = _mm_and_si128(flat, p0); - p0 = _mm_or_si128(work_a, p0); - - work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - p1 = _mm_load_si128((__m128i *)flat_op1); - work_a = _mm_andnot_si128(flat, work_a); - p1 = _mm_and_si128(flat, p1); - p1 = _mm_or_si128(work_a, p1); - - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p2 = _mm_load_si128((__m128i *)flat_op2); - work_a = _mm_andnot_si128(flat, work_a); - p2 = _mm_and_si128(flat, p2); - p2 = _mm_or_si128(work_a, p2); - - _mm_storeu_si128((__m128i *)(s - 3 * p), p2); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - _mm_storeu_si128((__m128i *)(s + 2 * p), q2); - } -} - -void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, - const unsigned char *_blimit0, - const unsigned char *_limit0, - const unsigned char *_thresh0, - const unsigned char *_blimit1, - const unsigned char *_limit1, - const unsigned char *_thresh1) { - const __m128i blimit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), - _mm_load_si128((const __m128i *)_blimit1)); - const __m128i limit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), - _mm_load_si128((const __m128i *)_limit1)); - const __m128i thresh = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), - _mm_load_si128((const __m128i *)_thresh1)); - const __m128i zero = _mm_set1_epi16(0); - __m128i p3, p2, p1, p0, q0, q1, q2, q3; - __m128i mask, hev, flat; - - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 
= _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - - // filter_mask and hev_mask - { - const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), - _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), - _mm_subs_epu8(q0, q1)); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), - _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), - _mm_subs_epu8(q1, p1)); - __m128i work; - - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(flat, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), - _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), - _mm_subs_epu8(p2, p3))); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), - _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), - _mm_subs_epu8(q2, q3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - - // filter4 - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i t7f = _mm_set1_epi8(0x7f); - - const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), - t80); - const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), - t80); - const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), - t80); - const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), - t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (vpx_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - // Filter1 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - - // Filter2 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - - // filt >> 1 - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - - filt = _mm_andnot_si128(hev, filt); - - q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - q1 = 
_mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - } -} - -static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, - int in_p, unsigned char *out, int out_p) { - __m128i x0, x1, x2, x3, x4, x5, x6, x7; - __m128i x8, x9, x10, x11, x12, x13, x14, x15; - - // 2-way interleave w/hoisting of unpacks - x0 = _mm_loadl_epi64((__m128i *)in0); // 1 - x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); // 3 - x0 = _mm_unpacklo_epi8(x0, x1); // 1 - - x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); // 5 - x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p)); // 7 - x1 = _mm_unpacklo_epi8(x2, x3); // 2 - - x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p)); // 9 - x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p)); // 11 - x2 = _mm_unpacklo_epi8(x4, x5); // 3 - - x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p)); // 13 - x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p)); // 15 - x3 = _mm_unpacklo_epi8(x6, x7); // 4 - x4 = _mm_unpacklo_epi16(x0, x1); // 9 - - x8 = _mm_loadl_epi64((__m128i *)in1); // 2 - x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); // 4 - x8 = _mm_unpacklo_epi8(x8, x9); // 5 - x5 = _mm_unpacklo_epi16(x2, x3); // 10 - - x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); // 6 - x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p)); // 8 - x9 = _mm_unpacklo_epi8(x10, x11); // 6 - - x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p)); // 10 - x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p)); // 12 - x10 = _mm_unpacklo_epi8(x12, x13); // 7 - x12 = _mm_unpacklo_epi16(x8, x9); // 11 - - x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p)); // 14 - x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p)); // 16 - x11 = _mm_unpacklo_epi8(x14, x15); // 8 - x13 = _mm_unpacklo_epi16(x10, x11); // 12 - - x6 = _mm_unpacklo_epi32(x4, x5); // 13 - x7 = _mm_unpackhi_epi32(x4, x5); // 14 - x14 = _mm_unpacklo_epi32(x12, x13); // 15 - x15 = _mm_unpackhi_epi32(x12, x13); // 16 - - // Store first 4-line result - _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14)); - _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14)); - _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15)); - _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15)); - - x4 = _mm_unpackhi_epi16(x0, x1); - x5 = _mm_unpackhi_epi16(x2, x3); - x12 = _mm_unpackhi_epi16(x8, x9); - x13 = _mm_unpackhi_epi16(x10, x11); - - x6 = _mm_unpacklo_epi32(x4, x5); - x7 = _mm_unpackhi_epi32(x4, x5); - x14 = _mm_unpacklo_epi32(x12, x13); - x15 = _mm_unpackhi_epi32(x12, x13); - - // Store second 4-line result - _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14)); - _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14)); - _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15)); - _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15)); -} - -static INLINE void transpose(unsigned char *src[], int in_p, - unsigned char *dst[], int out_p, - int num_8x8_to_transpose) { - int idx8x8 = 0; - __m128i x0, x1, x2, x3, x4, x5, x6, x7; - do { - unsigned char *in = src[idx8x8]; - unsigned char *out = dst[idx8x8]; - - x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p)); // 00 01 02 03 04 05 06 07 - x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p)); // 10 
11 12 13 14 15 16 17 - // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - x0 = _mm_unpacklo_epi8(x0, x1); - - x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27 - x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37 - // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - x1 = _mm_unpacklo_epi8(x2, x3); - - x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47 - x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57 - // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - x2 = _mm_unpacklo_epi8(x4, x5); - - x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67 - x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77 - // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - x3 = _mm_unpacklo_epi8(x6, x7); - - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - x4 = _mm_unpacklo_epi16(x0, x1); - // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - x5 = _mm_unpacklo_epi16(x2, x3); - // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - x6 = _mm_unpacklo_epi32(x4, x5); - _mm_storel_pd((double *)(out + 0*out_p), - _mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70 - _mm_storeh_pd((double *)(out + 1*out_p), - _mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - x7 = _mm_unpackhi_epi32(x4, x5); - _mm_storel_pd((double *)(out + 2*out_p), - _mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72 - _mm_storeh_pd((double *)(out + 3*out_p), - _mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73 - - // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - x4 = _mm_unpackhi_epi16(x0, x1); - // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 - x5 = _mm_unpackhi_epi16(x2, x3); - // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 - x6 = _mm_unpacklo_epi32(x4, x5); - _mm_storel_pd((double *)(out + 4*out_p), - _mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74 - _mm_storeh_pd((double *)(out + 5*out_p), - _mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 - x7 = _mm_unpackhi_epi32(x4, x5); - - _mm_storel_pd((double *)(out + 6*out_p), - _mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76 - _mm_storeh_pd((double *)(out + 7*out_p), - _mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77 - } while (++idx8x8 < num_8x8_to_transpose); -} - -void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); - unsigned char *src[2]; - unsigned char *dst[2]; - - // Transpose 8x16 - transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); - - // Loop filtering - vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, - blimit1, limit1, thresh1); - src[0] = t_dst; - src[1] = t_dst + 8; - dst[0] = s - 4; - dst[1] = s - 4 + p * 8; - - // Transpose back - transpose(src, 16, dst, p, 2); -} - -void vpx_lpf_vertical_8_sse2(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh) { - DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]); - unsigned char *src[1]; - unsigned char *dst[1]; - - // Transpose 8x8 - src[0] = s - 4; - dst[0] = t_dst; - - transpose(src, p, dst, 8, 1); - - // Loop filtering - vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh); - - src[0] = t_dst; - dst[0] = s - 4; - - // Transpose back - transpose(src, 8, dst, p, 1); -} - -void 
vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); - unsigned char *src[2]; - unsigned char *dst[2]; - - // Transpose 8x16 - transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); - - // Loop filtering - vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, - blimit1, limit1, thresh1); - src[0] = t_dst; - src[1] = t_dst + 8; - - dst[0] = s - 4; - dst[1] = s - 4 + p * 8; - - // Transpose back - transpose(src, 16, dst, p, 2); -} - -void vpx_lpf_vertical_16_sse2(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh) { - DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]); - unsigned char *src[2]; - unsigned char *dst[2]; - - src[0] = s - 8; - src[1] = s; - dst[0] = t_dst; - dst[1] = t_dst + 8 * 8; - - // Transpose 16x8 - transpose(src, p, dst, 8, 2); - - // Loop filtering - vpx_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh); - - src[0] = t_dst; - src[1] = t_dst + 8 * 8; - dst[0] = s - 8; - dst[1] = s; - - // Transpose back - transpose(src, 8, dst, p, 2); -} - -void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh) { - DECLARE_ALIGNED(16, unsigned char, t_dst[256]); - - // Transpose 16x16 - transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); - transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); - - // Loop filtering - vpx_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh); - - // Transpose back - transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); - transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); -} diff --git a/thirdparty/libvpx/vpx_dsp/x86/txfm_common_sse2.h b/thirdparty/libvpx/vpx_dsp/x86/txfm_common_sse2.h deleted file mode 100644 index 536b206876..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/txfm_common_sse2.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VPX_DSP_X86_TXFM_COMMON_SSE2_H_ -#define VPX_DSP_X86_TXFM_COMMON_SSE2_H_ - -#include <emmintrin.h> -#include "vpx/vpx_integer.h" - -#define pair_set_epi16(a, b) \ - _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ - (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) - -#define dual_set_epi16(a, b) \ - _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \ - (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a)) - -#define octa_set_epi16(a, b, c, d, e, f, g, h) \ - _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \ - (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h)) - -#endif // VPX_DSP_X86_TXFM_COMMON_SSE2_H_ diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_asm_stubs.c b/thirdparty/libvpx/vpx_dsp/x86/vpx_asm_stubs.c deleted file mode 100644 index 422b0fc422..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/vpx_asm_stubs.c +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_config.h" -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/x86/convolve.h" - -#if HAVE_SSE2 -filter8_1dfunction vpx_filter_block1d16_v8_sse2; -filter8_1dfunction vpx_filter_block1d16_h8_sse2; -filter8_1dfunction vpx_filter_block1d8_v8_sse2; -filter8_1dfunction vpx_filter_block1d8_h8_sse2; -filter8_1dfunction vpx_filter_block1d4_v8_sse2; -filter8_1dfunction vpx_filter_block1d4_h8_sse2; -filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2; -filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2; - -filter8_1dfunction vpx_filter_block1d16_v2_sse2; -filter8_1dfunction vpx_filter_block1d16_h2_sse2; -filter8_1dfunction vpx_filter_block1d8_v2_sse2; -filter8_1dfunction vpx_filter_block1d8_h2_sse2; -filter8_1dfunction vpx_filter_block1d4_v2_sse2; -filter8_1dfunction vpx_filter_block1d4_h2_sse2; -filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2; -filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; - -// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); - -// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, sse2); -FUN_CONV_2D(avg_ , sse2); - -#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2; -highbd_filter8_1dfunction 
vpx_highbd_filter_block1d8_v8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; - -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; - -// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); -HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); -HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - sse2); - -// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h, int bd); -HIGH_FUN_CONV_2D(, sse2); -HIGH_FUN_CONV_2D(avg_ , sse2); -#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 -#endif // HAVE_SSE2 diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm deleted file mode 100644 index abc0270655..0000000000 --- 
a/thirdparty/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm +++ /dev/null @@ -1,228 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -%macro convolve_fn 1-2 -%ifidn %1, avg -%define AUX_XMM_REGS 4 -%else -%define AUX_XMM_REGS 0 -%endif -%ifidn %2, highbd -%define pavg pavgw -cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ - dst, dst_stride, \ - fx, fxs, fy, fys, w, h, bd -%else -%define pavg pavgb -cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ - dst, dst_stride, \ - fx, fxs, fy, fys, w, h -%endif - mov r4d, dword wm -%ifidn %2, highbd - shl r4d, 1 - shl srcq, 1 - shl src_strideq, 1 - shl dstq, 1 - shl dst_strideq, 1 -%else - cmp r4d, 4 - je .w4 -%endif - cmp r4d, 8 - je .w8 - cmp r4d, 16 - je .w16 - cmp r4d, 32 - je .w32 -%ifidn %2, highbd - cmp r4d, 64 - je .w64 - - mov r4d, dword hm -.loop128: - movu m0, [srcq] - movu m1, [srcq+16] - movu m2, [srcq+32] - movu m3, [srcq+48] -%ifidn %1, avg - pavg m0, [dstq] - pavg m1, [dstq+16] - pavg m2, [dstq+32] - pavg m3, [dstq+48] -%endif - mova [dstq ], m0 - mova [dstq+16], m1 - mova [dstq+32], m2 - mova [dstq+48], m3 - movu m0, [srcq+64] - movu m1, [srcq+80] - movu m2, [srcq+96] - movu m3, [srcq+112] - add srcq, src_strideq -%ifidn %1, avg - pavg m0, [dstq+64] - pavg m1, [dstq+80] - pavg m2, [dstq+96] - pavg m3, [dstq+112] -%endif - mova [dstq+64], m0 - mova [dstq+80], m1 - mova [dstq+96], m2 - mova [dstq+112], m3 - add dstq, dst_strideq - dec r4d - jnz .loop128 - RET -%endif - -.w64 - mov r4d, dword hm -.loop64: - movu m0, [srcq] - movu m1, [srcq+16] - movu m2, [srcq+32] - movu m3, [srcq+48] - add srcq, src_strideq -%ifidn %1, avg - pavg m0, [dstq] - pavg m1, [dstq+16] - pavg m2, [dstq+32] - pavg m3, [dstq+48] -%endif - mova [dstq ], m0 - mova [dstq+16], m1 - mova [dstq+32], m2 - mova [dstq+48], m3 - add dstq, dst_strideq - dec r4d - jnz .loop64 - RET - -.w32: - mov r4d, dword hm -.loop32: - movu m0, [srcq] - movu m1, [srcq+16] - movu m2, [srcq+src_strideq] - movu m3, [srcq+src_strideq+16] - lea srcq, [srcq+src_strideq*2] -%ifidn %1, avg - pavg m0, [dstq] - pavg m1, [dstq +16] - pavg m2, [dstq+dst_strideq] - pavg m3, [dstq+dst_strideq+16] -%endif - mova [dstq ], m0 - mova [dstq +16], m1 - mova [dstq+dst_strideq ], m2 - mova [dstq+dst_strideq+16], m3 - lea dstq, [dstq+dst_strideq*2] - sub r4d, 2 - jnz .loop32 - RET - -.w16: - mov r4d, dword hm - lea r5q, [src_strideq*3] - lea r6q, [dst_strideq*3] -.loop16: - movu m0, [srcq] - movu m1, [srcq+src_strideq] - movu m2, [srcq+src_strideq*2] - movu m3, [srcq+r5q] - lea srcq, [srcq+src_strideq*4] -%ifidn %1, avg - pavg m0, [dstq] - pavg m1, [dstq+dst_strideq] - pavg m2, [dstq+dst_strideq*2] - pavg m3, [dstq+r6q] -%endif - mova [dstq ], m0 - mova [dstq+dst_strideq ], m1 - mova [dstq+dst_strideq*2], m2 - mova [dstq+r6q ], m3 - lea dstq, [dstq+dst_strideq*4] - sub r4d, 4 - jnz .loop16 - RET - -.w8: - mov r4d, dword hm - lea r5q, [src_strideq*3] - lea r6q, [dst_strideq*3] -.loop8: - movh m0, [srcq] - movh m1, [srcq+src_strideq] - movh m2, [srcq+src_strideq*2] - movh m3, [srcq+r5q] - lea srcq, [srcq+src_strideq*4] -%ifidn %1, avg - movh m4, [dstq] - movh m5, 
[dstq+dst_strideq] - movh m6, [dstq+dst_strideq*2] - movh m7, [dstq+r6q] - pavg m0, m4 - pavg m1, m5 - pavg m2, m6 - pavg m3, m7 -%endif - movh [dstq ], m0 - movh [dstq+dst_strideq ], m1 - movh [dstq+dst_strideq*2], m2 - movh [dstq+r6q ], m3 - lea dstq, [dstq+dst_strideq*4] - sub r4d, 4 - jnz .loop8 - RET - -%ifnidn %2, highbd -.w4: - mov r4d, dword hm - lea r5q, [src_strideq*3] - lea r6q, [dst_strideq*3] -.loop4: - movd m0, [srcq] - movd m1, [srcq+src_strideq] - movd m2, [srcq+src_strideq*2] - movd m3, [srcq+r5q] - lea srcq, [srcq+src_strideq*4] -%ifidn %1, avg - movd m4, [dstq] - movd m5, [dstq+dst_strideq] - movd m6, [dstq+dst_strideq*2] - movd m7, [dstq+r6q] - pavg m0, m4 - pavg m1, m5 - pavg m2, m6 - pavg m3, m7 -%endif - movd [dstq ], m0 - movd [dstq+dst_strideq ], m1 - movd [dstq+dst_strideq*2], m2 - movd [dstq+r6q ], m3 - lea dstq, [dstq+dst_strideq*4] - sub r4d, 4 - jnz .loop4 - RET -%endif -%endmacro - -INIT_XMM sse2 -convolve_fn copy -convolve_fn avg -%if CONFIG_VP9_HIGHBITDEPTH -convolve_fn copy, highbd -convolve_fn avg, highbd -%endif diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c deleted file mode 100644 index d8a92354c9..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ /dev/null @@ -1,606 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -// Due to a header conflict between math.h and intrinsics includes with ceil() -// in certain configurations under vs9 this include needs to precede -// immintrin.h. - -#include <immintrin.h> - -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/x86/convolve.h" -#include "vpx_ports/mem.h" - -// filters for 16_h8 and 16_v8 -DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 -}; - -#if defined(__clang__) -// -- GODOT start - -# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \ - (!defined(__MACPORTS__) && defined(__APPLE__) && \ - ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ - (__clang_major__ == 5 && __clang_minor__ == 0))) -// -- GODOT end -- -# define MM256_BROADCASTSI128_SI256(x) \ - _mm_broadcastsi128_si256((__m128i const *)&(x)) -# else // clang > 3.3, and not 5.0 on macosx. 
-# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -# endif // clang <= 3.3 -#elif defined(__GNUC__) -# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) -# define MM256_BROADCASTSI128_SI256(x) \ - _mm_broadcastsi128_si256((__m128i const *)&(x)) -# elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 -# define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) -# else // gcc > 4.7 -# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -# endif // gcc <= 4.6 -#else // !(gcc || clang) -# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -#endif // __clang__ - -static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, - ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m256i firstFilters, secondFilters, thirdFilters, forthFilters; - __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; - __m256i srcReg32b1, srcReg32b2, filtersReg32; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x706u)); - - filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); - filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); - filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); - filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); - - // multiply the size of the source and destination stride by two - src_stride = src_pixels_per_line << 1; - dst_stride = output_pitch << 1; - for (i = output_height; i > 1; i-=2) { - // load the 2 strides of source - srcReg32b1 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr - 3))); - srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, - _mm_loadu_si128((const __m128i *) - (src_ptr+src_pixels_per_line-3)), 1); - - // filter the source buffer - srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); - srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); - - // filter the source buffer -
srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); - srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, - _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); - - // reading 2 strides of the next 16 bytes - // (part of it was being read by earlier read) - srcReg32b2 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + 5))); - srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, - _mm_loadu_si128((const __m128i *) - (src_ptr+src_pixels_per_line+5)), 1); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, - _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); - - // filter the source buffer - srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); - - // add and saturate the results together - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); - - // filter the source buffer - srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg); - srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - - // add and saturate the results together - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, - _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, - _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); - - - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); - - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); - srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, - srcRegFilt32b2_1); - - src_ptr+=src_stride; - - // save 16 bytes - _mm_store_si128((__m128i*)output_ptr, - _mm256_castsi256_si128(srcRegFilt32b1_1)); - - // save the next 16 bytes - _mm_store_si128((__m128i*)(output_ptr+output_pitch), - _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); - output_ptr+=dst_stride; - } - - // if the number of strides is odd.
- // process only 16 bytes - if (i > 0) { - __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; - __m128i srcRegFilt2, srcRegFilt3; - - srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); - - // filter the source buffer - srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt1Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt4Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, - _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); - - // filter the source buffer - srcRegFilt3= _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2= _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt3Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, - _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, - _mm_min_epi16(srcRegFilt3, srcRegFilt2)); - - // reading the next 16 bytes - // (part of it was being read by earlier read) - srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); - - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, - _mm_max_epi16(srcRegFilt3, srcRegFilt2)); - - // filter the source buffer - srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt1Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt4Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, - _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); - - // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt3Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, - _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, - _mm_min_epi16(srcRegFilt3, srcRegFilt2)); - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, - _mm_max_epi16(srcRegFilt3, srcRegFilt2)); - - - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, - _mm256_castsi256_si128(addFilterReg64)); - - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, - _mm256_castsi256_si128(addFilterReg64)); - - // shift by 7 bit each 16 bit - srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); - srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); - - // save 16 bytes - _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); - } -} - -static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, - ptrdiff_t src_pitch, - uint8_t 
*output_ptr, - ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m256i addFilterReg64; - __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; - __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; - __m256i srcReg32b11, srcReg32b12, filtersReg32; - __m256i firstFilters, secondFilters, thirdFilters, forthFilters; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the - // same data in both lanes of 128 bit register. - filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x706u)); - - // multiply the size of the source and destination stride by two - src_stride = src_pitch << 1; - dst_stride = out_pitch << 1; - - // load 16 bytes 7 times in stride of src_pitch - srcReg32b1 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr))); - srcReg32b2 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); - srcReg32b3 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); - srcReg32b4 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); - srcReg32b5 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); - srcReg32b6 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); - srcReg32b7 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); - - // have every two consecutive loads in the same 256 bit register - srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, - _mm256_castsi256_si128(srcReg32b2), 1); - srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, - _mm256_castsi256_si128(srcReg32b3), 1); - srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, - _mm256_castsi256_si128(srcReg32b4), 1); - srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, - _mm256_castsi256_si128(srcReg32b5), 1); - srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, - _mm256_castsi256_si128(srcReg32b6), 1); - srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, - _mm256_castsi256_si128(srcReg32b7), 1); - - // merge every two consecutive registers except the last one - srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); - srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); - - // save - srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); - - // save - srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); - - // save - srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); - - // save - srcReg32b5 =
_mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); - - - for (i = output_height; i > 1; i-=2) { - // load the last 2 loads of 16 bytes and have every two - // consecutive loads in the same 256 bit register - srcReg32b8 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); - srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, - _mm256_castsi256_si128(srcReg32b8), 1); - srcReg32b9 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); - srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, - _mm256_castsi256_si128(srcReg32b9), 1); - - // merge every two consecutive registers - // save - srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); - srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); - - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); - - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); - - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); - - // add and saturate the results together - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); - srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); - - src_ptr+=src_stride; - - // save 16 bytes - _mm_store_si128((__m128i*)output_ptr, - _mm256_castsi256_si128(srcReg32b1)); - - // save the next 16 bytes - _mm_store_si128((__m128i*)(output_ptr+out_pitch), - _mm256_extractf128_si256(srcReg32b1, 1)); - - output_ptr+=dst_stride; - - // save part of the registers for next strides - srcReg32b10 = srcReg32b11; - srcReg32b1 = srcReg32b3; - srcReg32b11 = srcReg32b2; - srcReg32b3 = srcReg32b5; - srcReg32b2 = srcReg32b4; - srcReg32b5 = srcReg32b7; - srcReg32b7 = srcReg32b9; - } - if (i > 0) { - __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; - __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; - // load the last 16 bytes - srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); - - // merge the last 2 results together - srcRegFilt4 = _mm_unpacklo_epi8( - _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); -
srcRegFilt7 = _mm_unpackhi_epi8( - _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, - _mm256_castsi256_si128(forthFilters)); - srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, - _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); - - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), - _mm256_castsi256_si128(secondFilters)); - srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), - _mm256_castsi256_si128(secondFilters)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), - _mm256_castsi256_si128(thirdFilters)); - srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), - _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, - _mm_min_epi16(srcRegFilt4, srcRegFilt6)); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, - _mm_min_epi16(srcRegFilt5, srcRegFilt7)); - - // add and saturate the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, - _mm_max_epi16(srcRegFilt4, srcRegFilt6)); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, - _mm_max_epi16(srcRegFilt5, srcRegFilt7)); - - - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, - _mm256_castsi256_si128(addFilterReg64)); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, - _mm256_castsi256_si128(addFilterReg64)); - - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); - - // save 16 bytes - _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); - } -} - -#if HAVE_AVX2 && HAVE_SSSE3 -filter8_1dfunction vpx_filter_block1d4_v8_ssse3; -#if ARCH_X86_64 -filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; -#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3 -#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3 -#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3 -#else // ARCH_X86 -filter8_1dfunction vpx_filter_block1d8_v8_ssse3; -filter8_1dfunction vpx_filter_block1d8_h8_ssse3; -filter8_1dfunction vpx_filter_block1d4_h8_ssse3; -#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3 -#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3 -#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3 -#endif // ARCH_X86_64 -filter8_1dfunction vpx_filter_block1d16_v2_ssse3; -filter8_1dfunction vpx_filter_block1d16_h2_ssse3; -filter8_1dfunction vpx_filter_block1d8_v2_ssse3; -filter8_1dfunction vpx_filter_block1d8_h2_ssse3; -filter8_1dfunction vpx_filter_block1d4_v2_ssse3; -filter8_1dfunction vpx_filter_block1d4_h2_ssse3; -#define 
vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3 -#define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3 -#define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3 -#define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3 -#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3 -#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3 -#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3 -// void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); - -// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, avx2); -#endif // HAVE_AVX2 && HAVE_SSSE3 diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c deleted file mode 100644 index 6fd52087c7..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ /dev/null @@ -1,915 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -// Due to a header conflict between math.h and intrinsics includes with ceil() -// in certain configurations under vs9 this include needs to precede -// tmmintrin.h. - -#include <tmmintrin.h> - -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/vpx_filter.h" -#include "vpx_dsp/x86/convolve.h" -#include "vpx_mem/vpx_mem.h" -#include "vpx_ports/mem.h" -#include "vpx_ports/emmintrin_compat.h" - -// filters only for the 4_h8 convolution -DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -// filters for 8_h8 and 16_h8 -DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 -}; - -// These are reused by the avx2 intrinsics.
-filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; - -void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, - ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i firstFilters, secondFilters, shuffle1, shuffle2; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - __m128i addFilterReg64, filtersReg, srcReg, minReg; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 =_mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits in the filter into the first lane - firstFilters = _mm_shufflelo_epi16(filtersReg, 0); - // duplicate only the third 16 bits in the filter into the first lane - secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); - // duplicate only the second 16 bits in the filter into the second lane - // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3 - firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); - // duplicate only the fourth 16 bits in the filter into the second lane - // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7 - secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); - - // loading the local filters - shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8); - shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); - - for (i = 0; i < output_height; i++) { - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); - - // filter the source buffer - srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1); - srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - - // extract the higher half of the lane - srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); - srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); - - minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); - - // add and saturate all the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bits - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - - // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - src_ptr+=src_pixels_per_line; - - // save only 4 bytes - *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); - - output_ptr+=output_pitch; - } -} - -void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, - ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; - __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - __m128i addFilterReg64, filtersReg, minReg; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64
= _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 128 bit register - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 128 bit register - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 128 bit register - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 128 bit register - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); - - filt1Reg = _mm_load_si128((__m128i const *)filt1_global); - filt2Reg = _mm_load_si128((__m128i const *)filt2_global); - filt3Reg = _mm_load_si128((__m128i const *)filt3_global); - filt4Reg = _mm_load_si128((__m128i const *)filt4_global); - - for (i = 0; i < output_height; i++) { - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); - - // filter the source buffer - srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); - srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - - // filter the source buffer - srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg); - srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); - srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); - - // add and saturate all the results together - minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - - srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bits - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - - // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - - src_ptr+=src_pixels_per_line; - - // save only 8 bytes - _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); - - output_ptr+=output_pitch; - } -} - -void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, - ptrdiff_t src_pitch, - uint8_t *output_ptr, - ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i addFilterReg64, filtersReg, minReg; - __m128i firstFilters, secondFilters, thirdFilters, forthFilters; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; - __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; - __m128i srcReg8; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. 
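  // (A note on why this is safe, not part of the deleted file: the
  //  signed-saturating pack below is lossless for these kernels, since
  //  every subpel tap fits in int8_t; the one 16-bit value that would
  //  not, a center tap of exactly 128, is excluded up front by the
  //  convolve wrappers.)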
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits in the filter - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits in the filter - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits in the filter - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits in the filter - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); - - // load the first 7 rows of 8 bytes - srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr); - srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); - srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); - srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); - - for (i = 0; i < output_height; i++) { - // load the last 8 bytes - srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); - - // merge the result together - srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); - srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); - - // merge the result together - srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); - srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); - srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); - - // add and saturate the results together - minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); - srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - - // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - - src_ptr+=src_pitch; - - // shift down a row - srcReg1 = srcReg2; - srcReg2 = srcReg3; - srcReg3 = srcReg4; - srcReg4 = srcReg5; - srcReg5 = srcReg6; - srcReg6 = srcReg7; - srcReg7 = srcReg8; - - // save only 8 bytes convolve result - _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); - - output_ptr+=out_pitch; - } -} - -filter8_1dfunction vpx_filter_block1d16_v8_ssse3; -filter8_1dfunction vpx_filter_block1d16_h8_ssse3; -filter8_1dfunction vpx_filter_block1d8_v8_ssse3; -filter8_1dfunction vpx_filter_block1d8_h8_ssse3; -filter8_1dfunction vpx_filter_block1d4_v8_ssse3; -filter8_1dfunction vpx_filter_block1d4_h8_ssse3; -filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3; -filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3; -filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; -filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; -filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3; -filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3; - -filter8_1dfunction vpx_filter_block1d16_v2_ssse3; -filter8_1dfunction vpx_filter_block1d16_h2_ssse3; -filter8_1dfunction vpx_filter_block1d8_v2_ssse3; -filter8_1dfunction vpx_filter_block1d8_h2_ssse3; 
-filter8_1dfunction vpx_filter_block1d4_v2_ssse3; -filter8_1dfunction vpx_filter_block1d4_h2_ssse3; -filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3; -filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3; -filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3; -filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3; -filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3; -filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; - -// void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - ssse3); - -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \ - const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \ - const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \ - const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \ - \ - const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \ - const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \ - const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \ - const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \ - \ - out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \ - out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \ - out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \ - out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \ - out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \ - out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \ - out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \ - out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \ -} - -static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *x_filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)x_filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_x); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch)); - const __m128i C = 
_mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3)); - const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4)); - const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5)); - const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6)); - const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7)); - // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 - const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); - // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 - const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); - // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57 - const __m128i tr0_2 = _mm_unpacklo_epi16(E, F); - // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77 - const __m128i tr0_3 = _mm_unpacklo_epi16(G, H); - // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 - const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1); - // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73 - const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3); - // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77 - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 - const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2); - const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2); - const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3); - const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); - // save only 8 bytes convolve result - _mm_storel_epi64((__m128i*)dst, temp); -} - -static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride) { - __m128i A, B, C, D, E, F, G, H; - - A = _mm_loadl_epi64((const __m128i *)src); - B = _mm_loadl_epi64((const __m128i *)(src + src_stride)); - C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); - D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); - E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4)); - F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5)); - G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6)); - H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7)); - - TRANSPOSE_8X8(A, B, C, D, E, F, G, H, - A, B, C, D, E, F, G, H); - - _mm_storel_epi64((__m128i*)dst, A); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 1), B); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 2), C); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 3), D); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 4), E); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 5), F); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 6), G); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 7), H); -} - -static void 
scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, - int x0_q4, int x_step_q4, int w, int h) { - DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); - int x, y, z; - src -= SUBPEL_TAPS / 2 - 1; - - // This function processes 8x8 areas. The intermediate height is not always - // a multiple of 8, so force it to be a multiple of 8 here. - y = h + (8 - (h & 0x7)); - - do { - int x_q4 = x0_q4; - for (x = 0; x < w; x += 8) { - // process 8 src_x steps - for (z = 0; z < 8; ++z) { - const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - if (x_q4 & SUBPEL_MASK) { - filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter); - } else { - int i; - for (i = 0; i < 8; ++i) { - temp[z * 8 + i] = src_x[i * src_stride + 3]; - } - } - x_q4 += x_step_q4; - } - - // transpose the 8x8 filters values back to dst - transpose8x8_to_dst(temp, 8, dst + x, dst_stride); - } - - src += src_stride * 8; - dst += dst_stride * 8; - } while (y -= 8); -} - -static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - // TRANSPOSE... 
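  // (Annotation: the transpose below rearranges 4 rows x 8 pixels so that
  //  each 16-bit lane holds the adjacent source pair consumed by one
  //  pmaddubsw tap pair; the horizontal filter then proceeds with the
  //  same pairwise multiply-add pattern as the vertical filters:)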
- // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // - // TO - // - // 00 10 20 30 - // 01 11 21 31 - // 02 12 22 32 - // 03 13 23 33 - // 04 14 24 34 - // 05 15 25 35 - // 06 16 26 36 - // 07 17 27 37 - // - // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 - const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); - // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 - const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); - // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 - const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 - const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1); - // 02 03 12 13 22 23 32 33 - const __m128i s3s2 = _mm_srli_si128(s1s0, 8); - // 06 07 16 17 26 27 36 37 - const __m128i s7s6 = _mm_srli_si128(s5s4, 8); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); - // save only 4 bytes - *(int *)dst = _mm_cvtsi128_si32(temp); -} - -static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride) { - __m128i A = _mm_cvtsi32_si128(*(const int *)src); - __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride)); - __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2)); - __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3)); - // 00 10 01 11 02 12 03 13 - const __m128i tr0_0 = _mm_unpacklo_epi8(A, B); - // 20 30 21 31 22 32 23 33 - const __m128i tr0_1 = _mm_unpacklo_epi8(C, D); - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - A = _mm_unpacklo_epi16(tr0_0, tr0_1); - B = _mm_srli_si128(A, 4); - C = _mm_srli_si128(A, 8); - D = _mm_srli_si128(A, 12); - - *(int *)(dst) = _mm_cvtsi128_si32(A); - *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B); - *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C); - *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D); -} - -static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, - int x0_q4, int x_step_q4, int w, int h) { - DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); - int x, y, z; - src -= SUBPEL_TAPS / 2 - 1; - - for (y = 0; y < h; y += 4) { - int x_q4 = x0_q4; - for (x = 0; x < w; x += 4) { - // process 4 src_x steps - for (z = 0; z < 4; ++z) { - const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - if (x_q4 & SUBPEL_MASK) { - filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter); - } else { - int i; - for (i = 0; i < 4; ++i) { - temp[z * 4 + i] = src_x[i * src_stride + 3]; - } - } - x_q4 += x_step_q4; - } - - // transpose the 4x4 filters values back to dst - transpose4x4_to_dst(temp, 4, dst + x, dst_stride); - } - - src += src_stride * 4; - dst += dst_stride * 4; - } -} - -static void filter_vert_w4_ssse3(const uint8_t *src_ptr, 
ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr); - const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch)); - const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3)); - const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4)); - const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5)); - const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6)); - const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7)); - const __m128i s1s0 = _mm_unpacklo_epi8(A, B); - const __m128i s3s2 = _mm_unpacklo_epi8(C, D); - const __m128i s5s4 = _mm_unpacklo_epi8(E, F); - const __m128i s7s6 = _mm_unpacklo_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); - // save only 4 bytes - *(int *)dst = _mm_cvtsi128_si32(temp); -} - -static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h) { - int y; - int y_q4 = y0_q4; - - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - - if (y_q4 & SUBPEL_MASK) { - filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); - } else { - memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); - } - - y_q4 += y_step_q4; - } -} - -static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - 
const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); - const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); - const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); - const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); - const __m128i s1s0 = _mm_unpacklo_epi8(A, B); - const __m128i s3s2 = _mm_unpacklo_epi8(C, D); - const __m128i s5s4 = _mm_unpacklo_epi8(E, F); - const __m128i s7s6 = _mm_unpacklo_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); - // save only 8 bytes convolve result - _mm_storel_epi64((__m128i*)dst, temp); -} - -static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h) { - int y; - int y_q4 = y0_q4; - - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - if (y_q4 & SUBPEL_MASK) { - filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); - } else { - memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); - } - y_q4 += y_step_q4; - } -} - -static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter, int w) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - int i; - - for (i = 0; i < w; i += 16) { - const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr); - const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); - const __m128i E = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); - const __m128i F = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); - const __m128i G = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); - const __m128i H = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); - // merge the result together - const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B); - const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H); - const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B); - const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H); - // multiply 2 adjacent 
elements with the filter and add the result - const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0); - const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6); - const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0); - const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6); - // add and saturate the results together - const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo); - const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi); - // merge the result together - const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D); - const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2); - const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2); - // merge the result together - const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F); - const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4); - const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4); - // add and saturate the results together - __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo)); - __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi)); - - // add and saturate the results together - temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo)); - temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi)); - // round and shift by 7 bit each 16 bit - temp_lo = _mm_mulhrs_epi16(temp_lo, k_256); - temp_hi = _mm_mulhrs_epi16(temp_hi, k_256); - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - temp_hi = _mm_packus_epi16(temp_lo, temp_hi); - src_ptr += 16; - // save 16 bytes convolve result - _mm_store_si128((__m128i*)&dst[i], temp_hi); - } -} - -static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h) { - int y; - int y_q4 = y0_q4; - - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - if (y_q4 & SUBPEL_MASK) { - filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter, - w); - } else { - memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); - } - y_q4 += y_step_q4; - } -} - -static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, - int x0_q4, int x_step_q4, - const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, - int w, int h) { - // Note: Fixed size intermediate buffer, temp, places limits on parameters. - // 2d filtering proceeds in 2 steps: - // (1) Interpolate horizontally into an intermediate buffer, temp. - // (2) Interpolate temp vertically to derive the sub-pixel result. - // Deriving the maximum number of rows in the temp buffer (135): - // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). - // --Largest block size is 64x64 pixels. - // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the - // original frame (in 1/16th pixel units). - // --Must round-up because block may be located at sub-pixel position. - // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. - // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. 
- // --Require an additional 8 rows for the horiz_w8 transpose tail. - DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); - const int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - - assert(w <= 64); - assert(h <= 64); - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); - - if (w >= 8) { - scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, x_filters, x0_q4, x_step_q4, - w, intermediate_height); - } else { - scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, x_filters, x0_q4, x_step_q4, - w, intermediate_height); - } - - if (w >= 16) { - scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); - } else if (w == 8) { - scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); - } else { - scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); - } -} - -static const InterpKernel *get_filter_base(const int16_t *filter) { - // NOTE: This assumes that the filter table is 256-byte aligned. - // TODO(agrange) Modify to make independent of table alignment. - return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); -} - -static int get_filter_offset(const int16_t *f, const InterpKernel *base) { - return (int)((const InterpKernel *)(intptr_t)f - base); -} - -void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - scaledconvolve2d(src, src_stride, dst, dst_stride, - filters_x, x0_q4, x_step_q4, - filters_y, y0_q4, y_step_q4, w, h); -} - -// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, ssse3); -FUN_CONV_2D(avg_ , ssse3); diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm deleted file mode 100644 index 08f3d6a6cf..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm +++ /dev/null @@ -1,987 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;Note: tap3 and tap4 have to be applied and added after other taps to avoid -;overflow. 
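Both the intrinsics file above and the assembly that follows observe the discipline this note describes: the accumulator is a 16-bit lane updated with saturating adds, and the products of the large center taps k3/k4 are the terms most likely to push it past the rails, so they must land after the small outer taps. The sketch below is a scalar model of that ordering, not code from the deleted files (all names are hypothetical; it assumes arithmetic right shift for the psraw-style scale and omits pmaddubsw's own pair saturation, which these kernels cannot trigger):

#include <stdint.h>

/* Scalar model of one 8-tap, Q7 subpel filter evaluation using the
 * saturating accumulation order from the files above. p[] holds eight
 * unsigned source pixels, k[] the eight signed taps. */
static int16_t sat_add16(int32_t a, int32_t b) {
  const int32_t s = a + b;
  return (int16_t)(s > INT16_MAX ? INT16_MAX : s < INT16_MIN ? INT16_MIN : s);
}

static uint8_t filter8_q7_model(const uint8_t *p, const int8_t *k) {
  /* pairwise products, as pmaddubsw forms them */
  const int32_t k0k1 = p[0] * k[0] + p[1] * k[1];
  const int32_t k2k3 = p[2] * k[2] + p[3] * k[3];
  const int32_t k4k5 = p[4] * k[4] + p[5] * k[5];
  const int32_t k6k7 = p[6] * k[6] + p[7] * k[7];
  const int32_t mid_min = k2k3 < k4k5 ? k2k3 : k4k5;
  const int32_t mid_max = k2k3 < k4k5 ? k4k5 : k2k3;
  /* outer taps first, then min before max of the two middle pairs,
   * mirroring the _mm_min_epi16/_mm_max_epi16 steps in the intrinsics */
  int16_t sum = sat_add16(k0k1, k6k7);
  sum = sat_add16(sum, mid_min);
  sum = sat_add16(sum, mid_max);
  sum = sat_add16(sum, 64);   /* Q7 rounding term (the krd / addFilterReg64 constant) */
  sum = (int16_t)(sum >> 7);  /* psraw-style arithmetic shift */
  return (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum); /* packuswb clamp */
}

Adding the smaller middle pair before the larger one pulls the running total away from whichever rail the big positive center product would otherwise hit early, so any clamp that still occurs coincides with the clamp the exact sum needs anyway; the SSSE3 assembly's fixed grouping (x = k0k1 + k4k5, y = k2k3 + k6k7, z = signed SAT(x + y)) gives the same guarantee without the min/max.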
- -%macro GET_FILTERS_4 0 - mov rdx, arg(5) ;filter ptr - mov rcx, 0x0400040 - - movdqa xmm7, [rdx] ;load filters - pshuflw xmm0, xmm7, 0b ;k0 - pshuflw xmm1, xmm7, 01010101b ;k1 - pshuflw xmm2, xmm7, 10101010b ;k2 - pshuflw xmm3, xmm7, 11111111b ;k3 - psrldq xmm7, 8 - pshuflw xmm4, xmm7, 0b ;k4 - pshuflw xmm5, xmm7, 01010101b ;k5 - pshuflw xmm6, xmm7, 10101010b ;k6 - pshuflw xmm7, xmm7, 11111111b ;k7 - - punpcklqdq xmm0, xmm1 - punpcklqdq xmm2, xmm3 - punpcklqdq xmm5, xmm4 - punpcklqdq xmm6, xmm7 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm2 - movdqa k5k4, xmm5 - movdqa k6k7, xmm6 - - movq xmm6, rcx - pshufd xmm6, xmm6, 0 - movdqa krd, xmm6 - - pxor xmm7, xmm7 - movdqa zero, xmm7 -%endm - -%macro APPLY_FILTER_4 1 - punpckldq xmm0, xmm1 ;two row in one register - punpckldq xmm6, xmm7 - punpckldq xmm2, xmm3 - punpckldq xmm5, xmm4 - - punpcklbw xmm0, zero ;unpack to word - punpcklbw xmm6, zero - punpcklbw xmm2, zero - punpcklbw xmm5, zero - - pmullw xmm0, k0k1 ;multiply the filter factors - pmullw xmm6, k6k7 - pmullw xmm2, k2k3 - pmullw xmm5, k5k4 - - paddsw xmm0, xmm6 ;sum - movdqa xmm1, xmm0 - psrldq xmm1, 8 - paddsw xmm0, xmm1 - paddsw xmm0, xmm2 - psrldq xmm2, 8 - paddsw xmm0, xmm5 - psrldq xmm5, 8 - paddsw xmm0, xmm2 - paddsw xmm0, xmm5 - - paddsw xmm0, krd ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack to byte - -%if %1 - movd xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movd [rdi], xmm0 -%endm - -%macro GET_FILTERS 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm7, [rdx] ;load filters - pshuflw xmm0, xmm7, 0b ;k0 - pshuflw xmm1, xmm7, 01010101b ;k1 - pshuflw xmm2, xmm7, 10101010b ;k2 - pshuflw xmm3, xmm7, 11111111b ;k3 - pshufhw xmm4, xmm7, 0b ;k4 - pshufhw xmm5, xmm7, 01010101b ;k5 - pshufhw xmm6, xmm7, 10101010b ;k6 - pshufhw xmm7, xmm7, 11111111b ;k7 - - punpcklwd xmm0, xmm0 - punpcklwd xmm1, xmm1 - punpcklwd xmm2, xmm2 - punpcklwd xmm3, xmm3 - punpckhwd xmm4, xmm4 - punpckhwd xmm5, xmm5 - punpckhwd xmm6, xmm6 - punpckhwd xmm7, xmm7 - - movdqa k0, xmm0 ;store filter factors on stack - movdqa k1, xmm1 - movdqa k2, xmm2 - movdqa k3, xmm3 - movdqa k4, xmm4 - movdqa k5, xmm5 - movdqa k6, xmm6 - movdqa k7, xmm7 - - movq xmm6, rcx - pshufd xmm6, xmm6, 0 - movdqa krd, xmm6 ;rounding - - pxor xmm7, xmm7 - movdqa zero, xmm7 -%endm - -%macro LOAD_VERT_8 1 - movq xmm0, [rsi + %1] ;0 - movq xmm1, [rsi + rax + %1] ;1 - movq xmm6, [rsi + rdx * 2 + %1] ;6 - lea rsi, [rsi + rax] - movq xmm7, [rsi + rdx * 2 + %1] ;7 - movq xmm2, [rsi + rax + %1] ;2 - movq xmm3, [rsi + rax * 2 + %1] ;3 - movq xmm4, [rsi + rdx + %1] ;4 - movq xmm5, [rsi + rax * 4 + %1] ;5 -%endm - -%macro APPLY_FILTER_8 2 - punpcklbw xmm0, zero - punpcklbw xmm1, zero - punpcklbw xmm6, zero - punpcklbw xmm7, zero - punpcklbw xmm2, zero - punpcklbw xmm5, zero - punpcklbw xmm3, zero - punpcklbw xmm4, zero - - pmullw xmm0, k0 - pmullw xmm1, k1 - pmullw xmm6, k6 - pmullw xmm7, k7 - pmullw xmm2, k2 - pmullw xmm5, k5 - pmullw xmm3, k3 - pmullw xmm4, k4 - - paddsw xmm0, xmm1 - paddsw xmm0, xmm6 - paddsw xmm0, xmm7 - paddsw xmm0, xmm2 - paddsw xmm0, xmm5 - paddsw xmm0, xmm3 - paddsw xmm0, xmm4 - - paddsw xmm0, krd ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack back to byte -%if %1 - movq xmm1, [rdi + %2] - pavgb xmm0, xmm1 -%endif - movq [rdi + %2], xmm0 -%endm - -;void vpx_filter_block1d4_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; 
short *filter -;) -global sym(vpx_filter_block1d4_v8_sse2) PRIVATE -sym(vpx_filter_block1d4_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movd xmm0, [rsi] ;load src: row 0 - movd xmm1, [rsi + rax] ;1 - movd xmm6, [rsi + rdx * 2] ;6 - lea rsi, [rsi + rax] - movd xmm7, [rsi + rdx * 2] ;7 - movd xmm2, [rsi + rax] ;2 - movd xmm3, [rsi + rax * 2] ;3 - movd xmm4, [rsi + rdx] ;4 - movd xmm5, [rsi + rax * 4] ;5 - - APPLY_FILTER_4 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vpx_filter_block1d8_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vpx_filter_block1d8_v8_sse2) PRIVATE -sym(vpx_filter_block1d8_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 0, 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vpx_filter_block1d16_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vpx_filter_block1d16_v8_sse2) PRIVATE -sym(vpx_filter_block1d16_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 0, 0 - sub rsi, rax - - LOAD_VERT_8 8 - APPLY_FILTER_8 0, 8 - add rdi, rbx - - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_v8_avg_sse2) PRIVATE -sym(vpx_filter_block1d4_v8_avg_sse2): - push 
rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movd xmm0, [rsi] ;load src: row 0 - movd xmm1, [rsi + rax] ;1 - movd xmm6, [rsi + rdx * 2] ;6 - lea rsi, [rsi + rax] - movd xmm7, [rsi + rdx * 2] ;7 - movd xmm2, [rsi + rax] ;2 - movd xmm3, [rsi + rax * 2] ;3 - movd xmm4, [rsi + rdx] ;4 - movd xmm5, [rsi + rax * 4] ;5 - - APPLY_FILTER_4 1 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_v8_avg_sse2) PRIVATE -sym(vpx_filter_block1d8_v8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 1, 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_v8_avg_sse2) PRIVATE -sym(vpx_filter_block1d16_v8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 1, 0 - sub rsi, rax - - LOAD_VERT_8 8 - APPLY_FILTER_8 1, 8 - add rdi, rbx - - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vpx_filter_block1d4_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vpx_filter_block1d4_h8_sse2) PRIVATE -sym(vpx_filter_block1d4_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define 
zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm3, xmm0 - movdqa xmm5, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm3, 3 - psrldq xmm5, 5 - psrldq xmm4, 4 - - APPLY_FILTER_4 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vpx_filter_block1d8_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vpx_filter_block1d8_h8_sse2) PRIVATE -sym(vpx_filter_block1d8_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 0, 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vpx_filter_block1d16_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vpx_filter_block1d16_h8_sse2) PRIVATE -sym(vpx_filter_block1d16_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 0, 0 - - movdqu xmm0, [rsi + 5] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - 
psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 0, 8 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_h8_avg_sse2) PRIVATE -sym(vpx_filter_block1d4_h8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm3, xmm0 - movdqa xmm5, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm3, 3 - psrldq xmm5, 5 - psrldq xmm4, 4 - - APPLY_FILTER_4 1 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_h8_avg_sse2) PRIVATE -sym(vpx_filter_block1d8_h8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 1, 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_h8_avg_sse2) PRIVATE -sym(vpx_filter_block1d16_h8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - 
psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 1, 0 - - movdqu xmm0, [rsi + 5] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 1, 8 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm deleted file mode 100644 index d2cb8ea292..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm +++ /dev/null @@ -1,629 +0,0 @@ -; -; Copyright (c) 2015 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pw_64: times 8 dw 64 - -; %define USE_PMULHRSW -; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss -; when using this instruction. -; -; The add order below (based on ffvp9) must be followed to prevent outranges. -; x = k0k1 + k4k5 -; y = k2k3 + k6k7 -; z = signed SAT(x + y) - -SECTION .text -%if ARCH_X86_64 - %define LOCAL_VARS_SIZE 16*4 -%else - %define LOCAL_VARS_SIZE 16*6 -%endif - -%macro SETUP_LOCAL_VARS 0 - ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 + - ; pmaddubsw has a higher latency on some platforms, this might be eased by - ; interleaving the instructions. - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - packsswb m4, m4 - ; TODO(slavarnway): multiple pshufb instructions had a higher latency on - ; some platforms. 
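    ; (Annotation, not part of the deleted file: each pshuflw below
    ; broadcasts one 16-bit word of the packed taps, i.e. an int8 pair
    ; k2i/k2i+1, across the low quadword; the punpcklqdq that follows
    ; mirrors it into the high quadword, producing a full register of
    ; identical tap pairs ready for pmaddubsw.)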
- pshuflw m0, m4, 0b ;k0_k1 - pshuflw m1, m4, 01010101b ;k2_k3 - pshuflw m2, m4, 10101010b ;k4_k5 - pshuflw m3, m4, 11111111b ;k6_k7 - punpcklqdq m0, m0 - punpcklqdq m1, m1 - punpcklqdq m2, m2 - punpcklqdq m3, m3 - mova k0k1, m0 - mova k2k3, m1 - mova k4k5, m2 - mova k6k7, m3 -%if ARCH_X86_64 - %define krd m12 - %define tmp m13 - mova krd, [GLOBAL(pw_64)] -%else - %define tmp [rsp + 16*4] - %define krd [rsp + 16*5] -%if CONFIG_PIC=0 - mova m6, [GLOBAL(pw_64)] -%else - ; build constants without accessing global memory - pcmpeqb m6, m6 ;all ones - psrlw m6, 15 - psllw m6, 6 ;aka pw_64 -%endif - mova krd, m6 -%endif -%endm - -%macro HORIZx4_ROW 2 - mova %2, %1 - punpcklbw %1, %1 - punpckhbw %2, %2 - - mova m3, %2 - palignr %2, %1, 1 - palignr m3, %1, 5 - - pmaddubsw %2, k0k1k4k5 - pmaddubsw m3, k2k3k6k7 - mova m4, %2 ;k0k1 - mova m5, m3 ;k2k3 - psrldq %2, 8 ;k4k5 - psrldq m3, 8 ;k6k7 - paddsw %2, m4 - paddsw m5, m3 - paddsw %2, m5 - paddsw %2, krd - psraw %2, 7 - packuswb %2, %2 -%endm - -;------------------------------------------------------------------------------- -%macro SUBPIX_HFILTER4 1 -cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - packsswb m4, m4 -%if ARCH_X86_64 - %define k0k1k4k5 m8 - %define k2k3k6k7 m9 - %define krd m10 - %define orig_height r7d - mova krd, [GLOBAL(pw_64)] - pshuflw k0k1k4k5, m4, 0b ;k0_k1 - pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5 - pshuflw k2k3k6k7, m4, 01010101b ;k2_k3 - pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7 -%else - %define k0k1k4k5 [rsp + 16*0] - %define k2k3k6k7 [rsp + 16*1] - %define krd [rsp + 16*2] - %define orig_height [rsp + 16*3] - pshuflw m6, m4, 0b ;k0_k1 - pshufhw m6, m6, 10101010b ;k0_k1_k4_k5 - pshuflw m7, m4, 01010101b ;k2_k3 - pshufhw m7, m7, 11111111b ;k2_k3_k6_k7 -%if CONFIG_PIC=0 - mova m1, [GLOBAL(pw_64)] -%else - ; build constants without accessing global memory - pcmpeqb m1, m1 ;all ones - psrlw m1, 15 - psllw m1, 6 ;aka pw_64 -%endif - mova k0k1k4k5, m6 - mova k2k3k6k7, m7 - mova krd, m1 -%endif - mov orig_height, heightd - shr heightd, 1 -.loop: - ;Do two rows at once - movh m0, [srcq - 3] - movh m1, [srcq + 5] - punpcklqdq m0, m1 - mova m1, m0 - movh m2, [srcq + sstrideq - 3] - movh m3, [srcq + sstrideq + 5] - punpcklqdq m2, m3 - mova m3, m2 - punpcklbw m0, m0 - punpckhbw m1, m1 - punpcklbw m2, m2 - punpckhbw m3, m3 - mova m4, m1 - palignr m4, m0, 1 - pmaddubsw m4, k0k1k4k5 - palignr m1, m0, 5 - pmaddubsw m1, k2k3k6k7 - mova m7, m3 - palignr m7, m2, 1 - pmaddubsw m7, k0k1k4k5 - palignr m3, m2, 5 - pmaddubsw m3, k2k3k6k7 - mova m0, m4 ;k0k1 - mova m5, m1 ;k2k3 - mova m2, m7 ;k0k1 upper - psrldq m4, 8 ;k4k5 - psrldq m1, 8 ;k6k7 - paddsw m4, m0 - paddsw m5, m1 - mova m1, m3 ;k2k3 upper - psrldq m7, 8 ;k4k5 upper - psrldq m3, 8 ;k6k7 upper - paddsw m7, m2 - paddsw m4, m5 - paddsw m1, m3 - paddsw m7, m1 - paddsw m4, krd - psraw m4, 7 - packuswb m4, m4 - paddsw m7, krd - psraw m7, 7 - packuswb m7, m7 - -%ifidn %1, h8_avg - movd m0, [dstq] - pavgb m4, m0 - movd m2, [dstq + dstrideq] - pavgb m7, m2 -%endif - movd [dstq], m4 - movd [dstq + dstrideq], m7 - - lea srcq, [srcq + sstrideq ] - prefetcht0 [srcq + 4 * sstrideq - 3] - lea srcq, [srcq + sstrideq ] - lea dstq, [dstq + 2 * dstrideq ] - prefetcht0 [srcq + 2 * sstrideq - 3] - - dec heightd - jnz .loop - - ; Do last row if output_height is odd - mov heightd, orig_height - and heightd, 1 - je .done - - movh m0, [srcq - 3] ; load src - movh m1, [srcq + 5] - punpcklqdq m0, m1 - - 
HORIZx4_ROW m0, m1 -%ifidn %1, h8_avg - movd m0, [dstq] - pavgb m1, m0 -%endif - movd [dstq], m1 -.done - RET -%endm - -%macro HORIZx8_ROW 5 - mova %2, %1 - punpcklbw %1, %1 - punpckhbw %2, %2 - - mova %3, %2 - mova %4, %2 - mova %5, %2 - - palignr %2, %1, 1 - palignr %3, %1, 5 - palignr %4, %1, 9 - palignr %5, %1, 13 - - pmaddubsw %2, k0k1 - pmaddubsw %3, k2k3 - pmaddubsw %4, k4k5 - pmaddubsw %5, k6k7 - paddsw %2, %4 - paddsw %5, %3 - paddsw %2, %5 - paddsw %2, krd - psraw %2, 7 - packuswb %2, %2 - SWAP %1, %2 -%endm - -;------------------------------------------------------------------------------- -%macro SUBPIX_HFILTER8 1 -cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*1), 14, LOCAL_VARS_SIZE, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - SETUP_LOCAL_VARS -%if ARCH_X86_64 - %define orig_height r7d -%else - %define orig_height heightmp -%endif - mov orig_height, heightd - shr heightd, 1 - -.loop: - movh m0, [srcq - 3] - movh m3, [srcq + 5] - movh m4, [srcq + sstrideq - 3] - movh m7, [srcq + sstrideq + 5] - punpcklqdq m0, m3 - mova m1, m0 - punpcklbw m0, m0 - punpckhbw m1, m1 - mova m5, m1 - palignr m5, m0, 13 - pmaddubsw m5, k6k7 - mova m2, m1 - mova m3, m1 - palignr m1, m0, 1 - pmaddubsw m1, k0k1 - punpcklqdq m4, m7 - mova m6, m4 - punpcklbw m4, m4 - palignr m2, m0, 5 - punpckhbw m6, m6 - palignr m3, m0, 9 - mova m7, m6 - pmaddubsw m2, k2k3 - pmaddubsw m3, k4k5 - - palignr m7, m4, 13 - mova m0, m6 - palignr m0, m4, 5 - pmaddubsw m7, k6k7 - paddsw m1, m3 - paddsw m2, m5 - paddsw m1, m2 - mova m5, m6 - palignr m6, m4, 1 - pmaddubsw m0, k2k3 - pmaddubsw m6, k0k1 - palignr m5, m4, 9 - paddsw m1, krd - pmaddubsw m5, k4k5 - psraw m1, 7 - paddsw m0, m7 -%ifidn %1, h8_avg - movh m7, [dstq] - movh m2, [dstq + dstrideq] -%endif - packuswb m1, m1 - paddsw m6, m5 - paddsw m6, m0 - paddsw m6, krd - psraw m6, 7 - packuswb m6, m6 -%ifidn %1, h8_avg - pavgb m1, m7 - pavgb m6, m2 -%endif - movh [dstq], m1 - movh [dstq + dstrideq], m6 - - lea srcq, [srcq + sstrideq ] - prefetcht0 [srcq + 4 * sstrideq - 3] - lea srcq, [srcq + sstrideq ] - lea dstq, [dstq + 2 * dstrideq ] - prefetcht0 [srcq + 2 * sstrideq - 3] - dec heightd - jnz .loop - - ;Do last row if output_height is odd - mov heightd, orig_height - and heightd, 1 - je .done - - movh m0, [srcq - 3] - movh m3, [srcq + 5] - punpcklqdq m0, m3 - - HORIZx8_ROW m0, m1, m2, m3, m4 - -%ifidn %1, h8_avg - movh m1, [dstq] - pavgb m0, m1 -%endif - movh [dstq], m0 -.done: - RET -%endm - -;------------------------------------------------------------------------------- -%macro SUBPIX_HFILTER16 1 -cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 14, LOCAL_VARS_SIZE, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - SETUP_LOCAL_VARS -.loop: - prefetcht0 [srcq + 2 * sstrideq -3] - - movh m0, [srcq - 3] - movh m4, [srcq + 5] - movh m6, [srcq + 13] - punpcklqdq m0, m4 - mova m7, m0 - punpckhbw m0, m0 - mova m1, m0 - punpcklqdq m4, m6 - mova m3, m0 - punpcklbw m7, m7 - - palignr m3, m7, 13 - mova m2, m0 - pmaddubsw m3, k6k7 - palignr m0, m7, 1 - pmaddubsw m0, k0k1 - palignr m1, m7, 5 - pmaddubsw m1, k2k3 - palignr m2, m7, 9 - pmaddubsw m2, k4k5 - paddsw m1, m3 - mova m3, m4 - punpckhbw m4, m4 - mova m5, m4 - punpcklbw m3, m3 - mova m7, m4 - palignr m5, m3, 5 - mova m6, m4 - palignr m4, m3, 1 - pmaddubsw m4, k0k1 - pmaddubsw m5, k2k3 - palignr m6, m3, 9 - pmaddubsw m6, k4k5 - palignr m7, m3, 13 - pmaddubsw m7, k6k7 - paddsw m0, m2 - paddsw m0, m1 -%ifidn %1, h8_avg - mova m1, [dstq] -%endif - paddsw m4, m6 - paddsw m5, m7 - 
paddsw m4, m5 - paddsw m0, krd - paddsw m4, krd - psraw m0, 7 - psraw m4, 7 - packuswb m0, m4 -%ifidn %1, h8_avg - pavgb m0, m1 -%endif - lea srcq, [srcq + sstrideq] - mova [dstq], m0 - lea dstq, [dstq + dstrideq] - dec heightd - jnz .loop - RET -%endm - -INIT_XMM ssse3 -SUBPIX_HFILTER16 h8 -SUBPIX_HFILTER16 h8_avg -SUBPIX_HFILTER8 h8 -SUBPIX_HFILTER8 h8_avg -SUBPIX_HFILTER4 h8 -SUBPIX_HFILTER4 h8_avg - -;------------------------------------------------------------------------------- -%macro SUBPIX_VFILTER 2 -cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - SETUP_LOCAL_VARS -%if ARCH_X86_64 - %define src1q r7 - %define sstride6q r8 - %define dst_stride dstrideq -%else - %define src1q filterq - %define sstride6q dstrideq - %define dst_stride dstridemp -%endif - mov src1q, srcq - add src1q, sstrideq - lea sstride6q, [sstrideq + sstrideq * 4] - add sstride6q, sstrideq ;pitch * 6 - -%ifidn %2, 8 - %define movx movh -%else - %define movx movd -%endif -.loop: - movx m0, [srcq ] ;A - movx m1, [srcq + sstrideq ] ;B - punpcklbw m0, m1 ;A B - movx m2, [srcq + sstrideq * 2 ] ;C - pmaddubsw m0, k0k1 - mova m6, m2 - movx m3, [src1q + sstrideq * 2] ;D - punpcklbw m2, m3 ;C D - pmaddubsw m2, k2k3 - movx m4, [srcq + sstrideq * 4 ] ;E - mova m7, m4 - movx m5, [src1q + sstrideq * 4] ;F - punpcklbw m4, m5 ;E F - pmaddubsw m4, k4k5 - punpcklbw m1, m6 ;A B next iter - movx m6, [srcq + sstride6q ] ;G - punpcklbw m5, m6 ;E F next iter - punpcklbw m3, m7 ;C D next iter - pmaddubsw m5, k4k5 - movx m7, [src1q + sstride6q ] ;H - punpcklbw m6, m7 ;G H - pmaddubsw m6, k6k7 - pmaddubsw m3, k2k3 - pmaddubsw m1, k0k1 - paddsw m0, m4 - paddsw m2, m6 - movx m6, [srcq + sstrideq * 8 ] ;H next iter - punpcklbw m7, m6 - pmaddubsw m7, k6k7 - paddsw m0, m2 - paddsw m0, krd - psraw m0, 7 - paddsw m1, m5 - packuswb m0, m0 - - paddsw m3, m7 - paddsw m1, m3 - paddsw m1, krd - psraw m1, 7 - lea srcq, [srcq + sstrideq * 2 ] - lea src1q, [src1q + sstrideq * 2] - packuswb m1, m1 - -%ifidn %1, v8_avg - movx m2, [dstq] - pavgb m0, m2 -%endif - movx [dstq], m0 - add dstq, dst_stride -%ifidn %1, v8_avg - movx m3, [dstq] - pavgb m1, m3 -%endif - movx [dstq], m1 - add dstq, dst_stride - sub heightd, 2 - cmp heightd, 1 - jg .loop - - cmp heightd, 0 - je .done - - movx m0, [srcq ] ;A - movx m1, [srcq + sstrideq ] ;B - movx m6, [srcq + sstride6q ] ;G - punpcklbw m0, m1 ;A B - movx m7, [src1q + sstride6q ] ;H - pmaddubsw m0, k0k1 - movx m2, [srcq + sstrideq * 2 ] ;C - punpcklbw m6, m7 ;G H - movx m3, [src1q + sstrideq * 2] ;D - pmaddubsw m6, k6k7 - movx m4, [srcq + sstrideq * 4 ] ;E - punpcklbw m2, m3 ;C D - movx m5, [src1q + sstrideq * 4] ;F - punpcklbw m4, m5 ;E F - pmaddubsw m2, k2k3 - pmaddubsw m4, k4k5 - paddsw m2, m6 - paddsw m0, m4 - paddsw m0, m2 - paddsw m0, krd - psraw m0, 7 - packuswb m0, m0 -%ifidn %1, v8_avg - movx m1, [dstq] - pavgb m0, m1 -%endif - movx [dstq], m0 -.done: - RET -%endm - -;------------------------------------------------------------------------------- -%macro SUBPIX_VFILTER16 1 -cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - SETUP_LOCAL_VARS -%if ARCH_X86_64 - %define src1q r7 - %define sstride6q r8 - %define dst_stride dstrideq -%else - %define src1q filterq - %define sstride6q dstrideq - %define dst_stride dstridemp -%endif - mov src1q, srcq - add src1q, sstrideq - lea sstride6q, [sstrideq + sstrideq * 4] - add sstride6q, 
sstrideq ;pitch * 6 - -.loop: - movh m0, [srcq ] ;A - movh m1, [srcq + sstrideq ] ;B - movh m2, [srcq + sstrideq * 2 ] ;C - movh m3, [src1q + sstrideq * 2] ;D - movh m4, [srcq + sstrideq * 4 ] ;E - movh m5, [src1q + sstrideq * 4] ;F - - punpcklbw m0, m1 ;A B - movh m6, [srcq + sstride6q] ;G - punpcklbw m2, m3 ;C D - movh m7, [src1q + sstride6q] ;H - punpcklbw m4, m5 ;E F - pmaddubsw m0, k0k1 - movh m3, [srcq + 8] ;A - pmaddubsw m2, k2k3 - punpcklbw m6, m7 ;G H - movh m5, [srcq + sstrideq + 8] ;B - pmaddubsw m4, k4k5 - punpcklbw m3, m5 ;A B - movh m7, [srcq + sstrideq * 2 + 8] ;C - pmaddubsw m6, k6k7 - movh m5, [src1q + sstrideq * 2 + 8] ;D - punpcklbw m7, m5 ;C D - paddsw m2, m6 - pmaddubsw m3, k0k1 - movh m1, [srcq + sstrideq * 4 + 8] ;E - paddsw m0, m4 - pmaddubsw m7, k2k3 - movh m6, [src1q + sstrideq * 4 + 8] ;F - punpcklbw m1, m6 ;E F - paddsw m0, m2 - paddsw m0, krd - movh m2, [srcq + sstride6q + 8] ;G - pmaddubsw m1, k4k5 - movh m5, [src1q + sstride6q + 8] ;H - psraw m0, 7 - punpcklbw m2, m5 ;G H - pmaddubsw m2, k6k7 -%ifidn %1, v8_avg - mova m4, [dstq] -%endif - movh [dstq], m0 - paddsw m7, m2 - paddsw m3, m1 - paddsw m3, m7 - paddsw m3, krd - psraw m3, 7 - packuswb m0, m3 - - add srcq, sstrideq - add src1q, sstrideq -%ifidn %1, v8_avg - pavgb m0, m4 -%endif - mova [dstq], m0 - add dstq, dst_stride - dec heightd - jnz .loop - RET -%endm - -INIT_XMM ssse3 -SUBPIX_VFILTER16 v8 -SUBPIX_VFILTER16 v8_avg -SUBPIX_VFILTER v8, 8 -SUBPIX_VFILTER v8_avg, 8 -SUBPIX_VFILTER v8, 4 -SUBPIX_VFILTER v8_avg, 4 diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm deleted file mode 100644 index a378dd0402..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm +++ /dev/null @@ -1,448 +0,0 @@ -; -; Copyright (c) 2014 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro GET_PARAM_4 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm3, [rdx]                 ;load filters
-    pshuflw     xmm4, xmm3, 11111111b       ;k3
-    psrldq      xmm3, 8
-    pshuflw     xmm3, xmm3, 0b              ;k4
-    punpcklqdq  xmm4, xmm3                  ;k3k4
-
-    movq        xmm3, rcx                   ;rounding
-    pshufd      xmm3, xmm3, 0
-
-    pxor        xmm2, xmm2
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-%endm
-
-%macro APPLY_FILTER_4 1
-
-    punpckldq   xmm0, xmm1                  ;two rows in one register
-    punpcklbw   xmm0, xmm2                  ;unpack to word
-    pmullw      xmm0, xmm4                  ;multiply the filter factors
-
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 8
-    paddsw      xmm0, xmm1
-
-    paddsw      xmm0, xmm3                  ;rounding
-    psraw       xmm0, 7                     ;shift
-    packuswb    xmm0, xmm0                  ;pack to byte
-
-%if %1
-    movd        xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-
-    movd        [rdi], xmm0
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-%endm
-
-%macro GET_PARAM 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm7, [rdx]                 ;load filters
-
-    pshuflw     xmm6, xmm7, 11111111b       ;k3
-    pshufhw     xmm7, xmm7, 0b              ;k4
-    punpcklwd   xmm6, xmm6
-    punpckhwd   xmm7, xmm7
-
-    movq        xmm4, rcx                   ;rounding
-    pshufd      xmm4, xmm4, 0
-
-    pxor        xmm5, xmm5
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-%endm
-
-%macro APPLY_FILTER_8 1
-    punpcklbw   xmm0, xmm5
-    punpcklbw   xmm1, xmm5
-
-    pmullw      xmm0, xmm6
-    pmullw      xmm1, xmm7
-    paddsw      xmm0, xmm1
-    paddsw      xmm0, xmm4                  ;rounding
-    psraw       xmm0, 7                     ;shift
-    packuswb    xmm0, xmm0                  ;pack back to byte
-%if %1
-    movq        xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-    movq        [rdi], xmm0                 ;store the result
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-%endm
-
-%macro APPLY_FILTER_16 1
-    punpcklbw   xmm0, xmm5
-    punpcklbw   xmm1, xmm5
-    punpckhbw   xmm2, xmm5
-    punpckhbw   xmm3, xmm5
-
-    pmullw      xmm0, xmm6
-    pmullw      xmm1, xmm7
-    pmullw      xmm2, xmm6
-    pmullw      xmm3, xmm7
-
-    paddsw      xmm0, xmm1
-    paddsw      xmm2, xmm3
-
-    paddsw      xmm0, xmm4                  ;rounding
-    paddsw      xmm2, xmm4
-    psraw       xmm0, 7                     ;shift
-    psraw       xmm2, 7
-    packuswb    xmm0, xmm2                  ;pack back to byte
-%if %1
-    movdqu      xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-    movdqu      [rdi], xmm0                 ;store the result
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-%endm
-
-global sym(vpx_filter_block1d4_v2_sse2) PRIVATE
-sym(vpx_filter_block1d4_v2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM_4
-.loop:
-    movd        xmm0, [rsi]                 ;load src
-    movd        xmm1, [rsi + rax]
-
-    APPLY_FILTER_4 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(vpx_filter_block1d8_v2_sse2) PRIVATE
-sym(vpx_filter_block1d8_v2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movq        xmm0, [rsi]                 ;0
-    movq        xmm1, [rsi + rax]           ;1
-
-    APPLY_FILTER_8 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(vpx_filter_block1d16_v2_sse2) PRIVATE
-sym(vpx_filter_block1d16_v2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;0
-    movdqu      xmm1, [rsi + rax]           ;1
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm1
-
-    APPLY_FILTER_16 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-
UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_v2_avg_sse2) PRIVATE -sym(vpx_filter_block1d4_v2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movd xmm0, [rsi] ;load src - movd xmm1, [rsi + rax] - - APPLY_FILTER_4 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_v2_avg_sse2) PRIVATE -sym(vpx_filter_block1d8_v2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movq xmm0, [rsi] ;0 - movq xmm1, [rsi + rax] ;1 - - APPLY_FILTER_8 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_v2_avg_sse2) PRIVATE -sym(vpx_filter_block1d16_v2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm1, [rsi + rax] ;1 - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - APPLY_FILTER_16 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_h2_sse2) PRIVATE -sym(vpx_filter_block1d4_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_4 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_h2_sse2) PRIVATE -sym(vpx_filter_block1d8_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_8 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_h2_sse2) PRIVATE -sym(vpx_filter_block1d16_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 1] - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - APPLY_FILTER_16 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_h2_avg_sse2) PRIVATE -sym(vpx_filter_block1d4_h2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_4 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_h2_avg_sse2) PRIVATE -sym(vpx_filter_block1d8_h2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_8 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_h2_avg_sse2) PRIVATE -sym(vpx_filter_block1d16_h2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 1] - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - APPLY_FILTER_16 1 - jnz .loop - - ; begin epilog - 
pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm deleted file mode 100644 index 3c8cfd2253..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm +++ /dev/null @@ -1,422 +0,0 @@ -; -; Copyright (c) 2014 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "vpx_ports/x86_abi_support.asm" - -%macro GET_PARAM_4 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm3, [rdx] ;load filters - psrldq xmm3, 6 - packsswb xmm3, xmm3 - pshuflw xmm3, xmm3, 0b ;k3_k4 - - movq xmm2, rcx ;rounding - pshufd xmm2, xmm2, 0 - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height -%endm - -%macro APPLY_FILTER_4 1 - punpcklbw xmm0, xmm1 - pmaddubsw xmm0, xmm3 - - paddsw xmm0, xmm2 ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack to byte - -%if %1 - movd xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movd [rdi], xmm0 - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -%macro GET_PARAM 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm7, [rdx] ;load filters - psrldq xmm7, 6 - packsswb xmm7, xmm7 - pshuflw xmm7, xmm7, 0b ;k3_k4 - punpcklwd xmm7, xmm7 - - movq xmm6, rcx ;rounding - pshufd xmm6, xmm6, 0 - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height -%endm - -%macro APPLY_FILTER_8 1 - punpcklbw xmm0, xmm1 - pmaddubsw xmm0, xmm7 - - paddsw xmm0, xmm6 ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack back to byte - -%if %1 - movq xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movq [rdi], xmm0 ;store the result - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -%macro APPLY_FILTER_16 1 - punpcklbw xmm0, xmm1 - punpckhbw xmm2, xmm1 - pmaddubsw xmm0, xmm7 - pmaddubsw xmm2, xmm7 - - paddsw xmm0, xmm6 ;rounding - paddsw xmm2, xmm6 - psraw xmm0, 7 ;shift - psraw xmm2, 7 - packuswb xmm0, xmm2 ;pack back to byte - -%if %1 - movdqu xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movdqu [rdi], xmm0 ;store the result - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -global sym(vpx_filter_block1d4_v2_ssse3) PRIVATE -sym(vpx_filter_block1d4_v2_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movd xmm0, [rsi] ;load src - movd xmm1, [rsi + rax] - - APPLY_FILTER_4 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_v2_ssse3) PRIVATE -sym(vpx_filter_block1d8_v2_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movq xmm0, [rsi] ;0 - movq xmm1, [rsi + rax] ;1 - - APPLY_FILTER_8 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_v2_ssse3) PRIVATE 
-sym(vpx_filter_block1d16_v2_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm1, [rsi + rax] ;1 - movdqa xmm2, xmm0 - - APPLY_FILTER_16 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_v2_avg_ssse3) PRIVATE -sym(vpx_filter_block1d4_v2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movd xmm0, [rsi] ;load src - movd xmm1, [rsi + rax] - - APPLY_FILTER_4 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_v2_avg_ssse3) PRIVATE -sym(vpx_filter_block1d8_v2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movq xmm0, [rsi] ;0 - movq xmm1, [rsi + rax] ;1 - - APPLY_FILTER_8 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_v2_avg_ssse3) PRIVATE -sym(vpx_filter_block1d16_v2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm1, [rsi + rax] ;1 - movdqa xmm2, xmm0 - - APPLY_FILTER_16 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_h2_ssse3) PRIVATE -sym(vpx_filter_block1d4_h2_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_4 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_h2_ssse3) PRIVATE -sym(vpx_filter_block1d8_h2_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_8 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_h2_ssse3) PRIVATE -sym(vpx_filter_block1d16_h2_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 1] - movdqa xmm2, xmm0 - - APPLY_FILTER_16 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_h2_avg_ssse3) PRIVATE -sym(vpx_filter_block1d4_h2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_4 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_h2_avg_ssse3) PRIVATE -sym(vpx_filter_block1d8_h2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_8 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_h2_avg_ssse3) PRIVATE 
-sym(vpx_filter_block1d16_h2_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqu      xmm1, [rsi + 1]
-    movdqa      xmm2, xmm0
-
-    APPLY_FILTER_16 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret