summaryrefslogtreecommitdiff
path: root/thirdparty/libvpx/vp8
diff options
context:
space:
mode:
Diffstat (limited to 'thirdparty/libvpx/vp8')
-rw-r--r--thirdparty/libvpx/vp8/common/alloccommon.c190
-rw-r--r--thirdparty/libvpx/vp8/common/alloccommon.h31
-rw-r--r--thirdparty/libvpx/vp8/common/arm/loopfilter_arm.c181
-rw-r--r--thirdparty/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c591
-rw-r--r--thirdparty/libvpx/vp8/common/arm/neon/copymem_neon.c59
-rw-r--r--thirdparty/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c42
-rw-r--r--thirdparty/libvpx/vp8/common/arm/neon/dequant_idct_neon.c142
-rw-r--r--thirdparty/libvpx/vp8/common/arm/neon/dequantizeb_neon.c25
-rw-r--r--thirdparty/libvpx/vp8/common/arm/neon/idct_blk_neon.c96
-rw-r--r--thirdparty/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c63
-rw-r--r--thirdparty/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c185
-rw-r--r--thirdparty/libvpx/vp8/common/arm/neon/iwalsh_neon.c102
-rw-r--r--thirdparty/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c111
-rw-r--r--thirdparty/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c283
-rw-r--r--thirdparty/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c625
-rw-r--r--thirdparty/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c123
-rw-r--r--thirdparty/libvpx/vp8/common/arm/neon/sixtappredict_neon.c1377
-rw-r--r--thirdparty/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c550
-rw-r--r--thirdparty/libvpx/vp8/common/blockd.c22
-rw-r--r--thirdparty/libvpx/vp8/common/blockd.h312
-rw-r--r--thirdparty/libvpx/vp8/common/coefupdateprobs.h197
-rw-r--r--thirdparty/libvpx/vp8/common/common.h48
-rw-r--r--thirdparty/libvpx/vp8/common/copy_c.c32
-rw-r--r--thirdparty/libvpx/vp8/common/debugmodes.c155
-rw-r--r--thirdparty/libvpx/vp8/common/default_coef_probs.h200
-rw-r--r--thirdparty/libvpx/vp8/common/dequantize.c43
-rw-r--r--thirdparty/libvpx/vp8/common/entropy.c188
-rw-r--r--thirdparty/libvpx/vp8/common/entropy.h109
-rw-r--r--thirdparty/libvpx/vp8/common/entropymode.c171
-rw-r--r--thirdparty/libvpx/vp8/common/entropymode.h88
-rw-r--r--thirdparty/libvpx/vp8/common/entropymv.c49
-rw-r--r--thirdparty/libvpx/vp8/common/entropymv.h52
-rw-r--r--thirdparty/libvpx/vp8/common/extend.c188
-rw-r--r--thirdparty/libvpx/vp8/common/extend.h33
-rw-r--r--thirdparty/libvpx/vp8/common/filter.c493
-rw-r--r--thirdparty/libvpx/vp8/common/filter.h32
-rw-r--r--thirdparty/libvpx/vp8/common/findnearmv.c193
-rw-r--r--thirdparty/libvpx/vp8/common/findnearmv.h195
-rw-r--r--thirdparty/libvpx/vp8/common/generic/systemdependent.c106
-rw-r--r--thirdparty/libvpx/vp8/common/header.h51
-rw-r--r--thirdparty/libvpx/vp8/common/idct_blk.c90
-rw-r--r--thirdparty/libvpx/vp8/common/idctllm.c205
-rw-r--r--thirdparty/libvpx/vp8/common/invtrans.h70
-rw-r--r--thirdparty/libvpx/vp8/common/loopfilter.h113
-rw-r--r--thirdparty/libvpx/vp8/common/loopfilter_filters.c430
-rw-r--r--thirdparty/libvpx/vp8/common/mbpitch.c68
-rw-r--r--thirdparty/libvpx/vp8/common/modecont.c40
-rw-r--r--thirdparty/libvpx/vp8/common/modecont.h25
-rw-r--r--thirdparty/libvpx/vp8/common/mv.h36
-rw-r--r--thirdparty/libvpx/vp8/common/onyxc_int.h185
-rw-r--r--thirdparty/libvpx/vp8/common/onyxd.h63
-rw-r--r--thirdparty/libvpx/vp8/common/ppflags.h49
-rw-r--r--thirdparty/libvpx/vp8/common/quant_common.c135
-rw-r--r--thirdparty/libvpx/vp8/common/quant_common.h34
-rw-r--r--thirdparty/libvpx/vp8/common/reconinter.c544
-rw-r--r--thirdparty/libvpx/vp8/common/reconinter.h43
-rw-r--r--thirdparty/libvpx/vp8/common/reconintra.c117
-rw-r--r--thirdparty/libvpx/vp8/common/reconintra.h44
-rw-r--r--thirdparty/libvpx/vp8/common/reconintra4x4.c54
-rw-r--r--thirdparty/libvpx/vp8/common/reconintra4x4.h48
-rw-r--r--thirdparty/libvpx/vp8/common/rtcd.c19
-rw-r--r--thirdparty/libvpx/vp8/common/setupintrarecon.c39
-rw-r--r--thirdparty/libvpx/vp8/common/setupintrarecon.h45
-rw-r--r--thirdparty/libvpx/vp8/common/swapyv12buffer.c34
-rw-r--r--thirdparty/libvpx/vp8/common/swapyv12buffer.h27
-rw-r--r--thirdparty/libvpx/vp8/common/systemdependent.h27
-rw-r--r--thirdparty/libvpx/vp8/common/threading.h232
-rw-r--r--thirdparty/libvpx/vp8/common/treecoder.c143
-rw-r--r--thirdparty/libvpx/vp8/common/treecoder.h98
-rw-r--r--thirdparty/libvpx/vp8/common/vp8_entropymodedata.h254
-rw-r--r--thirdparty/libvpx/vp8/common/vp8_loopfilter.c661
-rw-r--r--thirdparty/libvpx/vp8/common/x86/copy_sse2.asm93
-rw-r--r--thirdparty/libvpx/vp8/common/x86/copy_sse3.asm146
-rw-r--r--thirdparty/libvpx/vp8/common/x86/dequantize_mmx.asm258
-rw-r--r--thirdparty/libvpx/vp8/common/x86/filter_x86.c35
-rw-r--r--thirdparty/libvpx/vp8/common/x86/filter_x86.h33
-rw-r--r--thirdparty/libvpx/vp8/common/x86/idct_blk_mmx.c128
-rw-r--r--thirdparty/libvpx/vp8/common/x86/idct_blk_sse2.c89
-rw-r--r--thirdparty/libvpx/vp8/common/x86/idctllm_mmx.asm295
-rw-r--r--thirdparty/libvpx/vp8/common/x86/idctllm_sse2.asm708
-rw-r--r--thirdparty/libvpx/vp8/common/x86/iwalsh_mmx.asm140
-rw-r--r--thirdparty/libvpx/vp8/common/x86/iwalsh_sse2.asm121
-rw-r--r--thirdparty/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm815
-rw-r--r--thirdparty/libvpx/vp8/common/x86/loopfilter_sse2.asm1640
-rw-r--r--thirdparty/libvpx/vp8/common/x86/loopfilter_x86.c198
-rw-r--r--thirdparty/libvpx/vp8/common/x86/recon_mmx.asm274
-rw-r--r--thirdparty/libvpx/vp8/common/x86/recon_sse2.asm116
-rw-r--r--thirdparty/libvpx/vp8/common/x86/subpixel_mmx.asm702
-rw-r--r--thirdparty/libvpx/vp8/common/x86/subpixel_sse2.asm1372
-rw-r--r--thirdparty/libvpx/vp8/common/x86/subpixel_ssse3.asm1508
-rw-r--r--thirdparty/libvpx/vp8/common/x86/vp8_asm_stubs.c625
-rw-r--r--thirdparty/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm1753
-rw-r--r--thirdparty/libvpx/vp8/decoder/dboolhuff.c77
-rw-r--r--thirdparty/libvpx/vp8/decoder/dboolhuff.h141
-rw-r--r--thirdparty/libvpx/vp8/decoder/decodeframe.c1397
-rw-r--r--thirdparty/libvpx/vp8/decoder/decodemv.c670
-rw-r--r--thirdparty/libvpx/vp8/decoder/decodemv.h26
-rw-r--r--thirdparty/libvpx/vp8/decoder/decoderthreading.h30
-rw-r--r--thirdparty/libvpx/vp8/decoder/detokenize.c245
-rw-r--r--thirdparty/libvpx/vp8/decoder/detokenize.h27
-rw-r--r--thirdparty/libvpx/vp8/decoder/onyxd_if.c521
-rw-r--r--thirdparty/libvpx/vp8/decoder/onyxd_int.h161
-rw-r--r--thirdparty/libvpx/vp8/decoder/threading.c928
-rw-r--r--thirdparty/libvpx/vp8/decoder/treereader.h49
-rw-r--r--thirdparty/libvpx/vp8/vp8_dx_iface.c828
105 files changed, 27854 insertions, 0 deletions
diff --git a/thirdparty/libvpx/vp8/common/alloccommon.c b/thirdparty/libvpx/vp8/common/alloccommon.c
new file mode 100644
index 0000000000..8dfd4ce203
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/alloccommon.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "alloccommon.h"
+#include "blockd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "onyxc_int.h"
+#include "findnearmv.h"
+#include "entropymode.h"
+#include "systemdependent.h"
+
+void vp8_de_alloc_frame_buffers(VP8_COMMON *oci)
+{
+ int i;
+ for (i = 0; i < NUM_YV12_BUFFERS; i++)
+ vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]);
+
+ vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
+#if CONFIG_POSTPROC
+ vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
+ if (oci->post_proc_buffer_int_used)
+ vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer_int);
+
+ vpx_free(oci->pp_limits_buffer);
+ oci->pp_limits_buffer = NULL;
+#endif
+
+ vpx_free(oci->above_context);
+ vpx_free(oci->mip);
+#if CONFIG_ERROR_CONCEALMENT
+ vpx_free(oci->prev_mip);
+ oci->prev_mip = NULL;
+#endif
+
+ oci->above_context = NULL;
+ oci->mip = NULL;
+}
+
+int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
+{
+ int i;
+
+ vp8_de_alloc_frame_buffers(oci);
+
+ /* our internal buffers are always multiples of 16 */
+ if ((width & 0xf) != 0)
+ width += 16 - (width & 0xf);
+
+ if ((height & 0xf) != 0)
+ height += 16 - (height & 0xf);
+
+
+ for (i = 0; i < NUM_YV12_BUFFERS; i++)
+ {
+ oci->fb_idx_ref_cnt[i] = 0;
+ oci->yv12_fb[i].flags = 0;
+ if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height, VP8BORDERINPIXELS) < 0)
+ goto allocation_fail;
+ }
+
+ oci->new_fb_idx = 0;
+ oci->lst_fb_idx = 1;
+ oci->gld_fb_idx = 2;
+ oci->alt_fb_idx = 3;
+
+ oci->fb_idx_ref_cnt[0] = 1;
+ oci->fb_idx_ref_cnt[1] = 1;
+ oci->fb_idx_ref_cnt[2] = 1;
+ oci->fb_idx_ref_cnt[3] = 1;
+
+ if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, VP8BORDERINPIXELS) < 0)
+ goto allocation_fail;
+
+ oci->mb_rows = height >> 4;
+ oci->mb_cols = width >> 4;
+ oci->MBs = oci->mb_rows * oci->mb_cols;
+ oci->mode_info_stride = oci->mb_cols + 1;
+ oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
+
+ if (!oci->mip)
+ goto allocation_fail;
+
+ oci->mi = oci->mip + oci->mode_info_stride + 1;
+
+ /* Allocation of previous mode info will be done in vp8_decode_frame()
+ * as it is a decoder only data */
+
+ oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1);
+
+ if (!oci->above_context)
+ goto allocation_fail;
+
+#if CONFIG_POSTPROC
+ if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, VP8BORDERINPIXELS) < 0)
+ goto allocation_fail;
+
+ oci->post_proc_buffer_int_used = 0;
+ memset(&oci->postproc_state, 0, sizeof(oci->postproc_state));
+ memset(oci->post_proc_buffer.buffer_alloc, 128,
+ oci->post_proc_buffer.frame_size);
+
+ /* Allocate buffer to store post-processing filter coefficients.
+ *
+ * Note: Round up mb_cols to support SIMD reads
+ */
+ oci->pp_limits_buffer = vpx_memalign(16, 24 * ((oci->mb_cols + 1) & ~1));
+ if (!oci->pp_limits_buffer)
+ goto allocation_fail;
+#endif
+
+ return 0;
+
+allocation_fail:
+ vp8_de_alloc_frame_buffers(oci);
+ return 1;
+}
+
+void vp8_setup_version(VP8_COMMON *cm)
+{
+ switch (cm->version)
+ {
+ case 0:
+ cm->no_lpf = 0;
+ cm->filter_type = NORMAL_LOOPFILTER;
+ cm->use_bilinear_mc_filter = 0;
+ cm->full_pixel = 0;
+ break;
+ case 1:
+ cm->no_lpf = 0;
+ cm->filter_type = SIMPLE_LOOPFILTER;
+ cm->use_bilinear_mc_filter = 1;
+ cm->full_pixel = 0;
+ break;
+ case 2:
+ cm->no_lpf = 1;
+ cm->filter_type = NORMAL_LOOPFILTER;
+ cm->use_bilinear_mc_filter = 1;
+ cm->full_pixel = 0;
+ break;
+ case 3:
+ cm->no_lpf = 1;
+ cm->filter_type = SIMPLE_LOOPFILTER;
+ cm->use_bilinear_mc_filter = 1;
+ cm->full_pixel = 1;
+ break;
+ default:
+ /*4,5,6,7 are reserved for future use*/
+ cm->no_lpf = 0;
+ cm->filter_type = NORMAL_LOOPFILTER;
+ cm->use_bilinear_mc_filter = 0;
+ cm->full_pixel = 0;
+ break;
+ }
+}
+void vp8_create_common(VP8_COMMON *oci)
+{
+ vp8_machine_specific_config(oci);
+
+ vp8_init_mbmode_probs(oci);
+ vp8_default_bmode_probs(oci->fc.bmode_prob);
+
+ oci->mb_no_coeff_skip = 1;
+ oci->no_lpf = 0;
+ oci->filter_type = NORMAL_LOOPFILTER;
+ oci->use_bilinear_mc_filter = 0;
+ oci->full_pixel = 0;
+ oci->multi_token_partition = ONE_PARTITION;
+ oci->clamp_type = RECON_CLAMP_REQUIRED;
+
+ /* Initialize reference frame sign bias structure to defaults */
+ memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
+
+ /* Default disable buffer to buffer copying */
+ oci->copy_buffer_to_gf = 0;
+ oci->copy_buffer_to_arf = 0;
+}
+
+void vp8_remove_common(VP8_COMMON *oci)
+{
+ vp8_de_alloc_frame_buffers(oci);
+}
diff --git a/thirdparty/libvpx/vp8/common/alloccommon.h b/thirdparty/libvpx/vp8/common/alloccommon.h
new file mode 100644
index 0000000000..93e99d76b1
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/alloccommon.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_ALLOCCOMMON_H_
+#define VP8_COMMON_ALLOCCOMMON_H_
+
+#include "onyxc_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_create_common(VP8_COMMON *oci);
+void vp8_remove_common(VP8_COMMON *oci);
+void vp8_de_alloc_frame_buffers(VP8_COMMON *oci);
+int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height);
+void vp8_setup_version(VP8_COMMON *oci);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_ALLOCCOMMON_H_
diff --git a/thirdparty/libvpx/vp8/common/arm/loopfilter_arm.c b/thirdparty/libvpx/vp8/common/arm/loopfilter_arm.c
new file mode 100644
index 0000000000..5840c2bbaa
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/loopfilter_arm.c
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/onyxc_int.h"
+
+#define prototype_loopfilter(sym) \
+ void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+ const unsigned char *limit, const unsigned char *thresh, int count)
+
+#if HAVE_MEDIA
+extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
+extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
+extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
+extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
+#endif
+
+#if HAVE_NEON
+typedef void loopfilter_y_neon(unsigned char *src, int pitch,
+ unsigned char blimit, unsigned char limit, unsigned char thresh);
+typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
+ unsigned char blimit, unsigned char limit, unsigned char thresh,
+ unsigned char *v);
+
+extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
+extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
+
+extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
+extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
+#endif
+
+#if HAVE_MEDIA
+/* ARMV6/MEDIA loopfilter functions*/
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit)
+{
+ vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit);
+}
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit)
+{
+ vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit);
+}
+#endif
+
+#if HAVE_NEON
+/* NEON loopfilter functions */
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ unsigned char mblim = *lfi->mblim;
+ unsigned char lim = *lfi->lim;
+ unsigned char hev_thr = *lfi->hev_thr;
+ vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
+
+ if (u_ptr)
+ vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
+}
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ unsigned char mblim = *lfi->mblim;
+ unsigned char lim = *lfi->lim;
+ unsigned char hev_thr = *lfi->hev_thr;
+
+ vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
+
+ if (u_ptr)
+ vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
+}
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ unsigned char blim = *lfi->blim;
+ unsigned char lim = *lfi->lim;
+ unsigned char hev_thr = *lfi->hev_thr;
+
+ vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr);
+ vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr);
+ vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr);
+
+ if (u_ptr)
+ vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride);
+}
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ unsigned char blim = *lfi->blim;
+ unsigned char lim = *lfi->lim;
+ unsigned char hev_thr = *lfi->hev_thr;
+
+ vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr);
+ vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr);
+ vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr);
+
+ if (u_ptr)
+ vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4);
+}
+#endif
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c
new file mode 100644
index 0000000000..bb6ea76ba4
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c
@@ -0,0 +1,591 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const uint8_t bifilter4_coeff[8][2] = {
+ {128, 0},
+ {112, 16},
+ { 96, 32},
+ { 80, 48},
+ { 64, 64},
+ { 48, 80},
+ { 32, 96},
+ { 16, 112}
+};
+
+void vp8_bilinear_predict8x4_neon(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8;
+ uint8x8_t d7u8, d9u8, d11u8, d22u8, d23u8, d24u8, d25u8, d26u8;
+ uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8;
+ uint16x8_t q1u16, q2u16, q3u16, q4u16;
+ uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16;
+
+ if (xoffset == 0) { // skip_1stpass_filter
+ d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d26u8 = vld1_u8(src_ptr);
+ } else {
+ q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q5u8 = vld1q_u8(src_ptr);
+
+ d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+
+ q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
+ q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
+ q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+ q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+
+ d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
+ d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
+ d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+ d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+
+ q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
+ q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
+ q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
+ q10u16 = vmlal_u8(q10u16, d11u8, d1u8);
+
+ d22u8 = vqrshrn_n_u16(q6u16, 7);
+ d23u8 = vqrshrn_n_u16(q7u16, 7);
+ d24u8 = vqrshrn_n_u16(q8u16, 7);
+ d25u8 = vqrshrn_n_u16(q9u16, 7);
+ d26u8 = vqrshrn_n_u16(q10u16, 7);
+ }
+
+ // secondpass_filter
+ if (yoffset == 0) { // skip_2ndpass_filter
+ vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d25u8);
+ } else {
+ d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+ q1u16 = vmull_u8(d22u8, d0u8);
+ q2u16 = vmull_u8(d23u8, d0u8);
+ q3u16 = vmull_u8(d24u8, d0u8);
+ q4u16 = vmull_u8(d25u8, d0u8);
+
+ q1u16 = vmlal_u8(q1u16, d23u8, d1u8);
+ q2u16 = vmlal_u8(q2u16, d24u8, d1u8);
+ q3u16 = vmlal_u8(q3u16, d25u8, d1u8);
+ q4u16 = vmlal_u8(q4u16, d26u8, d1u8);
+
+ d2u8 = vqrshrn_n_u16(q1u16, 7);
+ d3u8 = vqrshrn_n_u16(q2u16, 7);
+ d4u8 = vqrshrn_n_u16(q3u16, 7);
+ d5u8 = vqrshrn_n_u16(q4u16, 7);
+
+ vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d5u8);
+ }
+ return;
+}
+
+void vp8_bilinear_predict8x8_neon(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8, d11u8;
+ uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8;
+ uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8;
+ uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16;
+ uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16;
+
+ if (xoffset == 0) { // skip_1stpass_filter
+ d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d26u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d27u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d28u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d29u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+ d30u8 = vld1_u8(src_ptr);
+ } else {
+ q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+
+ d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+
+ q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
+ q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
+ q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+
+ d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
+ d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
+ d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+
+ q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
+ q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
+ q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
+
+ d22u8 = vqrshrn_n_u16(q6u16, 7);
+ d23u8 = vqrshrn_n_u16(q7u16, 7);
+ d24u8 = vqrshrn_n_u16(q8u16, 7);
+ d25u8 = vqrshrn_n_u16(q9u16, 7);
+
+ // first_pass filtering on the rest 5-line data
+ q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q5u8 = vld1q_u8(src_ptr);
+
+ q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
+ q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
+ q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+ q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+
+ d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
+ d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
+ d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+ d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+
+ q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
+ q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
+ q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
+ q10u16 = vmlal_u8(q10u16, d11u8, d1u8);
+
+ d26u8 = vqrshrn_n_u16(q6u16, 7);
+ d27u8 = vqrshrn_n_u16(q7u16, 7);
+ d28u8 = vqrshrn_n_u16(q8u16, 7);
+ d29u8 = vqrshrn_n_u16(q9u16, 7);
+ d30u8 = vqrshrn_n_u16(q10u16, 7);
+ }
+
+ // secondpass_filter
+ if (yoffset == 0) { // skip_2ndpass_filter
+ vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d25u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d26u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d27u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d28u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d29u8);
+ } else {
+ d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+ q1u16 = vmull_u8(d22u8, d0u8);
+ q2u16 = vmull_u8(d23u8, d0u8);
+ q3u16 = vmull_u8(d24u8, d0u8);
+ q4u16 = vmull_u8(d25u8, d0u8);
+ q5u16 = vmull_u8(d26u8, d0u8);
+ q6u16 = vmull_u8(d27u8, d0u8);
+ q7u16 = vmull_u8(d28u8, d0u8);
+ q8u16 = vmull_u8(d29u8, d0u8);
+
+ q1u16 = vmlal_u8(q1u16, d23u8, d1u8);
+ q2u16 = vmlal_u8(q2u16, d24u8, d1u8);
+ q3u16 = vmlal_u8(q3u16, d25u8, d1u8);
+ q4u16 = vmlal_u8(q4u16, d26u8, d1u8);
+ q5u16 = vmlal_u8(q5u16, d27u8, d1u8);
+ q6u16 = vmlal_u8(q6u16, d28u8, d1u8);
+ q7u16 = vmlal_u8(q7u16, d29u8, d1u8);
+ q8u16 = vmlal_u8(q8u16, d30u8, d1u8);
+
+ d2u8 = vqrshrn_n_u16(q1u16, 7);
+ d3u8 = vqrshrn_n_u16(q2u16, 7);
+ d4u8 = vqrshrn_n_u16(q3u16, 7);
+ d5u8 = vqrshrn_n_u16(q4u16, 7);
+ d6u8 = vqrshrn_n_u16(q5u16, 7);
+ d7u8 = vqrshrn_n_u16(q6u16, 7);
+ d8u8 = vqrshrn_n_u16(q7u16, 7);
+ d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+ vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d5u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d6u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d7u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d8u8); dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d9u8);
+ }
+ return;
+}
+
+void vp8_bilinear_predict16x16_neon(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ int i;
+ unsigned char tmp[272];
+ unsigned char *tmpp;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+ uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
+ uint8x8_t d19u8, d20u8, d21u8;
+ uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
+ uint8x16_t q11u8, q12u8, q13u8, q14u8, q15u8;
+ uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
+ uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
+
+ if (xoffset == 0) { // secondpass_bfilter16x16_only
+ d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+ q11u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ for (i = 4; i > 0; i--) {
+ q12u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q13u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q14u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+ q15u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+
+ q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
+ q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
+ q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
+ q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
+ q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
+ q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
+ q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
+ q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
+
+ q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
+ q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
+ q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
+ q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
+ q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
+ q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
+ q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
+ q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
+
+ d2u8 = vqrshrn_n_u16(q1u16, 7);
+ d3u8 = vqrshrn_n_u16(q2u16, 7);
+ d4u8 = vqrshrn_n_u16(q3u16, 7);
+ d5u8 = vqrshrn_n_u16(q4u16, 7);
+ d6u8 = vqrshrn_n_u16(q5u16, 7);
+ d7u8 = vqrshrn_n_u16(q6u16, 7);
+ d8u8 = vqrshrn_n_u16(q7u16, 7);
+ d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q2u8 = vcombine_u8(d4u8, d5u8);
+ q3u8 = vcombine_u8(d6u8, d7u8);
+ q4u8 = vcombine_u8(d8u8, d9u8);
+
+ q11u8 = q15u8;
+
+ vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch;
+ }
+ return;
+ }
+
+ if (yoffset == 0) { // firstpass_bfilter16x16_only
+ d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+
+ for (i = 4; i > 0 ; i--) {
+ d2u8 = vld1_u8(src_ptr);
+ d3u8 = vld1_u8(src_ptr + 8);
+ d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d5u8 = vld1_u8(src_ptr);
+ d6u8 = vld1_u8(src_ptr + 8);
+ d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d8u8 = vld1_u8(src_ptr);
+ d9u8 = vld1_u8(src_ptr + 8);
+ d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d11u8 = vld1_u8(src_ptr);
+ d12u8 = vld1_u8(src_ptr + 8);
+ d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+
+ q7u16 = vmull_u8(d2u8, d0u8);
+ q8u16 = vmull_u8(d3u8, d0u8);
+ q9u16 = vmull_u8(d5u8, d0u8);
+ q10u16 = vmull_u8(d6u8, d0u8);
+ q11u16 = vmull_u8(d8u8, d0u8);
+ q12u16 = vmull_u8(d9u8, d0u8);
+ q13u16 = vmull_u8(d11u8, d0u8);
+ q14u16 = vmull_u8(d12u8, d0u8);
+
+ d2u8 = vext_u8(d2u8, d3u8, 1);
+ d5u8 = vext_u8(d5u8, d6u8, 1);
+ d8u8 = vext_u8(d8u8, d9u8, 1);
+ d11u8 = vext_u8(d11u8, d12u8, 1);
+
+ q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
+ q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
+ q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
+
+ d3u8 = vext_u8(d3u8, d4u8, 1);
+ d6u8 = vext_u8(d6u8, d7u8, 1);
+ d9u8 = vext_u8(d9u8, d10u8, 1);
+ d12u8 = vext_u8(d12u8, d13u8, 1);
+
+ q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
+ q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
+ q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
+ q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
+
+ d14u8 = vqrshrn_n_u16(q7u16, 7);
+ d15u8 = vqrshrn_n_u16(q8u16, 7);
+ d16u8 = vqrshrn_n_u16(q9u16, 7);
+ d17u8 = vqrshrn_n_u16(q10u16, 7);
+ d18u8 = vqrshrn_n_u16(q11u16, 7);
+ d19u8 = vqrshrn_n_u16(q12u16, 7);
+ d20u8 = vqrshrn_n_u16(q13u16, 7);
+ d21u8 = vqrshrn_n_u16(q14u16, 7);
+
+ q7u8 = vcombine_u8(d14u8, d15u8);
+ q8u8 = vcombine_u8(d16u8, d17u8);
+ q9u8 = vcombine_u8(d18u8, d19u8);
+ q10u8 =vcombine_u8(d20u8, d21u8);
+
+ vst1q_u8((uint8_t *)dst_ptr, q7u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q8u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q9u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q10u8); dst_ptr += dst_pitch;
+ }
+ return;
+ }
+
+ d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+
+ d2u8 = vld1_u8(src_ptr);
+ d3u8 = vld1_u8(src_ptr + 8);
+ d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d5u8 = vld1_u8(src_ptr);
+ d6u8 = vld1_u8(src_ptr + 8);
+ d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d8u8 = vld1_u8(src_ptr);
+ d9u8 = vld1_u8(src_ptr + 8);
+ d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d11u8 = vld1_u8(src_ptr);
+ d12u8 = vld1_u8(src_ptr + 8);
+ d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+
+ // First Pass: output_height lines x output_width columns (17x16)
+ tmpp = tmp;
+ for (i = 3; i > 0; i--) {
+ q7u16 = vmull_u8(d2u8, d0u8);
+ q8u16 = vmull_u8(d3u8, d0u8);
+ q9u16 = vmull_u8(d5u8, d0u8);
+ q10u16 = vmull_u8(d6u8, d0u8);
+ q11u16 = vmull_u8(d8u8, d0u8);
+ q12u16 = vmull_u8(d9u8, d0u8);
+ q13u16 = vmull_u8(d11u8, d0u8);
+ q14u16 = vmull_u8(d12u8, d0u8);
+
+ d2u8 = vext_u8(d2u8, d3u8, 1);
+ d5u8 = vext_u8(d5u8, d6u8, 1);
+ d8u8 = vext_u8(d8u8, d9u8, 1);
+ d11u8 = vext_u8(d11u8, d12u8, 1);
+
+ q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
+ q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
+ q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
+
+ d3u8 = vext_u8(d3u8, d4u8, 1);
+ d6u8 = vext_u8(d6u8, d7u8, 1);
+ d9u8 = vext_u8(d9u8, d10u8, 1);
+ d12u8 = vext_u8(d12u8, d13u8, 1);
+
+ q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
+ q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
+ q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
+ q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
+
+ d14u8 = vqrshrn_n_u16(q7u16, 7);
+ d15u8 = vqrshrn_n_u16(q8u16, 7);
+ d16u8 = vqrshrn_n_u16(q9u16, 7);
+ d17u8 = vqrshrn_n_u16(q10u16, 7);
+ d18u8 = vqrshrn_n_u16(q11u16, 7);
+ d19u8 = vqrshrn_n_u16(q12u16, 7);
+ d20u8 = vqrshrn_n_u16(q13u16, 7);
+ d21u8 = vqrshrn_n_u16(q14u16, 7);
+
+ d2u8 = vld1_u8(src_ptr);
+ d3u8 = vld1_u8(src_ptr + 8);
+ d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d5u8 = vld1_u8(src_ptr);
+ d6u8 = vld1_u8(src_ptr + 8);
+ d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d8u8 = vld1_u8(src_ptr);
+ d9u8 = vld1_u8(src_ptr + 8);
+ d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+ d11u8 = vld1_u8(src_ptr);
+ d12u8 = vld1_u8(src_ptr + 8);
+ d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+
+ q7u8 = vcombine_u8(d14u8, d15u8);
+ q8u8 = vcombine_u8(d16u8, d17u8);
+ q9u8 = vcombine_u8(d18u8, d19u8);
+ q10u8 = vcombine_u8(d20u8, d21u8);
+
+ vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q9u8); tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q10u8); tmpp += 16;
+ }
+
+ // First-pass filtering for rest 5 lines
+ d14u8 = vld1_u8(src_ptr);
+ d15u8 = vld1_u8(src_ptr + 8);
+ d16u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+
+ q9u16 = vmull_u8(d2u8, d0u8);
+ q10u16 = vmull_u8(d3u8, d0u8);
+ q11u16 = vmull_u8(d5u8, d0u8);
+ q12u16 = vmull_u8(d6u8, d0u8);
+ q13u16 = vmull_u8(d8u8, d0u8);
+ q14u16 = vmull_u8(d9u8, d0u8);
+
+ d2u8 = vext_u8(d2u8, d3u8, 1);
+ d5u8 = vext_u8(d5u8, d6u8, 1);
+ d8u8 = vext_u8(d8u8, d9u8, 1);
+
+ q9u16 = vmlal_u8(q9u16, d2u8, d1u8);
+ q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
+ q13u16 = vmlal_u8(q13u16, d8u8, d1u8);
+
+ d3u8 = vext_u8(d3u8, d4u8, 1);
+ d6u8 = vext_u8(d6u8, d7u8, 1);
+ d9u8 = vext_u8(d9u8, d10u8, 1);
+
+ q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
+ q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
+ q14u16 = vmlal_u8(q14u16, d9u8, d1u8);
+
+ q1u16 = vmull_u8(d11u8, d0u8);
+ q2u16 = vmull_u8(d12u8, d0u8);
+ q3u16 = vmull_u8(d14u8, d0u8);
+ q4u16 = vmull_u8(d15u8, d0u8);
+
+ d11u8 = vext_u8(d11u8, d12u8, 1);
+ d14u8 = vext_u8(d14u8, d15u8, 1);
+
+ q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
+ q3u16 = vmlal_u8(q3u16, d14u8, d1u8);
+
+ d12u8 = vext_u8(d12u8, d13u8, 1);
+ d15u8 = vext_u8(d15u8, d16u8, 1);
+
+ q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
+ q4u16 = vmlal_u8(q4u16, d15u8, d1u8);
+
+ d10u8 = vqrshrn_n_u16(q9u16, 7);
+ d11u8 = vqrshrn_n_u16(q10u16, 7);
+ d12u8 = vqrshrn_n_u16(q11u16, 7);
+ d13u8 = vqrshrn_n_u16(q12u16, 7);
+ d14u8 = vqrshrn_n_u16(q13u16, 7);
+ d15u8 = vqrshrn_n_u16(q14u16, 7);
+ d16u8 = vqrshrn_n_u16(q1u16, 7);
+ d17u8 = vqrshrn_n_u16(q2u16, 7);
+ d18u8 = vqrshrn_n_u16(q3u16, 7);
+ d19u8 = vqrshrn_n_u16(q4u16, 7);
+
+ q5u8 = vcombine_u8(d10u8, d11u8);
+ q6u8 = vcombine_u8(d12u8, d13u8);
+ q7u8 = vcombine_u8(d14u8, d15u8);
+ q8u8 = vcombine_u8(d16u8, d17u8);
+ q9u8 = vcombine_u8(d18u8, d19u8);
+
+ vst1q_u8((uint8_t *)tmpp, q5u8); tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q6u8); tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q9u8);
+
+ // secondpass_filter
+ d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+ tmpp = tmp;
+ q11u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ for (i = 4; i > 0; i--) {
+ q12u8 = vld1q_u8(tmpp); tmpp += 16;
+ q13u8 = vld1q_u8(tmpp); tmpp += 16;
+ q14u8 = vld1q_u8(tmpp); tmpp += 16;
+ q15u8 = vld1q_u8(tmpp); tmpp += 16;
+
+ q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
+ q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
+ q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
+ q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
+ q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
+ q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
+ q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
+ q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
+
+ q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
+ q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
+ q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
+ q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
+ q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
+ q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
+ q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
+ q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
+
+ d2u8 = vqrshrn_n_u16(q1u16, 7);
+ d3u8 = vqrshrn_n_u16(q2u16, 7);
+ d4u8 = vqrshrn_n_u16(q3u16, 7);
+ d5u8 = vqrshrn_n_u16(q4u16, 7);
+ d6u8 = vqrshrn_n_u16(q5u16, 7);
+ d7u8 = vqrshrn_n_u16(q6u16, 7);
+ d8u8 = vqrshrn_n_u16(q7u16, 7);
+ d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q2u8 = vcombine_u8(d4u8, d5u8);
+ q3u8 = vcombine_u8(d6u8, d7u8);
+ q4u8 = vcombine_u8(d8u8, d9u8);
+
+ q11u8 = q15u8;
+
+ vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch;
+ }
+ return;
+}
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/copymem_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/copymem_neon.c
new file mode 100644
index 0000000000..deced115c1
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/copymem_neon.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_copy_mem8x4_neon(
+ unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride) {
+ uint8x8_t vtmp;
+ int r;
+
+ for (r = 0; r < 4; r++) {
+ vtmp = vld1_u8(src);
+ vst1_u8(dst, vtmp);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp8_copy_mem8x8_neon(
+ unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride) {
+ uint8x8_t vtmp;
+ int r;
+
+ for (r = 0; r < 8; r++) {
+ vtmp = vld1_u8(src);
+ vst1_u8(dst, vtmp);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp8_copy_mem16x16_neon(
+ unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride) {
+ int r;
+ uint8x16_t qtmp;
+
+ for (r = 0; r < 16; r++) {
+ qtmp = vld1q_u8(src);
+ vst1q_u8(dst, qtmp);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c
new file mode 100644
index 0000000000..ad5f41d7de
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_dc_only_idct_add_neon(
+ int16_t input_dc,
+ unsigned char *pred_ptr,
+ int pred_stride,
+ unsigned char *dst_ptr,
+ int dst_stride) {
+ int i;
+ uint16_t a1 = ((input_dc + 4) >> 3);
+ uint32x2_t d2u32 = vdup_n_u32(0);
+ uint8x8_t d2u8;
+ uint16x8_t q1u16;
+ uint16x8_t qAdd;
+
+ qAdd = vdupq_n_u16(a1);
+
+ for (i = 0; i < 2; i++) {
+ d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 0);
+ pred_ptr += pred_stride;
+ d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 1);
+ pred_ptr += pred_stride;
+
+ q1u16 = vaddw_u8(qAdd, vreinterpret_u8_u32(d2u32));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0);
+ dst_ptr += dst_stride;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1);
+ dst_ptr += dst_stride;
+ }
+}
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/dequant_idct_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/dequant_idct_neon.c
new file mode 100644
index 0000000000..58e11922c7
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/dequant_idct_neon.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const int16_t cospi8sqrt2minus1 = 20091;
+static const int16_t sinpi8sqrt2 = 35468;
+
+void vp8_dequant_idct_add_neon(
+ int16_t *input,
+ int16_t *dq,
+ unsigned char *dst,
+ int stride) {
+ unsigned char *dst0;
+ int32x2_t d14, d15;
+ int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
+ int16x8_t q1, q2, q3, q4, q5, q6;
+ int16x8_t qEmpty = vdupq_n_s16(0);
+ int32x2x2_t d2tmp0, d2tmp1;
+ int16x4x2_t d2tmp2, d2tmp3;
+
+ d14 = d15 = vdup_n_s32(0);
+
+ // load input
+ q3 = vld1q_s16(input);
+ vst1q_s16(input, qEmpty);
+ input += 8;
+ q4 = vld1q_s16(input);
+ vst1q_s16(input, qEmpty);
+
+ // load dq
+ q5 = vld1q_s16(dq);
+ dq += 8;
+ q6 = vld1q_s16(dq);
+
+ // load src from dst
+ dst0 = dst;
+ d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0);
+ dst0 += stride;
+ d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1);
+ dst0 += stride;
+ d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0);
+ dst0 += stride;
+ d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1);
+
+ q1 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q3),
+ vreinterpretq_u16_s16(q5)));
+ q2 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q4),
+ vreinterpretq_u16_s16(q6)));
+
+ d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2));
+ d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2));
+
+ q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2));
+
+ q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
+ q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
+
+ q3 = vshrq_n_s16(q3, 1);
+ q4 = vshrq_n_s16(q4, 1);
+
+ q3 = vqaddq_s16(q3, q2);
+ q4 = vqaddq_s16(q4, q2);
+
+ d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
+ d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
+
+ d2 = vqadd_s16(d12, d11);
+ d3 = vqadd_s16(d13, d10);
+ d4 = vqsub_s16(d13, d10);
+ d5 = vqsub_s16(d12, d11);
+
+ d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+ d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+ d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
+ vreinterpret_s16_s32(d2tmp1.val[0]));
+ d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
+ vreinterpret_s16_s32(d2tmp1.val[1]));
+
+ // loop 2
+ q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]);
+
+ q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
+ q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
+
+ d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);
+ d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);
+
+ q3 = vshrq_n_s16(q3, 1);
+ q4 = vshrq_n_s16(q4, 1);
+
+ q3 = vqaddq_s16(q3, q2);
+ q4 = vqaddq_s16(q4, q2);
+
+ d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
+ d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
+
+ d2 = vqadd_s16(d12, d11);
+ d3 = vqadd_s16(d13, d10);
+ d4 = vqsub_s16(d13, d10);
+ d5 = vqsub_s16(d12, d11);
+
+ d2 = vrshr_n_s16(d2, 3);
+ d3 = vrshr_n_s16(d3, 3);
+ d4 = vrshr_n_s16(d4, 3);
+ d5 = vrshr_n_s16(d5, 3);
+
+ d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+ d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+ d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
+ vreinterpret_s16_s32(d2tmp1.val[0]));
+ d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
+ vreinterpret_s16_s32(d2tmp1.val[1]));
+
+ q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]);
+ q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]);
+
+ q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1),
+ vreinterpret_u8_s32(d14)));
+ q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2),
+ vreinterpret_u8_s32(d15)));
+
+ d14 = vreinterpret_s32_u8(vqmovun_s16(q1));
+ d15 = vreinterpret_s32_u8(vqmovun_s16(q2));
+
+ dst0 = dst;
+ vst1_lane_s32((int32_t *)dst0, d14, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d14, 1);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d15, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d15, 1);
+ return;
+}
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/dequantizeb_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/dequantizeb_neon.c
new file mode 100644
index 0000000000..54e709dd3c
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/dequantizeb_neon.c
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vp8/common/blockd.h"
+
+void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) {
+ int16x8x2_t qQ, qDQC, qDQ;
+
+ qQ = vld2q_s16(d->qcoeff);
+ qDQC = vld2q_s16(DQC);
+
+ qDQ.val[0] = vmulq_s16(qQ.val[0], qDQC.val[0]);
+ qDQ.val[1] = vmulq_s16(qQ.val[1], qDQC.val[1]);
+
+ vst2q_s16(d->dqcoeff, qDQ);
+}
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/idct_blk_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/idct_blk_neon.c
new file mode 100644
index 0000000000..fb327a7260
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/idct_blk_neon.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+
+/* place these declarations here because we don't want to maintain them
+ * outside of this scope
+ */
+void idct_dequant_full_2x_neon(short *q, short *dq,
+ unsigned char *dst, int stride);
+void idct_dequant_0_2x_neon(short *q, short dq,
+ unsigned char *dst, int stride);
+
+
+void vp8_dequant_idct_add_y_block_neon(short *q, short *dq,
+ unsigned char *dst,
+ int stride, char *eobs)
+{
+ int i;
+
+ for (i = 0; i < 4; i++)
+ {
+ if (((short *)(eobs))[0])
+ {
+ if (((short *)eobs)[0] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dst, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dst, stride);
+ }
+
+ if (((short *)(eobs))[1])
+ {
+ if (((short *)eobs)[1] & 0xfefe)
+ idct_dequant_full_2x_neon (q+32, dq, dst+8, stride);
+ else
+ idct_dequant_0_2x_neon (q+32, dq[0], dst+8, stride);
+ }
+ q += 64;
+ dst += 4*stride;
+ eobs += 4;
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq,
+ unsigned char *dstu,
+ unsigned char *dstv,
+ int stride, char *eobs)
+{
+ if (((short *)(eobs))[0])
+ {
+ if (((short *)eobs)[0] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dstu, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dstu, stride);
+ }
+
+ q += 32;
+ dstu += 4*stride;
+
+ if (((short *)(eobs))[1])
+ {
+ if (((short *)eobs)[1] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dstu, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dstu, stride);
+ }
+
+ q += 32;
+
+ if (((short *)(eobs))[2])
+ {
+ if (((short *)eobs)[2] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dstv, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dstv, stride);
+ }
+
+ q += 32;
+ dstv += 4*stride;
+
+ if (((short *)(eobs))[3])
+ {
+ if (((short *)eobs)[3] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dstv, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dstv, stride);
+ }
+}
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
new file mode 100644
index 0000000000..e6f862fa89
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void idct_dequant_0_2x_neon(
+ int16_t *q,
+ int16_t dq,
+ unsigned char *dst,
+ int stride) {
+ unsigned char *dst0;
+ int i, a0, a1;
+ int16x8x2_t q2Add;
+ int32x2_t d2s32 = vdup_n_s32(0),
+ d4s32 = vdup_n_s32(0);
+ uint8x8_t d2u8, d4u8;
+ uint16x8_t q1u16, q2u16;
+
+ a0 = ((q[0] * dq) + 4) >> 3;
+ a1 = ((q[16] * dq) + 4) >> 3;
+ q[0] = q[16] = 0;
+ q2Add.val[0] = vdupq_n_s16((int16_t)a0);
+ q2Add.val[1] = vdupq_n_s16((int16_t)a1);
+
+ for (i = 0; i < 2; i++, dst += 4) {
+ dst0 = dst;
+ d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0);
+ dst0 += stride;
+ d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1);
+ dst0 += stride;
+ d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0);
+ dst0 += stride;
+ d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1);
+
+ q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
+ vreinterpret_u8_s32(d2s32));
+ q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
+ vreinterpret_u8_s32(d4s32));
+
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+ d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
+
+ d2s32 = vreinterpret_s32_u8(d2u8);
+ d4s32 = vreinterpret_s32_u8(d4u8);
+
+ dst0 = dst;
+ vst1_lane_s32((int32_t *)dst0, d2s32, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d2s32, 1);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d4s32, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d4s32, 1);
+ }
+ return;
+}
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
new file mode 100644
index 0000000000..a60ed46b76
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const int16_t cospi8sqrt2minus1 = 20091;
+static const int16_t sinpi8sqrt2 = 17734;
+// because the lowest bit in 0x8a8c is 0, we can pre-shift this
+
+void idct_dequant_full_2x_neon(
+ int16_t *q,
+ int16_t *dq,
+ unsigned char *dst,
+ int stride) {
+ unsigned char *dst0, *dst1;
+ int32x2_t d28, d29, d30, d31;
+ int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
+ int16x8_t qEmpty = vdupq_n_s16(0);
+ int32x4x2_t q2tmp0, q2tmp1;
+ int16x8x2_t q2tmp2, q2tmp3;
+ int16x4_t dLow0, dLow1, dHigh0, dHigh1;
+
+ d28 = d29 = d30 = d31 = vdup_n_s32(0);
+
+ // load dq
+ q0 = vld1q_s16(dq);
+ dq += 8;
+ q1 = vld1q_s16(dq);
+
+ // load q
+ q2 = vld1q_s16(q);
+ vst1q_s16(q, qEmpty);
+ q += 8;
+ q3 = vld1q_s16(q);
+ vst1q_s16(q, qEmpty);
+ q += 8;
+ q4 = vld1q_s16(q);
+ vst1q_s16(q, qEmpty);
+ q += 8;
+ q5 = vld1q_s16(q);
+ vst1q_s16(q, qEmpty);
+
+ // load src from dst
+ dst0 = dst;
+ dst1 = dst + 4;
+ d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
+ dst0 += stride;
+ d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
+ dst1 += stride;
+ d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
+ dst0 += stride;
+ d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
+ dst1 += stride;
+
+ d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
+ dst0 += stride;
+ d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
+ dst1 += stride;
+ d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
+ d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);
+
+ q2 = vmulq_s16(q2, q0);
+ q3 = vmulq_s16(q3, q1);
+ q4 = vmulq_s16(q4, q0);
+ q5 = vmulq_s16(q5, q1);
+
+ // vswp
+ dLow0 = vget_low_s16(q2);
+ dHigh0 = vget_high_s16(q2);
+ dLow1 = vget_low_s16(q4);
+ dHigh1 = vget_high_s16(q4);
+ q2 = vcombine_s16(dLow0, dLow1);
+ q4 = vcombine_s16(dHigh0, dHigh1);
+
+ dLow0 = vget_low_s16(q3);
+ dHigh0 = vget_high_s16(q3);
+ dLow1 = vget_low_s16(q5);
+ dHigh1 = vget_high_s16(q5);
+ q3 = vcombine_s16(dLow0, dLow1);
+ q5 = vcombine_s16(dHigh0, dHigh1);
+
+ q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
+ q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
+ q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
+ q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);
+
+ q10 = vqaddq_s16(q2, q3);
+ q11 = vqsubq_s16(q2, q3);
+
+ q8 = vshrq_n_s16(q8, 1);
+ q9 = vshrq_n_s16(q9, 1);
+
+ q4 = vqaddq_s16(q4, q8);
+ q5 = vqaddq_s16(q5, q9);
+
+ q2 = vqsubq_s16(q6, q5);
+ q3 = vqaddq_s16(q7, q4);
+
+ q4 = vqaddq_s16(q10, q3);
+ q5 = vqaddq_s16(q11, q2);
+ q6 = vqsubq_s16(q11, q2);
+ q7 = vqsubq_s16(q10, q3);
+
+ q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
+ q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+ q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+ vreinterpretq_s16_s32(q2tmp1.val[0]));
+ q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+ vreinterpretq_s16_s32(q2tmp1.val[1]));
+
+ // loop 2
+ q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
+ q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
+ q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
+ q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);
+
+ q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+ q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+
+ q10 = vshrq_n_s16(q10, 1);
+ q11 = vshrq_n_s16(q11, 1);
+
+ q10 = vqaddq_s16(q2tmp2.val[1], q10);
+ q11 = vqaddq_s16(q2tmp3.val[1], q11);
+
+ q8 = vqsubq_s16(q8, q11);
+ q9 = vqaddq_s16(q9, q10);
+
+ q4 = vqaddq_s16(q2, q9);
+ q5 = vqaddq_s16(q3, q8);
+ q6 = vqsubq_s16(q3, q8);
+ q7 = vqsubq_s16(q2, q9);
+
+ q4 = vrshrq_n_s16(q4, 3);
+ q5 = vrshrq_n_s16(q5, 3);
+ q6 = vrshrq_n_s16(q6, 3);
+ q7 = vrshrq_n_s16(q7, 3);
+
+ q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
+ q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+ q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+ vreinterpretq_s16_s32(q2tmp1.val[0]));
+ q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+ vreinterpretq_s16_s32(q2tmp1.val[1]));
+
+ q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]),
+ vreinterpret_u8_s32(d28)));
+ q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]),
+ vreinterpret_u8_s32(d29)));
+ q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]),
+ vreinterpret_u8_s32(d30)));
+ q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]),
+ vreinterpret_u8_s32(d31)));
+
+ d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
+ d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
+ d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
+ d31 = vreinterpret_s32_u8(vqmovun_s16(q7));
+
+ dst0 = dst;
+ dst1 = dst + 4;
+ vst1_lane_s32((int32_t *)dst0, d28, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst1, d28, 1);
+ dst1 += stride;
+ vst1_lane_s32((int32_t *)dst0, d29, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst1, d29, 1);
+ dst1 += stride;
+
+ vst1_lane_s32((int32_t *)dst0, d30, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst1, d30, 1);
+ dst1 += stride;
+ vst1_lane_s32((int32_t *)dst0, d31, 0);
+ vst1_lane_s32((int32_t *)dst1, d31, 1);
+ return;
+}
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/iwalsh_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/iwalsh_neon.c
new file mode 100644
index 0000000000..6ea9dd712a
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/iwalsh_neon.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_short_inv_walsh4x4_neon(
+ int16_t *input,
+ int16_t *mb_dqcoeff) {
+ int16x8_t q0s16, q1s16, q2s16, q3s16;
+ int16x4_t d4s16, d5s16, d6s16, d7s16;
+ int16x4x2_t v2tmp0, v2tmp1;
+ int32x2x2_t v2tmp2, v2tmp3;
+ int16x8_t qAdd3;
+
+ q0s16 = vld1q_s16(input);
+ q1s16 = vld1q_s16(input + 8);
+
+ // 1st for loop
+ d4s16 = vadd_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
+ d6s16 = vadd_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
+ d5s16 = vsub_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
+ d7s16 = vsub_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
+
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+
+ q0s16 = vaddq_s16(q2s16, q3s16);
+ q1s16 = vsubq_s16(q2s16, q3s16);
+
+ v2tmp2 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(q0s16)),
+ vreinterpret_s32_s16(vget_low_s16(q1s16)));
+ v2tmp3 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(q0s16)),
+ vreinterpret_s32_s16(vget_high_s16(q1s16)));
+ v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),
+ vreinterpret_s16_s32(v2tmp3.val[0]));
+ v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),
+ vreinterpret_s16_s32(v2tmp3.val[1]));
+
+ // 2nd for loop
+ d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+ d6s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ d5s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+ d7s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+
+ qAdd3 = vdupq_n_s16(3);
+
+ q0s16 = vaddq_s16(q2s16, q3s16);
+ q1s16 = vsubq_s16(q2s16, q3s16);
+
+ q0s16 = vaddq_s16(q0s16, qAdd3);
+ q1s16 = vaddq_s16(q1s16, qAdd3);
+
+ q0s16 = vshrq_n_s16(q0s16, 3);
+ q1s16 = vshrq_n_s16(q1s16, 3);
+
+ // store
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 0);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 0);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 0);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 0);
+ mb_dqcoeff += 16;
+
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 1);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 1);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 1);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 1);
+ mb_dqcoeff += 16;
+
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 2);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 2);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 2);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 2);
+ mb_dqcoeff += 16;
+
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 3);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 3);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 3);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 3);
+ mb_dqcoeff += 16;
+ return;
+}
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
new file mode 100644
index 0000000000..b25686ffb8
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+
+static INLINE void vp8_loop_filter_simple_horizontal_edge_neon(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit) {
+ uint8_t *sp;
+ uint8x16_t qblimit, q0u8;
+ uint8x16_t q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q14u8, q15u8;
+ int16x8_t q2s16, q3s16, q13s16;
+ int8x8_t d8s8, d9s8;
+ int8x16_t q2s8, q3s8, q4s8, q10s8, q11s8, q14s8;
+
+ qblimit = vdupq_n_u8(*blimit);
+
+ sp = s - (p << 1);
+ q5u8 = vld1q_u8(sp);
+ sp += p;
+ q6u8 = vld1q_u8(sp);
+ sp += p;
+ q7u8 = vld1q_u8(sp);
+ sp += p;
+ q8u8 = vld1q_u8(sp);
+
+ q15u8 = vabdq_u8(q6u8, q7u8);
+ q14u8 = vabdq_u8(q5u8, q8u8);
+
+ q15u8 = vqaddq_u8(q15u8, q15u8);
+ q14u8 = vshrq_n_u8(q14u8, 1);
+ q0u8 = vdupq_n_u8(0x80);
+ q13s16 = vdupq_n_s16(3);
+ q15u8 = vqaddq_u8(q15u8, q14u8);
+
+ q5u8 = veorq_u8(q5u8, q0u8);
+ q6u8 = veorq_u8(q6u8, q0u8);
+ q7u8 = veorq_u8(q7u8, q0u8);
+ q8u8 = veorq_u8(q8u8, q0u8);
+
+ q15u8 = vcgeq_u8(qblimit, q15u8);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)),
+ vget_low_s8(vreinterpretq_s8_u8(q6u8)));
+ q3s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7u8)),
+ vget_high_s8(vreinterpretq_s8_u8(q6u8)));
+
+ q4s8 = vqsubq_s8(vreinterpretq_s8_u8(q5u8),
+ vreinterpretq_s8_u8(q8u8));
+
+ q2s16 = vmulq_s16(q2s16, q13s16);
+ q3s16 = vmulq_s16(q3s16, q13s16);
+
+ q10u8 = vdupq_n_u8(3);
+ q9u8 = vdupq_n_u8(4);
+
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8));
+ q3s16 = vaddw_s8(q3s16, vget_high_s8(q4s8));
+
+ d8s8 = vqmovn_s16(q2s16);
+ d9s8 = vqmovn_s16(q3s16);
+ q4s8 = vcombine_s8(d8s8, d9s8);
+
+ q14s8 = vandq_s8(q4s8, vreinterpretq_s8_u8(q15u8));
+
+ q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q10u8));
+ q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q9u8));
+ q2s8 = vshrq_n_s8(q2s8, 3);
+ q3s8 = vshrq_n_s8(q3s8, 3);
+
+ q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6u8), q2s8);
+ q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7u8), q3s8);
+
+ q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+ q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+
+ vst1q_u8(s, q7u8);
+ s -= p;
+ vst1q_u8(s, q6u8);
+ return;
+}
+
+void vp8_loop_filter_bhs_neon(
+ unsigned char *y_ptr,
+ int y_stride,
+ const unsigned char *blimit) {
+ y_ptr += y_stride * 4;
+ vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+ y_ptr += y_stride * 4;
+ vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+ y_ptr += y_stride * 4;
+ vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+ return;
+}
+
+void vp8_loop_filter_mbhs_neon(
+ unsigned char *y_ptr,
+ int y_stride,
+ const unsigned char *blimit) {
+ vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+ return;
+}
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
new file mode 100644
index 0000000000..921bcad698
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
@@ -0,0 +1,283 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+#include "vpx_ports/arm.h"
+
+#ifdef VPX_INCOMPATIBLE_GCC
+static INLINE void write_2x4(unsigned char *dst, int pitch,
+ const uint8x8x2_t result) {
+ /*
+ * uint8x8x2_t result
+ 00 01 02 03 | 04 05 06 07
+ 10 11 12 13 | 14 15 16 17
+ ---
+ * after vtrn_u8
+ 00 10 02 12 | 04 14 06 16
+ 01 11 03 13 | 05 15 07 17
+ */
+ const uint8x8x2_t r01_u8 = vtrn_u8(result.val[0],
+ result.val[1]);
+ const uint16x4_t x_0_4 = vreinterpret_u16_u8(r01_u8.val[0]);
+ const uint16x4_t x_1_5 = vreinterpret_u16_u8(r01_u8.val[1]);
+ vst1_lane_u16((uint16_t *)dst, x_0_4, 0);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_1_5, 0);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_0_4, 1);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_1_5, 1);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_0_4, 2);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_1_5, 2);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_0_4, 3);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_1_5, 3);
+}
+
+static INLINE void write_2x8(unsigned char *dst, int pitch,
+ const uint8x8x2_t result,
+ const uint8x8x2_t result2) {
+ write_2x4(dst, pitch, result);
+ dst += pitch * 8;
+ write_2x4(dst, pitch, result2);
+}
+#else
+static INLINE void write_2x8(unsigned char *dst, int pitch,
+ const uint8x8x2_t result,
+ const uint8x8x2_t result2) {
+ vst2_lane_u8(dst, result, 0);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 1);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 2);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 3);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 4);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 5);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 6);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 7);
+ dst += pitch;
+
+ vst2_lane_u8(dst, result2, 0);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 1);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 2);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 3);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 4);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 5);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 6);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 7);
+}
+#endif // VPX_INCOMPATIBLE_GCC
+
+
+#ifdef VPX_INCOMPATIBLE_GCC
+static INLINE
+uint8x8x4_t read_4x8(unsigned char *src, int pitch) {
+ uint8x8x4_t x;
+ const uint8x8_t a = vld1_u8(src);
+ const uint8x8_t b = vld1_u8(src + pitch * 1);
+ const uint8x8_t c = vld1_u8(src + pitch * 2);
+ const uint8x8_t d = vld1_u8(src + pitch * 3);
+ const uint8x8_t e = vld1_u8(src + pitch * 4);
+ const uint8x8_t f = vld1_u8(src + pitch * 5);
+ const uint8x8_t g = vld1_u8(src + pitch * 6);
+ const uint8x8_t h = vld1_u8(src + pitch * 7);
+ const uint32x2x2_t r04_u32 = vtrn_u32(vreinterpret_u32_u8(a),
+ vreinterpret_u32_u8(e));
+ const uint32x2x2_t r15_u32 = vtrn_u32(vreinterpret_u32_u8(b),
+ vreinterpret_u32_u8(f));
+ const uint32x2x2_t r26_u32 = vtrn_u32(vreinterpret_u32_u8(c),
+ vreinterpret_u32_u8(g));
+ const uint32x2x2_t r37_u32 = vtrn_u32(vreinterpret_u32_u8(d),
+ vreinterpret_u32_u8(h));
+ const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]),
+ vreinterpret_u16_u32(r26_u32.val[0]));
+ const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]),
+ vreinterpret_u16_u32(r37_u32.val[0]));
+ const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
+ vreinterpret_u8_u16(r13_u16.val[0]));
+ const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
+ vreinterpret_u8_u16(r13_u16.val[1]));
+ /*
+ * after vtrn_u32
+ 00 01 02 03 | 40 41 42 43
+ 10 11 12 13 | 50 51 52 53
+ 20 21 22 23 | 60 61 62 63
+ 30 31 32 33 | 70 71 72 73
+ ---
+ * after vtrn_u16
+ 00 01 20 21 | 40 41 60 61
+ 02 03 22 23 | 42 43 62 63
+ 10 11 30 31 | 50 51 70 71
+ 12 13 32 33 | 52 52 72 73
+
+ 00 01 20 21 | 40 41 60 61
+ 10 11 30 31 | 50 51 70 71
+ 02 03 22 23 | 42 43 62 63
+ 12 13 32 33 | 52 52 72 73
+ ---
+ * after vtrn_u8
+ 00 10 20 30 | 40 50 60 70
+ 01 11 21 31 | 41 51 61 71
+ 02 12 22 32 | 42 52 62 72
+ 03 13 23 33 | 43 53 63 73
+ */
+ x.val[0] = r01_u8.val[0];
+ x.val[1] = r01_u8.val[1];
+ x.val[2] = r23_u8.val[0];
+ x.val[3] = r23_u8.val[1];
+
+ return x;
+}
+#else
+static INLINE
+uint8x8x4_t read_4x8(unsigned char *src, int pitch) {
+ uint8x8x4_t x;
+ x.val[0] = x.val[1] = x.val[2] = x.val[3] = vdup_n_u8(0);
+ x = vld4_lane_u8(src, x, 0);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 1);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 2);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 3);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 4);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 5);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 6);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 7);
+ return x;
+}
+#endif // VPX_INCOMPATIBLE_GCC
+
+static INLINE void vp8_loop_filter_simple_vertical_edge_neon(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit) {
+ unsigned char *src1;
+ uint8x16_t qblimit, q0u8;
+ uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8;
+ int16x8_t q2s16, q13s16, q11s16;
+ int8x8_t d28s8, d29s8;
+ int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8;
+ uint8x8x4_t d0u8x4; // d6, d7, d8, d9
+ uint8x8x4_t d1u8x4; // d10, d11, d12, d13
+ uint8x8x2_t d2u8x2; // d12, d13
+ uint8x8x2_t d3u8x2; // d14, d15
+
+ qblimit = vdupq_n_u8(*blimit);
+
+ src1 = s - 2;
+ d0u8x4 = read_4x8(src1, p);
+ src1 += p * 8;
+ d1u8x4 = read_4x8(src1, p);
+
+ q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]); // d6 d10
+ q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]); // d8 d12
+ q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]); // d7 d11
+ q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]); // d9 d13
+
+ q15u8 = vabdq_u8(q5u8, q4u8);
+ q14u8 = vabdq_u8(q3u8, q6u8);
+
+ q15u8 = vqaddq_u8(q15u8, q15u8);
+ q14u8 = vshrq_n_u8(q14u8, 1);
+ q0u8 = vdupq_n_u8(0x80);
+ q11s16 = vdupq_n_s16(3);
+ q15u8 = vqaddq_u8(q15u8, q14u8);
+
+ q3u8 = veorq_u8(q3u8, q0u8);
+ q4u8 = veorq_u8(q4u8, q0u8);
+ q5u8 = veorq_u8(q5u8, q0u8);
+ q6u8 = veorq_u8(q6u8, q0u8);
+
+ q15u8 = vcgeq_u8(qblimit, q15u8);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)),
+ vget_low_s8(vreinterpretq_s8_u8(q5u8)));
+ q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)),
+ vget_high_s8(vreinterpretq_s8_u8(q5u8)));
+
+ q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8),
+ vreinterpretq_s8_u8(q6u8));
+
+ q2s16 = vmulq_s16(q2s16, q11s16);
+ q13s16 = vmulq_s16(q13s16, q11s16);
+
+ q11u8 = vdupq_n_u8(3);
+ q12u8 = vdupq_n_u8(4);
+
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8));
+ q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8));
+
+ d28s8 = vqmovn_s16(q2s16);
+ d29s8 = vqmovn_s16(q13s16);
+ q14s8 = vcombine_s8(d28s8, d29s8);
+
+ q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8));
+
+ q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8));
+ q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8));
+ q2s8 = vshrq_n_s8(q2s8, 3);
+ q14s8 = vshrq_n_s8(q3s8, 3);
+
+ q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8);
+ q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8);
+
+ q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+ q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+
+ d2u8x2.val[0] = vget_low_u8(q6u8); // d12
+ d2u8x2.val[1] = vget_low_u8(q7u8); // d14
+ d3u8x2.val[0] = vget_high_u8(q6u8); // d13
+ d3u8x2.val[1] = vget_high_u8(q7u8); // d15
+
+ src1 = s - 1;
+ write_2x8(src1, p, d2u8x2, d3u8x2);
+}
+
+void vp8_loop_filter_bvs_neon(
+ unsigned char *y_ptr,
+ int y_stride,
+ const unsigned char *blimit) {
+ y_ptr += 4;
+ vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+ y_ptr += 4;
+ vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+ y_ptr += 4;
+ vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+ return;
+}
+
+void vp8_loop_filter_mbvs_neon(
+ unsigned char *y_ptr,
+ int y_stride,
+ const unsigned char *blimit) {
+ vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+ return;
+}
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c
new file mode 100644
index 0000000000..5351f4be66
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c
@@ -0,0 +1,625 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+
+static INLINE void vp8_mbloop_filter_neon(
+ uint8x16_t qblimit, // mblimit
+ uint8x16_t qlimit, // limit
+ uint8x16_t qthresh, // thresh
+ uint8x16_t q3, // p2
+ uint8x16_t q4, // p2
+ uint8x16_t q5, // p1
+ uint8x16_t q6, // p0
+ uint8x16_t q7, // q0
+ uint8x16_t q8, // q1
+ uint8x16_t q9, // q2
+ uint8x16_t q10, // q3
+ uint8x16_t *q4r, // p1
+ uint8x16_t *q5r, // p1
+ uint8x16_t *q6r, // p0
+ uint8x16_t *q7r, // q0
+ uint8x16_t *q8r, // q1
+ uint8x16_t *q9r) { // q1
+ uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8;
+ uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16;
+ int8x16_t q0s8, q12s8, q14s8, q15s8;
+ int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29;
+
+ q11u8 = vabdq_u8(q3, q4);
+ q12u8 = vabdq_u8(q4, q5);
+ q13u8 = vabdq_u8(q5, q6);
+ q14u8 = vabdq_u8(q8, q7);
+ q1u8 = vabdq_u8(q9, q8);
+ q0u8 = vabdq_u8(q10, q9);
+
+ q11u8 = vmaxq_u8(q11u8, q12u8);
+ q12u8 = vmaxq_u8(q13u8, q14u8);
+ q1u8 = vmaxq_u8(q1u8, q0u8);
+ q15u8 = vmaxq_u8(q11u8, q12u8);
+
+ q12u8 = vabdq_u8(q6, q7);
+
+ // vp8_hevmask
+ q13u8 = vcgtq_u8(q13u8, qthresh);
+ q14u8 = vcgtq_u8(q14u8, qthresh);
+ q15u8 = vmaxq_u8(q15u8, q1u8);
+
+ q15u8 = vcgeq_u8(qlimit, q15u8);
+
+ q1u8 = vabdq_u8(q5, q8);
+ q12u8 = vqaddq_u8(q12u8, q12u8);
+
+ // vp8_filter() function
+ // convert to signed
+ q0u8 = vdupq_n_u8(0x80);
+ q9 = veorq_u8(q9, q0u8);
+ q8 = veorq_u8(q8, q0u8);
+ q7 = veorq_u8(q7, q0u8);
+ q6 = veorq_u8(q6, q0u8);
+ q5 = veorq_u8(q5, q0u8);
+ q4 = veorq_u8(q4, q0u8);
+
+ q1u8 = vshrq_n_u8(q1u8, 1);
+ q12u8 = vqaddq_u8(q12u8, q1u8);
+
+ q14u8 = vorrq_u8(q13u8, q14u8);
+ q12u8 = vcgeq_u8(qblimit, q12u8);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+ vget_low_s8(vreinterpretq_s8_u8(q6)));
+ q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+ vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+ q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+ vreinterpretq_s8_u8(q8));
+
+ q11s16 = vdupq_n_s16(3);
+ q2s16 = vmulq_s16(q2s16, q11s16);
+ q13s16 = vmulq_s16(q13s16, q11s16);
+
+ q15u8 = vandq_u8(q15u8, q12u8);
+
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+ q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8));
+
+ q12u8 = vdupq_n_u8(3);
+ q11u8 = vdupq_n_u8(4);
+ // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+ d2 = vqmovn_s16(q2s16);
+ d3 = vqmovn_s16(q13s16);
+ q1s8 = vcombine_s8(d2, d3);
+ q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8));
+ q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+ q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8));
+ q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8));
+ q2s8 = vshrq_n_s8(q2s8, 3);
+ q13s8 = vshrq_n_s8(q13s8, 3);
+
+ q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8);
+ q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8);
+
+ q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+ q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63);
+ d5 = vdup_n_s8(9);
+ d4 = vdup_n_s8(18);
+
+ q0s16 = vmlal_s8(vreinterpretq_s16_u16(q0u16), vget_low_s8(q1s8), d5);
+ q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5);
+ d5 = vdup_n_s8(27);
+ q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8), d4);
+ q13s16 = vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4);
+ q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8), d5);
+ q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), vget_high_s8(q1s8), d5);
+
+ d0 = vqshrn_n_s16(q0s16 , 7);
+ d1 = vqshrn_n_s16(q11s16, 7);
+ d24 = vqshrn_n_s16(q12s16, 7);
+ d25 = vqshrn_n_s16(q13s16, 7);
+ d28 = vqshrn_n_s16(q14s16, 7);
+ d29 = vqshrn_n_s16(q15s16, 7);
+
+ q0s8 = vcombine_s8(d0, d1);
+ q12s8 = vcombine_s8(d24, d25);
+ q14s8 = vcombine_s8(d28, d29);
+
+ q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8);
+ q0s8 = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8);
+ q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8);
+ q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8);
+ q15s8 = vqsubq_s8((q7s8), q14s8);
+ q14s8 = vqaddq_s8((q6s8), q14s8);
+
+ q1u8 = vdupq_n_u8(0x80);
+ *q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8);
+ *q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8);
+ *q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8);
+ *q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8);
+ *q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8);
+ *q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8);
+ return;
+}
+
+void vp8_mbloop_filter_horizontal_edge_y_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh) {
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ src -= (pitch << 2);
+
+ q3 = vld1q_u8(src);
+ src += pitch;
+ q4 = vld1q_u8(src);
+ src += pitch;
+ q5 = vld1q_u8(src);
+ src += pitch;
+ q6 = vld1q_u8(src);
+ src += pitch;
+ q7 = vld1q_u8(src);
+ src += pitch;
+ q8 = vld1q_u8(src);
+ src += pitch;
+ q9 = vld1q_u8(src);
+ src += pitch;
+ q10 = vld1q_u8(src);
+
+ vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q4, &q5, &q6, &q7, &q8, &q9);
+
+ src -= (pitch * 6);
+ vst1q_u8(src, q4);
+ src += pitch;
+ vst1q_u8(src, q5);
+ src += pitch;
+ vst1q_u8(src, q6);
+ src += pitch;
+ vst1q_u8(src, q7);
+ src += pitch;
+ vst1q_u8(src, q8);
+ src += pitch;
+ vst1q_u8(src, q9);
+ return;
+}
+
+void vp8_mbloop_filter_horizontal_edge_uv_neon(
+ unsigned char *u,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh,
+ unsigned char *v) {
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ u -= (pitch << 2);
+ v -= (pitch << 2);
+
+ d6 = vld1_u8(u);
+ u += pitch;
+ d7 = vld1_u8(v);
+ v += pitch;
+ d8 = vld1_u8(u);
+ u += pitch;
+ d9 = vld1_u8(v);
+ v += pitch;
+ d10 = vld1_u8(u);
+ u += pitch;
+ d11 = vld1_u8(v);
+ v += pitch;
+ d12 = vld1_u8(u);
+ u += pitch;
+ d13 = vld1_u8(v);
+ v += pitch;
+ d14 = vld1_u8(u);
+ u += pitch;
+ d15 = vld1_u8(v);
+ v += pitch;
+ d16 = vld1_u8(u);
+ u += pitch;
+ d17 = vld1_u8(v);
+ v += pitch;
+ d18 = vld1_u8(u);
+ u += pitch;
+ d19 = vld1_u8(v);
+ v += pitch;
+ d20 = vld1_u8(u);
+ d21 = vld1_u8(v);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q4, &q5, &q6, &q7, &q8, &q9);
+
+ u -= (pitch * 6);
+ v -= (pitch * 6);
+ vst1_u8(u, vget_low_u8(q4));
+ u += pitch;
+ vst1_u8(v, vget_high_u8(q4));
+ v += pitch;
+ vst1_u8(u, vget_low_u8(q5));
+ u += pitch;
+ vst1_u8(v, vget_high_u8(q5));
+ v += pitch;
+ vst1_u8(u, vget_low_u8(q6));
+ u += pitch;
+ vst1_u8(v, vget_high_u8(q6));
+ v += pitch;
+ vst1_u8(u, vget_low_u8(q7));
+ u += pitch;
+ vst1_u8(v, vget_high_u8(q7));
+ v += pitch;
+ vst1_u8(u, vget_low_u8(q8));
+ u += pitch;
+ vst1_u8(v, vget_high_u8(q8));
+ v += pitch;
+ vst1_u8(u, vget_low_u8(q9));
+ vst1_u8(v, vget_high_u8(q9));
+ return;
+}
+
+void vp8_mbloop_filter_vertical_edge_y_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh) {
+ unsigned char *s1, *s2;
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+ uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+ uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+ uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ s1 = src - 4;
+ s2 = s1 + 8 * pitch;
+ d6 = vld1_u8(s1);
+ s1 += pitch;
+ d7 = vld1_u8(s2);
+ s2 += pitch;
+ d8 = vld1_u8(s1);
+ s1 += pitch;
+ d9 = vld1_u8(s2);
+ s2 += pitch;
+ d10 = vld1_u8(s1);
+ s1 += pitch;
+ d11 = vld1_u8(s2);
+ s2 += pitch;
+ d12 = vld1_u8(s1);
+ s1 += pitch;
+ d13 = vld1_u8(s2);
+ s2 += pitch;
+ d14 = vld1_u8(s1);
+ s1 += pitch;
+ d15 = vld1_u8(s2);
+ s2 += pitch;
+ d16 = vld1_u8(s1);
+ s1 += pitch;
+ d17 = vld1_u8(s2);
+ s2 += pitch;
+ d18 = vld1_u8(s1);
+ s1 += pitch;
+ d19 = vld1_u8(s2);
+ s2 += pitch;
+ d20 = vld1_u8(s1);
+ d21 = vld1_u8(s2);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q4, &q5, &q6, &q7, &q8, &q9);
+
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ s1 -= 7 * pitch;
+ s2 -= 7 * pitch;
+
+ vst1_u8(s1, vget_low_u8(q3));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q3));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q4));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q4));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q5));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q5));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q6));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q6));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q7));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q7));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q8));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q8));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q9));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q9));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q10));
+ vst1_u8(s2, vget_high_u8(q10));
+ return;
+}
+
+void vp8_mbloop_filter_vertical_edge_uv_neon(
+ unsigned char *u,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh,
+ unsigned char *v) {
+ unsigned char *us, *ud;
+ unsigned char *vs, *vd;
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+ uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+ uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+ uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ us = u - 4;
+ vs = v - 4;
+ d6 = vld1_u8(us);
+ us += pitch;
+ d7 = vld1_u8(vs);
+ vs += pitch;
+ d8 = vld1_u8(us);
+ us += pitch;
+ d9 = vld1_u8(vs);
+ vs += pitch;
+ d10 = vld1_u8(us);
+ us += pitch;
+ d11 = vld1_u8(vs);
+ vs += pitch;
+ d12 = vld1_u8(us);
+ us += pitch;
+ d13 = vld1_u8(vs);
+ vs += pitch;
+ d14 = vld1_u8(us);
+ us += pitch;
+ d15 = vld1_u8(vs);
+ vs += pitch;
+ d16 = vld1_u8(us);
+ us += pitch;
+ d17 = vld1_u8(vs);
+ vs += pitch;
+ d18 = vld1_u8(us);
+ us += pitch;
+ d19 = vld1_u8(vs);
+ vs += pitch;
+ d20 = vld1_u8(us);
+ d21 = vld1_u8(vs);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q4, &q5, &q6, &q7, &q8, &q9);
+
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ ud = u - 4;
+ vst1_u8(ud, vget_low_u8(q3));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q4));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q5));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q6));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q7));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q8));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q9));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q10));
+
+ vd = v - 4;
+ vst1_u8(vd, vget_high_u8(q3));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q4));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q5));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q6));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q7));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q8));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q9));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q10));
+ return;
+}
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c
new file mode 100644
index 0000000000..373afa6ed3
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const int16_t cospi8sqrt2minus1 = 20091;
+static const int16_t sinpi8sqrt2 = 35468;
+
+void vp8_short_idct4x4llm_neon(
+ int16_t *input,
+ unsigned char *pred_ptr,
+ int pred_stride,
+ unsigned char *dst_ptr,
+ int dst_stride) {
+ int i;
+ uint32x2_t d6u32 = vdup_n_u32(0);
+ uint8x8_t d1u8;
+ int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
+ uint16x8_t q1u16;
+ int16x8_t q1s16, q2s16, q3s16, q4s16;
+ int32x2x2_t v2tmp0, v2tmp1;
+ int16x4x2_t v2tmp2, v2tmp3;
+
+ d2 = vld1_s16(input);
+ d3 = vld1_s16(input + 4);
+ d4 = vld1_s16(input + 8);
+ d5 = vld1_s16(input + 12);
+
+ // 1st for loop
+ q1s16 = vcombine_s16(d2, d4); // Swap d3 d4 here
+ q2s16 = vcombine_s16(d3, d5);
+
+ q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
+ q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
+
+ d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1
+ d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1
+
+ q3s16 = vshrq_n_s16(q3s16, 1);
+ q4s16 = vshrq_n_s16(q4s16, 1);
+
+ q3s16 = vqaddq_s16(q3s16, q2s16);
+ q4s16 = vqaddq_s16(q4s16, q2s16);
+
+ d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1
+ d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1
+
+ d2 = vqadd_s16(d12, d11);
+ d3 = vqadd_s16(d13, d10);
+ d4 = vqsub_s16(d13, d10);
+ d5 = vqsub_s16(d12, d11);
+
+ v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+ v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+ v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
+ vreinterpret_s16_s32(v2tmp1.val[0]));
+ v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
+ vreinterpret_s16_s32(v2tmp1.val[1]));
+
+ // 2nd for loop
+ q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]);
+ q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]);
+
+ q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
+ q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
+
+ d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1
+ d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1
+
+ q3s16 = vshrq_n_s16(q3s16, 1);
+ q4s16 = vshrq_n_s16(q4s16, 1);
+
+ q3s16 = vqaddq_s16(q3s16, q2s16);
+ q4s16 = vqaddq_s16(q4s16, q2s16);
+
+ d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1
+ d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1
+
+ d2 = vqadd_s16(d12, d11);
+ d3 = vqadd_s16(d13, d10);
+ d4 = vqsub_s16(d13, d10);
+ d5 = vqsub_s16(d12, d11);
+
+ d2 = vrshr_n_s16(d2, 3);
+ d3 = vrshr_n_s16(d3, 3);
+ d4 = vrshr_n_s16(d4, 3);
+ d5 = vrshr_n_s16(d5, 3);
+
+ v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+ v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+ v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
+ vreinterpret_s16_s32(v2tmp1.val[0]));
+ v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
+ vreinterpret_s16_s32(v2tmp1.val[1]));
+
+ q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]);
+ q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]);
+
+ // dc_only_idct_add
+ for (i = 0; i < 2; i++, q1s16 = q2s16) {
+ d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0);
+ pred_ptr += pred_stride;
+ d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1);
+ pred_ptr += pred_stride;
+
+ q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16),
+ vreinterpret_u8_u32(d6u32));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0);
+ dst_ptr += dst_stride;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1);
+ dst_ptr += dst_stride;
+ }
+ return;
+}
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/sixtappredict_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
new file mode 100644
index 0000000000..49d8d221fc
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
@@ -0,0 +1,1377 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vpx_ports/mem.h"
+
+static const int8_t vp8_sub_pel_filters[8][8] = {
+ {0, 0, 128, 0, 0, 0, 0, 0}, /* note that 1/8 pel positionyys are */
+ {0, -6, 123, 12, -1, 0, 0, 0}, /* just as per alpha -0.5 bicubic */
+ {2, -11, 108, 36, -8, 1, 0, 0}, /* New 1/4 pel 6 tap filter */
+ {0, -9, 93, 50, -6, 0, 0, 0},
+ {3, -16, 77, 77, -16, 3, 0, 0}, /* New 1/2 pel 6 tap filter */
+ {0, -6, 50, 93, -9, 0, 0, 0},
+ {1, -8, 36, 108, -11, 2, 0, 0}, /* New 1/4 pel 6 tap filter */
+ {0, -1, 12, 123, -6, 0, 0, 0},
+};
+
+void vp8_sixtap_predict8x4_neon(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ unsigned char *src;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+ uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
+ uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
+ int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+ uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
+ int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
+ uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;
+
+ if (xoffset == 0) { // secondpass_filter8x4_only
+ // load second_pass filter
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // load src data
+ src = src_ptr - src_pixels_per_line * 2;
+ d22u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d23u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d24u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d25u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d26u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d27u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d28u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d29u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d30u8 = vld1_u8(src);
+
+ q3u16 = vmull_u8(d22u8, d0u8);
+ q4u16 = vmull_u8(d23u8, d0u8);
+ q5u16 = vmull_u8(d24u8, d0u8);
+ q6u16 = vmull_u8(d25u8, d0u8);
+
+ q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
+ q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
+ q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
+
+ q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
+ q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
+
+ q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
+ q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
+
+ q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
+ q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
+ q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
+
+ q7u16 = vmull_u8(d25u8, d3u8);
+ q8u16 = vmull_u8(d26u8, d3u8);
+ q9u16 = vmull_u8(d27u8, d3u8);
+ q10u16 = vmull_u8(d28u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d6u8 = vqrshrun_n_s16(q7s16, 7);
+ d7u8 = vqrshrun_n_s16(q8s16, 7);
+ d8u8 = vqrshrun_n_s16(q9s16, 7);
+ d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+ vst1_u8(dst_ptr, d6u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d7u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d8u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d9u8);
+ return;
+ }
+
+ // load first_pass filter
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // First pass: output_height lines x output_width columns (9x4)
+ if (yoffset == 0) // firstpass_filter4x4_only
+ src = src_ptr - 2;
+ else
+ src = src_ptr - 2 - (src_pixels_per_line * 2);
+ q3u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q4u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q5u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q6u8 = vld1q_u8(src);
+
+ q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+ q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+
+ q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
+ q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
+ q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
+ q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+
+ q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
+ q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
+ q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
+ q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+
+ q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
+ q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
+ q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
+ q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+
+ q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
+ q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
+ q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
+ q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+
+ q3u16 = vmull_u8(d28u8, d3u8);
+ q4u16 = vmull_u8(d29u8, d3u8);
+ q5u16 = vmull_u8(d30u8, d3u8);
+ q6u16 = vmull_u8(d31u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d22u8 = vqrshrun_n_s16(q7s16, 7);
+ d23u8 = vqrshrun_n_s16(q8s16, 7);
+ d24u8 = vqrshrun_n_s16(q9s16, 7);
+ d25u8 = vqrshrun_n_s16(q10s16, 7);
+
+ if (yoffset == 0) { // firstpass_filter8x4_only
+ vst1_u8(dst_ptr, d22u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d23u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d24u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d25u8);
+ return;
+ }
+
+ // First Pass on rest 5-line data
+ src += src_pixels_per_line;
+ q3u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q4u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q5u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q6u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q7u8 = vld1q_u8(src);
+
+ q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+ q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+ q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+ q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
+
+ q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
+ q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
+ q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
+ q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
+ q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
+
+ q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
+ q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
+ q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
+ q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
+ q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
+
+ q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
+ q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
+ q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
+ q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
+ q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
+
+ q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
+ q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
+ q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
+ q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
+ q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
+
+ q3u16 = vmull_u8(d27u8, d3u8);
+ q4u16 = vmull_u8(d28u8, d3u8);
+ q5u16 = vmull_u8(d29u8, d3u8);
+ q6u16 = vmull_u8(d30u8, d3u8);
+ q7u16 = vmull_u8(d31u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+ q11s16 = vreinterpretq_s16_u16(q11u16);
+ q12s16 = vreinterpretq_s16_u16(q12u16);
+
+ q8s16 = vqaddq_s16(q8s16, q3s16);
+ q9s16 = vqaddq_s16(q9s16, q4s16);
+ q10s16 = vqaddq_s16(q10s16, q5s16);
+ q11s16 = vqaddq_s16(q11s16, q6s16);
+ q12s16 = vqaddq_s16(q12s16, q7s16);
+
+ d26u8 = vqrshrun_n_s16(q8s16, 7);
+ d27u8 = vqrshrun_n_s16(q9s16, 7);
+ d28u8 = vqrshrun_n_s16(q10s16, 7);
+ d29u8 = vqrshrun_n_s16(q11s16, 7);
+ d30u8 = vqrshrun_n_s16(q12s16, 7);
+
+ // Second pass: 8x4
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ q3u16 = vmull_u8(d22u8, d0u8);
+ q4u16 = vmull_u8(d23u8, d0u8);
+ q5u16 = vmull_u8(d24u8, d0u8);
+ q6u16 = vmull_u8(d25u8, d0u8);
+
+ q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
+ q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
+ q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
+
+ q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
+ q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
+
+ q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
+ q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
+
+ q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
+ q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
+ q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
+
+ q7u16 = vmull_u8(d25u8, d3u8);
+ q8u16 = vmull_u8(d26u8, d3u8);
+ q9u16 = vmull_u8(d27u8, d3u8);
+ q10u16 = vmull_u8(d28u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d6u8 = vqrshrun_n_s16(q7s16, 7);
+ d7u8 = vqrshrun_n_s16(q8s16, 7);
+ d8u8 = vqrshrun_n_s16(q9s16, 7);
+ d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+ vst1_u8(dst_ptr, d6u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d7u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d8u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d9u8);
+ return;
+}
+
+void vp8_sixtap_predict8x8_neon(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ unsigned char *src, *tmpp;
+ unsigned char tmp[64];
+ int i;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+ uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
+ uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+ int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+ uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
+ int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
+ uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;
+
+ if (xoffset == 0) { // secondpass_filter8x8_only
+ // load second_pass filter
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // load src data
+ src = src_ptr - src_pixels_per_line * 2;
+ d18u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d19u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d20u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d21u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d22u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d23u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d24u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d25u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d26u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d27u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d28u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d29u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d30u8 = vld1_u8(src);
+
+ for (i = 2; i > 0; i--) {
+ q3u16 = vmull_u8(d18u8, d0u8);
+ q4u16 = vmull_u8(d19u8, d0u8);
+ q5u16 = vmull_u8(d20u8, d0u8);
+ q6u16 = vmull_u8(d21u8, d0u8);
+
+ q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+ q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+ q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+ q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+ q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+ q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+ q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+ q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+ q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+ q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+ q7u16 = vmull_u8(d21u8, d3u8);
+ q8u16 = vmull_u8(d22u8, d3u8);
+ q9u16 = vmull_u8(d23u8, d3u8);
+ q10u16 = vmull_u8(d24u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d6u8 = vqrshrun_n_s16(q7s16, 7);
+ d7u8 = vqrshrun_n_s16(q8s16, 7);
+ d8u8 = vqrshrun_n_s16(q9s16, 7);
+ d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+ d18u8 = d22u8;
+ d19u8 = d23u8;
+ d20u8 = d24u8;
+ d21u8 = d25u8;
+ d22u8 = d26u8;
+ d23u8 = d27u8;
+ d24u8 = d28u8;
+ d25u8 = d29u8;
+ d26u8 = d30u8;
+
+ vst1_u8(dst_ptr, d6u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d7u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d8u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d9u8);
+ dst_ptr += dst_pitch;
+ }
+ return;
+ }
+
+ // load first_pass filter
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // First pass: output_height lines x output_width columns (9x4)
+ if (yoffset == 0) // firstpass_filter4x4_only
+ src = src_ptr - 2;
+ else
+ src = src_ptr - 2 - (src_pixels_per_line * 2);
+
+ tmpp = tmp;
+ for (i = 2; i > 0; i--) {
+ q3u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q4u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q5u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q6u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+
+ __builtin_prefetch(src);
+ __builtin_prefetch(src + src_pixels_per_line);
+ __builtin_prefetch(src + src_pixels_per_line * 2);
+
+ q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+ q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+
+ q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
+ q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
+ q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
+ q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+
+ q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
+ q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
+ q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
+ q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+
+ q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
+ q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
+ q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
+ q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+
+ q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
+ q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
+ q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
+ q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+
+ q3u16 = vmull_u8(d28u8, d3u8);
+ q4u16 = vmull_u8(d29u8, d3u8);
+ q5u16 = vmull_u8(d30u8, d3u8);
+ q6u16 = vmull_u8(d31u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d22u8 = vqrshrun_n_s16(q7s16, 7);
+ d23u8 = vqrshrun_n_s16(q8s16, 7);
+ d24u8 = vqrshrun_n_s16(q9s16, 7);
+ d25u8 = vqrshrun_n_s16(q10s16, 7);
+
+ if (yoffset == 0) { // firstpass_filter8x4_only
+ vst1_u8(dst_ptr, d22u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d23u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d24u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d25u8);
+ dst_ptr += dst_pitch;
+ } else {
+ vst1_u8(tmpp, d22u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d23u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d24u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d25u8);
+ tmpp += 8;
+ }
+ }
+ if (yoffset == 0)
+ return;
+
+ // First Pass on rest 5-line data
+ q3u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q4u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q5u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q6u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q7u8 = vld1q_u8(src);
+
+ q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+ q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+ q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+ q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
+
+ q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
+ q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
+ q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
+ q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
+ q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
+
+ q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
+ q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
+ q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
+ q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
+ q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
+
+ q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
+ q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
+ q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
+ q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
+ q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
+
+ q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
+ q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
+ q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
+ q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
+ q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
+
+ q3u16 = vmull_u8(d27u8, d3u8);
+ q4u16 = vmull_u8(d28u8, d3u8);
+ q5u16 = vmull_u8(d29u8, d3u8);
+ q6u16 = vmull_u8(d30u8, d3u8);
+ q7u16 = vmull_u8(d31u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+ q11s16 = vreinterpretq_s16_u16(q11u16);
+ q12s16 = vreinterpretq_s16_u16(q12u16);
+
+ q8s16 = vqaddq_s16(q8s16, q3s16);
+ q9s16 = vqaddq_s16(q9s16, q4s16);
+ q10s16 = vqaddq_s16(q10s16, q5s16);
+ q11s16 = vqaddq_s16(q11s16, q6s16);
+ q12s16 = vqaddq_s16(q12s16, q7s16);
+
+ d26u8 = vqrshrun_n_s16(q8s16, 7);
+ d27u8 = vqrshrun_n_s16(q9s16, 7);
+ d28u8 = vqrshrun_n_s16(q10s16, 7);
+ d29u8 = vqrshrun_n_s16(q11s16, 7);
+ d30u8 = vqrshrun_n_s16(q12s16, 7);
+
+ // Second pass: 8x8
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ tmpp = tmp;
+ q9u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q10u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q11u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q12u8 = vld1q_u8(tmpp);
+
+ d18u8 = vget_low_u8(q9u8);
+ d19u8 = vget_high_u8(q9u8);
+ d20u8 = vget_low_u8(q10u8);
+ d21u8 = vget_high_u8(q10u8);
+ d22u8 = vget_low_u8(q11u8);
+ d23u8 = vget_high_u8(q11u8);
+ d24u8 = vget_low_u8(q12u8);
+ d25u8 = vget_high_u8(q12u8);
+
+ for (i = 2; i > 0; i--) {
+ q3u16 = vmull_u8(d18u8, d0u8);
+ q4u16 = vmull_u8(d19u8, d0u8);
+ q5u16 = vmull_u8(d20u8, d0u8);
+ q6u16 = vmull_u8(d21u8, d0u8);
+
+ q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+ q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+ q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+ q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+ q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+ q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+ q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+ q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+ q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+ q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+ q7u16 = vmull_u8(d21u8, d3u8);
+ q8u16 = vmull_u8(d22u8, d3u8);
+ q9u16 = vmull_u8(d23u8, d3u8);
+ q10u16 = vmull_u8(d24u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d6u8 = vqrshrun_n_s16(q7s16, 7);
+ d7u8 = vqrshrun_n_s16(q8s16, 7);
+ d8u8 = vqrshrun_n_s16(q9s16, 7);
+ d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+ d18u8 = d22u8;
+ d19u8 = d23u8;
+ d20u8 = d24u8;
+ d21u8 = d25u8;
+ d22u8 = d26u8;
+ d23u8 = d27u8;
+ d24u8 = d28u8;
+ d25u8 = d29u8;
+ d26u8 = d30u8;
+
+ vst1_u8(dst_ptr, d6u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d7u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d8u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d9u8);
+ dst_ptr += dst_pitch;
+ }
+ return;
+}
+
+void vp8_sixtap_predict16x16_neon(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ unsigned char *src, *src_tmp, *dst, *tmpp;
+ unsigned char tmp[336];
+ int i, j;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+ uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
+ uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
+ uint8x8_t d28u8, d29u8, d30u8, d31u8;
+ int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+ uint8x16_t q3u8, q4u8;
+ uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
+ uint16x8_t q11u16, q12u16, q13u16, q15u16;
+ int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
+ int16x8_t q11s16, q12s16, q13s16, q15s16;
+
+ if (xoffset == 0) { // secondpass_filter8x8_only
+ // load second_pass filter
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // load src data
+ src_tmp = src_ptr - src_pixels_per_line * 2;
+ for (i = 0; i < 2; i++) {
+ src = src_tmp + i * 8;
+ dst = dst_ptr + i * 8;
+ d18u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d19u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d20u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d21u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d22u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ for (j = 0; j < 4; j++) {
+ d23u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d24u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d25u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d26u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+
+ q3u16 = vmull_u8(d18u8, d0u8);
+ q4u16 = vmull_u8(d19u8, d0u8);
+ q5u16 = vmull_u8(d20u8, d0u8);
+ q6u16 = vmull_u8(d21u8, d0u8);
+
+ q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+ q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+ q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+ q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+ q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+ q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+ q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+ q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+ q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+ q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+ q7u16 = vmull_u8(d21u8, d3u8);
+ q8u16 = vmull_u8(d22u8, d3u8);
+ q9u16 = vmull_u8(d23u8, d3u8);
+ q10u16 = vmull_u8(d24u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d6u8 = vqrshrun_n_s16(q7s16, 7);
+ d7u8 = vqrshrun_n_s16(q8s16, 7);
+ d8u8 = vqrshrun_n_s16(q9s16, 7);
+ d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+ d18u8 = d22u8;
+ d19u8 = d23u8;
+ d20u8 = d24u8;
+ d21u8 = d25u8;
+ d22u8 = d26u8;
+
+ vst1_u8(dst, d6u8);
+ dst += dst_pitch;
+ vst1_u8(dst, d7u8);
+ dst += dst_pitch;
+ vst1_u8(dst, d8u8);
+ dst += dst_pitch;
+ vst1_u8(dst, d9u8);
+ dst += dst_pitch;
+ }
+ }
+ return;
+ }
+
+ // load first_pass filter
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // First pass: output_height lines x output_width columns (9x4)
+ if (yoffset == 0) { // firstpass_filter4x4_only
+ src = src_ptr - 2;
+ dst = dst_ptr;
+ for (i = 0; i < 8; i++) {
+ d6u8 = vld1_u8(src);
+ d7u8 = vld1_u8(src + 8);
+ d8u8 = vld1_u8(src + 16);
+ src += src_pixels_per_line;
+ d9u8 = vld1_u8(src);
+ d10u8 = vld1_u8(src + 8);
+ d11u8 = vld1_u8(src + 16);
+ src += src_pixels_per_line;
+
+ __builtin_prefetch(src);
+ __builtin_prefetch(src + src_pixels_per_line);
+
+ q6u16 = vmull_u8(d6u8, d0u8);
+ q7u16 = vmull_u8(d7u8, d0u8);
+ q8u16 = vmull_u8(d9u8, d0u8);
+ q9u16 = vmull_u8(d10u8, d0u8);
+
+ d20u8 = vext_u8(d6u8, d7u8, 1);
+ d21u8 = vext_u8(d9u8, d10u8, 1);
+ d22u8 = vext_u8(d7u8, d8u8, 1);
+ d23u8 = vext_u8(d10u8, d11u8, 1);
+ d24u8 = vext_u8(d6u8, d7u8, 4);
+ d25u8 = vext_u8(d9u8, d10u8, 4);
+ d26u8 = vext_u8(d7u8, d8u8, 4);
+ d27u8 = vext_u8(d10u8, d11u8, 4);
+ d28u8 = vext_u8(d6u8, d7u8, 5);
+ d29u8 = vext_u8(d9u8, d10u8, 5);
+
+ q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
+ q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
+ q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
+ q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
+ q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
+ q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
+ q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
+ q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
+ q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
+
+ d20u8 = vext_u8(d7u8, d8u8, 5);
+ d21u8 = vext_u8(d10u8, d11u8, 5);
+ d22u8 = vext_u8(d6u8, d7u8, 2);
+ d23u8 = vext_u8(d9u8, d10u8, 2);
+ d24u8 = vext_u8(d7u8, d8u8, 2);
+ d25u8 = vext_u8(d10u8, d11u8, 2);
+ d26u8 = vext_u8(d6u8, d7u8, 3);
+ d27u8 = vext_u8(d9u8, d10u8, 3);
+ d28u8 = vext_u8(d7u8, d8u8, 3);
+ d29u8 = vext_u8(d10u8, d11u8, 3);
+
+ q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
+ q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
+ q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
+ q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
+ q9u16 = vmlal_u8(q9u16, d25u8, d2u8);
+
+ q10u16 = vmull_u8(d26u8, d3u8);
+ q11u16 = vmull_u8(d27u8, d3u8);
+ q12u16 = vmull_u8(d28u8, d3u8);
+ q15u16 = vmull_u8(d29u8, d3u8);
+
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+ q11s16 = vreinterpretq_s16_u16(q11u16);
+ q12s16 = vreinterpretq_s16_u16(q12u16);
+ q15s16 = vreinterpretq_s16_u16(q15u16);
+
+ q6s16 = vqaddq_s16(q6s16, q10s16);
+ q8s16 = vqaddq_s16(q8s16, q11s16);
+ q7s16 = vqaddq_s16(q7s16, q12s16);
+ q9s16 = vqaddq_s16(q9s16, q15s16);
+
+ d6u8 = vqrshrun_n_s16(q6s16, 7);
+ d7u8 = vqrshrun_n_s16(q7s16, 7);
+ d8u8 = vqrshrun_n_s16(q8s16, 7);
+ d9u8 = vqrshrun_n_s16(q9s16, 7);
+
+ q3u8 = vcombine_u8(d6u8, d7u8);
+ q4u8 = vcombine_u8(d8u8, d9u8);
+ vst1q_u8(dst, q3u8);
+ dst += dst_pitch;
+ vst1q_u8(dst, q4u8);
+ dst += dst_pitch;
+ }
+ return;
+ }
+
+ src = src_ptr - 2 - src_pixels_per_line * 2;
+ tmpp = tmp;
+ for (i = 0; i < 7; i++) {
+ d6u8 = vld1_u8(src);
+ d7u8 = vld1_u8(src + 8);
+ d8u8 = vld1_u8(src + 16);
+ src += src_pixels_per_line;
+ d9u8 = vld1_u8(src);
+ d10u8 = vld1_u8(src + 8);
+ d11u8 = vld1_u8(src + 16);
+ src += src_pixels_per_line;
+ d12u8 = vld1_u8(src);
+ d13u8 = vld1_u8(src + 8);
+ d14u8 = vld1_u8(src + 16);
+ src += src_pixels_per_line;
+
+ __builtin_prefetch(src);
+ __builtin_prefetch(src + src_pixels_per_line);
+ __builtin_prefetch(src + src_pixels_per_line * 2);
+
+ q8u16 = vmull_u8(d6u8, d0u8);
+ q9u16 = vmull_u8(d7u8, d0u8);
+ q10u16 = vmull_u8(d9u8, d0u8);
+ q11u16 = vmull_u8(d10u8, d0u8);
+ q12u16 = vmull_u8(d12u8, d0u8);
+ q13u16 = vmull_u8(d13u8, d0u8);
+
+ d28u8 = vext_u8(d6u8, d7u8, 1);
+ d29u8 = vext_u8(d9u8, d10u8, 1);
+ d30u8 = vext_u8(d12u8, d13u8, 1);
+ q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
+ q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
+ q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
+ d28u8 = vext_u8(d7u8, d8u8, 1);
+ d29u8 = vext_u8(d10u8, d11u8, 1);
+ d30u8 = vext_u8(d13u8, d14u8, 1);
+ q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
+ q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
+ q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);
+
+ d28u8 = vext_u8(d6u8, d7u8, 4);
+ d29u8 = vext_u8(d9u8, d10u8, 4);
+ d30u8 = vext_u8(d12u8, d13u8, 4);
+ q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
+ q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
+ q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
+ d28u8 = vext_u8(d7u8, d8u8, 4);
+ d29u8 = vext_u8(d10u8, d11u8, 4);
+ d30u8 = vext_u8(d13u8, d14u8, 4);
+ q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
+ q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
+ q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);
+
+ d28u8 = vext_u8(d6u8, d7u8, 5);
+ d29u8 = vext_u8(d9u8, d10u8, 5);
+ d30u8 = vext_u8(d12u8, d13u8, 5);
+ q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
+ q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
+ q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
+ d28u8 = vext_u8(d7u8, d8u8, 5);
+ d29u8 = vext_u8(d10u8, d11u8, 5);
+ d30u8 = vext_u8(d13u8, d14u8, 5);
+ q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
+ q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
+ q13u16 = vmlal_u8(q13u16, d30u8, d5u8);
+
+ d28u8 = vext_u8(d6u8, d7u8, 2);
+ d29u8 = vext_u8(d9u8, d10u8, 2);
+ d30u8 = vext_u8(d12u8, d13u8, 2);
+ q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
+ q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
+ q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
+ d28u8 = vext_u8(d7u8, d8u8, 2);
+ d29u8 = vext_u8(d10u8, d11u8, 2);
+ d30u8 = vext_u8(d13u8, d14u8, 2);
+ q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
+ q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
+ q13u16 = vmlal_u8(q13u16, d30u8, d2u8);
+
+ d28u8 = vext_u8(d6u8, d7u8, 3);
+ d29u8 = vext_u8(d9u8, d10u8, 3);
+ d30u8 = vext_u8(d12u8, d13u8, 3);
+ d15u8 = vext_u8(d7u8, d8u8, 3);
+ d31u8 = vext_u8(d10u8, d11u8, 3);
+ d6u8 = vext_u8(d13u8, d14u8, 3);
+ q4u16 = vmull_u8(d28u8, d3u8);
+ q5u16 = vmull_u8(d29u8, d3u8);
+ q6u16 = vmull_u8(d30u8, d3u8);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+ q12s16 = vreinterpretq_s16_u16(q12u16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q10s16 = vqaddq_s16(q10s16, q5s16);
+ q12s16 = vqaddq_s16(q12s16, q6s16);
+
+ q6u16 = vmull_u8(d15u8, d3u8);
+ q7u16 = vmull_u8(d31u8, d3u8);
+ q3u16 = vmull_u8(d6u8, d3u8);
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q11s16 = vreinterpretq_s16_u16(q11u16);
+ q13s16 = vreinterpretq_s16_u16(q13u16);
+ q9s16 = vqaddq_s16(q9s16, q6s16);
+ q11s16 = vqaddq_s16(q11s16, q7s16);
+ q13s16 = vqaddq_s16(q13s16, q3s16);
+
+ d6u8 = vqrshrun_n_s16(q8s16, 7);
+ d7u8 = vqrshrun_n_s16(q9s16, 7);
+ d8u8 = vqrshrun_n_s16(q10s16, 7);
+ d9u8 = vqrshrun_n_s16(q11s16, 7);
+ d10u8 = vqrshrun_n_s16(q12s16, 7);
+ d11u8 = vqrshrun_n_s16(q13s16, 7);
+
+ vst1_u8(tmpp, d6u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d7u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d8u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d9u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d10u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d11u8);
+ tmpp += 8;
+ }
+
+ // Second pass: 16x16
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ for (i = 0; i < 2; i++) {
+ dst = dst_ptr + 8 * i;
+ tmpp = tmp + 8 * i;
+ d18u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d19u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d20u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d21u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d22u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ for (j = 0; j < 4; j++) {
+ d23u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d24u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d25u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d26u8 = vld1_u8(tmpp);
+ tmpp += 16;
+
+ q3u16 = vmull_u8(d18u8, d0u8);
+ q4u16 = vmull_u8(d19u8, d0u8);
+ q5u16 = vmull_u8(d20u8, d0u8);
+ q6u16 = vmull_u8(d21u8, d0u8);
+
+ q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+ q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+ q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+ q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+ q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+ q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+ q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+ q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+ q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+ q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+ q7u16 = vmull_u8(d21u8, d3u8);
+ q8u16 = vmull_u8(d22u8, d3u8);
+ q9u16 = vmull_u8(d23u8, d3u8);
+ q10u16 = vmull_u8(d24u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d6u8 = vqrshrun_n_s16(q7s16, 7);
+ d7u8 = vqrshrun_n_s16(q8s16, 7);
+ d8u8 = vqrshrun_n_s16(q9s16, 7);
+ d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+ d18u8 = d22u8;
+ d19u8 = d23u8;
+ d20u8 = d24u8;
+ d21u8 = d25u8;
+ d22u8 = d26u8;
+
+ vst1_u8(dst, d6u8);
+ dst += dst_pitch;
+ vst1_u8(dst, d7u8);
+ dst += dst_pitch;
+ vst1_u8(dst, d8u8);
+ dst += dst_pitch;
+ vst1_u8(dst, d9u8);
+ dst += dst_pitch;
+ }
+ }
+ return;
+}
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c
new file mode 100644
index 0000000000..9d6807af71
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c
@@ -0,0 +1,550 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+#include "vpx_ports/arm.h"
+
+static INLINE void vp8_loop_filter_neon(
+ uint8x16_t qblimit, // flimit
+ uint8x16_t qlimit, // limit
+ uint8x16_t qthresh, // thresh
+ uint8x16_t q3, // p3
+ uint8x16_t q4, // p2
+ uint8x16_t q5, // p1
+ uint8x16_t q6, // p0
+ uint8x16_t q7, // q0
+ uint8x16_t q8, // q1
+ uint8x16_t q9, // q2
+ uint8x16_t q10, // q3
+ uint8x16_t *q5r, // p1
+ uint8x16_t *q6r, // p0
+ uint8x16_t *q7r, // q0
+ uint8x16_t *q8r) { // q1
+ uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ int16x8_t q2s16, q11s16;
+ uint16x8_t q4u16;
+ int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8;
+ int8x8_t d2s8, d3s8;
+
+ q11u8 = vabdq_u8(q3, q4);
+ q12u8 = vabdq_u8(q4, q5);
+ q13u8 = vabdq_u8(q5, q6);
+ q14u8 = vabdq_u8(q8, q7);
+ q3 = vabdq_u8(q9, q8);
+ q4 = vabdq_u8(q10, q9);
+
+ q11u8 = vmaxq_u8(q11u8, q12u8);
+ q12u8 = vmaxq_u8(q13u8, q14u8);
+ q3 = vmaxq_u8(q3, q4);
+ q15u8 = vmaxq_u8(q11u8, q12u8);
+
+ q9 = vabdq_u8(q6, q7);
+
+ // vp8_hevmask
+ q13u8 = vcgtq_u8(q13u8, qthresh);
+ q14u8 = vcgtq_u8(q14u8, qthresh);
+ q15u8 = vmaxq_u8(q15u8, q3);
+
+ q2u8 = vabdq_u8(q5, q8);
+ q9 = vqaddq_u8(q9, q9);
+
+ q15u8 = vcgeq_u8(qlimit, q15u8);
+
+ // vp8_filter() function
+ // convert to signed
+ q10 = vdupq_n_u8(0x80);
+ q8 = veorq_u8(q8, q10);
+ q7 = veorq_u8(q7, q10);
+ q6 = veorq_u8(q6, q10);
+ q5 = veorq_u8(q5, q10);
+
+ q2u8 = vshrq_n_u8(q2u8, 1);
+ q9 = vqaddq_u8(q9, q2u8);
+
+ q10 = vdupq_n_u8(3);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+ vget_low_s8(vreinterpretq_s8_u8(q6)));
+ q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+ vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+ q9 = vcgeq_u8(qblimit, q9);
+
+ q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+ vreinterpretq_s8_u8(q8));
+
+ q14u8 = vorrq_u8(q13u8, q14u8);
+
+ q4u16 = vmovl_u8(vget_low_u8(q10));
+ q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
+ q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
+
+ q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
+ q15u8 = vandq_u8(q15u8, q9);
+
+ q1s8 = vreinterpretq_s8_u8(q1u8);
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+ q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
+
+ q9 = vdupq_n_u8(4);
+ // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+ d2s8 = vqmovn_s16(q2s16);
+ d3s8 = vqmovn_s16(q11s16);
+ q1s8 = vcombine_s8(d2s8, d3s8);
+ q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
+ q1s8 = vreinterpretq_s8_u8(q1u8);
+
+ q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10));
+ q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
+ q2s8 = vshrq_n_s8(q2s8, 3);
+ q1s8 = vshrq_n_s8(q1s8, 3);
+
+ q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
+ q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
+
+ q1s8 = vrshrq_n_s8(q1s8, 1);
+ q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+ q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
+ q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
+
+ q0u8 = vdupq_n_u8(0x80);
+ *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8);
+ *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+ *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+ *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8);
+ return;
+}
+
+void vp8_loop_filter_horizontal_edge_y_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh) {
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+ src -= (pitch << 2);
+
+ q3 = vld1q_u8(src);
+ src += pitch;
+ q4 = vld1q_u8(src);
+ src += pitch;
+ q5 = vld1q_u8(src);
+ src += pitch;
+ q6 = vld1q_u8(src);
+ src += pitch;
+ q7 = vld1q_u8(src);
+ src += pitch;
+ q8 = vld1q_u8(src);
+ src += pitch;
+ q9 = vld1q_u8(src);
+ src += pitch;
+ q10 = vld1q_u8(src);
+
+ vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q5, &q6, &q7, &q8);
+
+ src -= (pitch * 5);
+ vst1q_u8(src, q5);
+ src += pitch;
+ vst1q_u8(src, q6);
+ src += pitch;
+ vst1q_u8(src, q7);
+ src += pitch;
+ vst1q_u8(src, q8);
+ return;
+}
+
+void vp8_loop_filter_horizontal_edge_uv_neon(
+ unsigned char *u,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh,
+ unsigned char *v) {
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ u -= (pitch << 2);
+ v -= (pitch << 2);
+
+ d6 = vld1_u8(u);
+ u += pitch;
+ d7 = vld1_u8(v);
+ v += pitch;
+ d8 = vld1_u8(u);
+ u += pitch;
+ d9 = vld1_u8(v);
+ v += pitch;
+ d10 = vld1_u8(u);
+ u += pitch;
+ d11 = vld1_u8(v);
+ v += pitch;
+ d12 = vld1_u8(u);
+ u += pitch;
+ d13 = vld1_u8(v);
+ v += pitch;
+ d14 = vld1_u8(u);
+ u += pitch;
+ d15 = vld1_u8(v);
+ v += pitch;
+ d16 = vld1_u8(u);
+ u += pitch;
+ d17 = vld1_u8(v);
+ v += pitch;
+ d18 = vld1_u8(u);
+ u += pitch;
+ d19 = vld1_u8(v);
+ v += pitch;
+ d20 = vld1_u8(u);
+ d21 = vld1_u8(v);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q5, &q6, &q7, &q8);
+
+ u -= (pitch * 5);
+ vst1_u8(u, vget_low_u8(q5));
+ u += pitch;
+ vst1_u8(u, vget_low_u8(q6));
+ u += pitch;
+ vst1_u8(u, vget_low_u8(q7));
+ u += pitch;
+ vst1_u8(u, vget_low_u8(q8));
+
+ v -= (pitch * 5);
+ vst1_u8(v, vget_high_u8(q5));
+ v += pitch;
+ vst1_u8(v, vget_high_u8(q6));
+ v += pitch;
+ vst1_u8(v, vget_high_u8(q7));
+ v += pitch;
+ vst1_u8(v, vget_high_u8(q8));
+ return;
+}
+
+static INLINE void write_4x8(unsigned char *dst, int pitch,
+ const uint8x8x4_t result) {
+#ifdef VPX_INCOMPATIBLE_GCC
+ /*
+ * uint8x8x4_t result
+ 00 01 02 03 | 04 05 06 07
+ 10 11 12 13 | 14 15 16 17
+ 20 21 22 23 | 24 25 26 27
+ 30 31 32 33 | 34 35 36 37
+ ---
+ * after vtrn_u16
+ 00 01 20 21 | 04 05 24 25
+ 02 03 22 23 | 06 07 26 27
+ 10 11 30 31 | 14 15 34 35
+ 12 13 32 33 | 16 17 36 37
+ ---
+ * after vtrn_u8
+ 00 10 20 30 | 04 14 24 34
+ 01 11 21 31 | 05 15 25 35
+ 02 12 22 32 | 06 16 26 36
+ 03 13 23 33 | 07 17 27 37
+ */
+ const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
+ vreinterpret_u16_u8(result.val[2]));
+ const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
+ vreinterpret_u16_u8(result.val[3]));
+ const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
+ vreinterpret_u8_u16(r13_u16.val[0]));
+ const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
+ vreinterpret_u8_u16(r13_u16.val[1]));
+ const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]);
+ const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]);
+ const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]);
+ const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]);
+ vst1_lane_u32((uint32_t *)dst, x_0_4, 0);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_1_5, 0);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_2_6, 0);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_3_7, 0);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_0_4, 1);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_1_5, 1);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
+#else
+ vst4_lane_u8(dst, result, 0);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 1);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 2);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 3);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 4);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 5);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 6);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 7);
+#endif // VPX_INCOMPATIBLE_GCC
+}
+
+void vp8_loop_filter_vertical_edge_y_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh) {
+ unsigned char *s, *d;
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+ uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+ uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+ uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+ uint8x8x4_t q4ResultH, q4ResultL;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ s = src - 4;
+ d6 = vld1_u8(s);
+ s += pitch;
+ d8 = vld1_u8(s);
+ s += pitch;
+ d10 = vld1_u8(s);
+ s += pitch;
+ d12 = vld1_u8(s);
+ s += pitch;
+ d14 = vld1_u8(s);
+ s += pitch;
+ d16 = vld1_u8(s);
+ s += pitch;
+ d18 = vld1_u8(s);
+ s += pitch;
+ d20 = vld1_u8(s);
+ s += pitch;
+ d7 = vld1_u8(s);
+ s += pitch;
+ d9 = vld1_u8(s);
+ s += pitch;
+ d11 = vld1_u8(s);
+ s += pitch;
+ d13 = vld1_u8(s);
+ s += pitch;
+ d15 = vld1_u8(s);
+ s += pitch;
+ d17 = vld1_u8(s);
+ s += pitch;
+ d19 = vld1_u8(s);
+ s += pitch;
+ d21 = vld1_u8(s);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q5, &q6, &q7, &q8);
+
+ q4ResultL.val[0] = vget_low_u8(q5); // d10
+ q4ResultL.val[1] = vget_low_u8(q6); // d12
+ q4ResultL.val[2] = vget_low_u8(q7); // d14
+ q4ResultL.val[3] = vget_low_u8(q8); // d16
+ q4ResultH.val[0] = vget_high_u8(q5); // d11
+ q4ResultH.val[1] = vget_high_u8(q6); // d13
+ q4ResultH.val[2] = vget_high_u8(q7); // d15
+ q4ResultH.val[3] = vget_high_u8(q8); // d17
+
+ d = src - 2;
+ write_4x8(d, pitch, q4ResultL);
+ d += pitch * 8;
+ write_4x8(d, pitch, q4ResultH);
+}
+
+void vp8_loop_filter_vertical_edge_uv_neon(
+ unsigned char *u,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh,
+ unsigned char *v) {
+ unsigned char *us, *ud;
+ unsigned char *vs, *vd;
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+ uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+ uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+ uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+ uint8x8x4_t q4ResultH, q4ResultL;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ us = u - 4;
+ d6 = vld1_u8(us);
+ us += pitch;
+ d8 = vld1_u8(us);
+ us += pitch;
+ d10 = vld1_u8(us);
+ us += pitch;
+ d12 = vld1_u8(us);
+ us += pitch;
+ d14 = vld1_u8(us);
+ us += pitch;
+ d16 = vld1_u8(us);
+ us += pitch;
+ d18 = vld1_u8(us);
+ us += pitch;
+ d20 = vld1_u8(us);
+
+ vs = v - 4;
+ d7 = vld1_u8(vs);
+ vs += pitch;
+ d9 = vld1_u8(vs);
+ vs += pitch;
+ d11 = vld1_u8(vs);
+ vs += pitch;
+ d13 = vld1_u8(vs);
+ vs += pitch;
+ d15 = vld1_u8(vs);
+ vs += pitch;
+ d17 = vld1_u8(vs);
+ vs += pitch;
+ d19 = vld1_u8(vs);
+ vs += pitch;
+ d21 = vld1_u8(vs);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q5, &q6, &q7, &q8);
+
+ q4ResultL.val[0] = vget_low_u8(q5); // d10
+ q4ResultL.val[1] = vget_low_u8(q6); // d12
+ q4ResultL.val[2] = vget_low_u8(q7); // d14
+ q4ResultL.val[3] = vget_low_u8(q8); // d16
+ ud = u - 2;
+ write_4x8(ud, pitch, q4ResultL);
+
+ q4ResultH.val[0] = vget_high_u8(q5); // d11
+ q4ResultH.val[1] = vget_high_u8(q6); // d13
+ q4ResultH.val[2] = vget_high_u8(q7); // d15
+ q4ResultH.val[3] = vget_high_u8(q8); // d17
+ vd = v - 2;
+ write_4x8(vd, pitch, q4ResultH);
+}
diff --git a/thirdparty/libvpx/vp8/common/blockd.c b/thirdparty/libvpx/vp8/common/blockd.c
new file mode 100644
index 0000000000..1fc3cd0ca7
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/blockd.c
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "blockd.h"
+#include "vpx_mem/vpx_mem.h"
+
+const unsigned char vp8_block2left[25] =
+{
+ 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+};
+const unsigned char vp8_block2above[25] =
+{
+ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8
+};
diff --git a/thirdparty/libvpx/vp8/common/blockd.h b/thirdparty/libvpx/vp8/common/blockd.h
new file mode 100644
index 0000000000..192108a06d
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/blockd.h
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_BLOCKD_H_
+#define VP8_COMMON_BLOCKD_H_
+
+void vpx_log(const char *format, ...);
+
+#include "vpx_config.h"
+#include "vpx_scale/yv12config.h"
+#include "mv.h"
+#include "treecoder.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*#define DCPRED 1*/
+#define DCPREDSIMTHRESH 0
+#define DCPREDCNTTHRESH 3
+
+#define MB_FEATURE_TREE_PROBS 3
+#define MAX_MB_SEGMENTS 4
+
+#define MAX_REF_LF_DELTAS 4
+#define MAX_MODE_LF_DELTAS 4
+
+/* Segment Feature Masks */
+#define SEGMENT_DELTADATA 0
+#define SEGMENT_ABSDATA 1
+
+typedef struct
+{
+ int r, c;
+} POS;
+
+#define PLANE_TYPE_Y_NO_DC 0
+#define PLANE_TYPE_Y2 1
+#define PLANE_TYPE_UV 2
+#define PLANE_TYPE_Y_WITH_DC 3
+
+
+typedef char ENTROPY_CONTEXT;
+typedef struct
+{
+ ENTROPY_CONTEXT y1[4];
+ ENTROPY_CONTEXT u[2];
+ ENTROPY_CONTEXT v[2];
+ ENTROPY_CONTEXT y2;
+} ENTROPY_CONTEXT_PLANES;
+
+extern const unsigned char vp8_block2left[25];
+extern const unsigned char vp8_block2above[25];
+
+#define VP8_COMBINEENTROPYCONTEXTS( Dest, A, B) \
+ Dest = (A)+(B);
+
+
+typedef enum
+{
+ KEY_FRAME = 0,
+ INTER_FRAME = 1
+} FRAME_TYPE;
+
+typedef enum
+{
+ DC_PRED, /* average of above and left pixels */
+ V_PRED, /* vertical prediction */
+ H_PRED, /* horizontal prediction */
+ TM_PRED, /* Truemotion prediction */
+ B_PRED, /* block based prediction, each block has its own prediction mode */
+
+ NEARESTMV,
+ NEARMV,
+ ZEROMV,
+ NEWMV,
+ SPLITMV,
+
+ MB_MODE_COUNT
+} MB_PREDICTION_MODE;
+
+/* Macroblock level features */
+typedef enum
+{
+ MB_LVL_ALT_Q = 0, /* Use alternate Quantizer .... */
+ MB_LVL_ALT_LF = 1, /* Use alternate loop filter value... */
+ MB_LVL_MAX = 2 /* Number of MB level features supported */
+
+} MB_LVL_FEATURES;
+
+/* Segment Feature Masks */
+#define SEGMENT_ALTQ 0x01
+#define SEGMENT_ALT_LF 0x02
+
+#define VP8_YMODES (B_PRED + 1)
+#define VP8_UV_MODES (TM_PRED + 1)
+
+#define VP8_MVREFS (1 + SPLITMV - NEARESTMV)
+
+typedef enum
+{
+ B_DC_PRED, /* average of above and left pixels */
+ B_TM_PRED,
+
+ B_VE_PRED, /* vertical prediction */
+ B_HE_PRED, /* horizontal prediction */
+
+ B_LD_PRED,
+ B_RD_PRED,
+
+ B_VR_PRED,
+ B_VL_PRED,
+ B_HD_PRED,
+ B_HU_PRED,
+
+ LEFT4X4,
+ ABOVE4X4,
+ ZERO4X4,
+ NEW4X4,
+
+ B_MODE_COUNT
+} B_PREDICTION_MODE;
+
+#define VP8_BINTRAMODES (B_HU_PRED + 1) /* 10 */
+#define VP8_SUBMVREFS (1 + NEW4X4 - LEFT4X4)
+
+/* For keyframes, intra block modes are predicted by the (already decoded)
+ modes for the Y blocks to the left and above us; for interframes, there
+ is a single probability table. */
+
+union b_mode_info
+{
+ B_PREDICTION_MODE as_mode;
+ int_mv mv;
+};
+
+typedef enum
+{
+ INTRA_FRAME = 0,
+ LAST_FRAME = 1,
+ GOLDEN_FRAME = 2,
+ ALTREF_FRAME = 3,
+ MAX_REF_FRAMES = 4
+} MV_REFERENCE_FRAME;
+
+typedef struct
+{
+ uint8_t mode, uv_mode;
+ uint8_t ref_frame;
+ uint8_t is_4x4;
+ int_mv mv;
+
+ uint8_t partitioning;
+ uint8_t mb_skip_coeff; /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */
+ uint8_t need_to_clamp_mvs;
+ uint8_t segment_id; /* Which set of segmentation parameters should be used for this MB */
+} MB_MODE_INFO;
+
+typedef struct modeinfo
+{
+ MB_MODE_INFO mbmi;
+ union b_mode_info bmi[16];
+} MODE_INFO;
+
+#if CONFIG_MULTI_RES_ENCODING
+/* The mb-level information needed to be stored for higher-resolution encoder */
+typedef struct
+{
+ MB_PREDICTION_MODE mode;
+ MV_REFERENCE_FRAME ref_frame;
+ int_mv mv;
+ int dissim; /* dissimilarity level of the macroblock */
+} LOWER_RES_MB_INFO;
+
+/* The frame-level information needed to be stored for higher-resolution
+ * encoder */
+typedef struct
+{
+ FRAME_TYPE frame_type;
+ int is_frame_dropped;
+ // The frame rate for the lowest resolution.
+ double low_res_framerate;
+ /* The frame number of each reference frames */
+ unsigned int low_res_ref_frames[MAX_REF_FRAMES];
+ // The video frame counter value for the key frame, for lowest resolution.
+ unsigned int key_frame_counter_value;
+ LOWER_RES_MB_INFO *mb_info;
+} LOWER_RES_FRAME_INFO;
+#endif
+
+typedef struct blockd
+{
+ short *qcoeff;
+ short *dqcoeff;
+ unsigned char *predictor;
+ short *dequant;
+
+ int offset;
+ char *eob;
+
+ union b_mode_info bmi;
+} BLOCKD;
+
+typedef void (*vp8_subpix_fn_t)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+typedef struct macroblockd
+{
+ DECLARE_ALIGNED(16, unsigned char, predictor[384]);
+ DECLARE_ALIGNED(16, short, qcoeff[400]);
+ DECLARE_ALIGNED(16, short, dqcoeff[400]);
+ DECLARE_ALIGNED(16, char, eobs[25]);
+
+ DECLARE_ALIGNED(16, short, dequant_y1[16]);
+ DECLARE_ALIGNED(16, short, dequant_y1_dc[16]);
+ DECLARE_ALIGNED(16, short, dequant_y2[16]);
+ DECLARE_ALIGNED(16, short, dequant_uv[16]);
+
+ /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
+ BLOCKD block[25];
+ int fullpixel_mask;
+
+ YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
+ YV12_BUFFER_CONFIG dst;
+
+ MODE_INFO *mode_info_context;
+ int mode_info_stride;
+
+ FRAME_TYPE frame_type;
+
+ int up_available;
+ int left_available;
+
+ unsigned char *recon_above[3];
+ unsigned char *recon_left[3];
+ int recon_left_stride[2];
+
+ /* Y,U,V,Y2 */
+ ENTROPY_CONTEXT_PLANES *above_context;
+ ENTROPY_CONTEXT_PLANES *left_context;
+
+ /* 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. */
+ unsigned char segmentation_enabled;
+
+ /* 0 (do not update) 1 (update) the macroblock segmentation map. */
+ unsigned char update_mb_segmentation_map;
+
+ /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
+ unsigned char update_mb_segmentation_data;
+
+ /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
+ unsigned char mb_segement_abs_delta;
+
+ /* Per frame flags that define which MB level features (such as quantizer or loop filter level) */
+ /* are enabled and when enabled the proabilities used to decode the per MB flags in MB_MODE_INFO */
+ vp8_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS]; /* Probability Tree used to code Segment number */
+
+ signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; /* Segment parameters */
+
+ /* mode_based Loop filter adjustment */
+ unsigned char mode_ref_lf_delta_enabled;
+ unsigned char mode_ref_lf_delta_update;
+
+ /* Delta values have the range +/- MAX_LOOP_FILTER */
+ signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */
+ signed char ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */
+ signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+ signed char mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+
+ /* Distance of MB away from frame edges */
+ int mb_to_left_edge;
+ int mb_to_right_edge;
+ int mb_to_top_edge;
+ int mb_to_bottom_edge;
+
+
+
+ vp8_subpix_fn_t subpixel_predict;
+ vp8_subpix_fn_t subpixel_predict8x4;
+ vp8_subpix_fn_t subpixel_predict8x8;
+ vp8_subpix_fn_t subpixel_predict16x16;
+
+ void *current_bc;
+
+ int corrupted;
+
+#if ARCH_X86 || ARCH_X86_64
+ /* This is an intermediate buffer currently used in sub-pixel motion search
+ * to keep a copy of the reference area. This buffer can be used for other
+ * purpose.
+ */
+ DECLARE_ALIGNED(32, unsigned char, y_buf[22*32]);
+#endif
+} MACROBLOCKD;
+
+
+extern void vp8_build_block_doffsets(MACROBLOCKD *x);
+extern void vp8_setup_block_dptrs(MACROBLOCKD *x);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_BLOCKD_H_
diff --git a/thirdparty/libvpx/vp8/common/coefupdateprobs.h b/thirdparty/libvpx/vp8/common/coefupdateprobs.h
new file mode 100644
index 0000000000..d96a19e747
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/coefupdateprobs.h
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_COMMON_COEFUPDATEPROBS_H_
+#define VP8_COMMON_COEFUPDATEPROBS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Update probabilities for the nodes in the token entropy tree.
+ Generated file included by entropy.c */
+
+const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] =
+{
+ {
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, },
+ {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, },
+ },
+ {
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, },
+ {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_COEFUPDATEPROBS_H_
diff --git a/thirdparty/libvpx/vp8/common/common.h b/thirdparty/libvpx/vp8/common/common.h
new file mode 100644
index 0000000000..e58a9cc23b
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/common.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_COMMON_H_
+#define VP8_COMMON_COMMON_H_
+
+#include <assert.h>
+
+/* Interface header for common constant data structures and lookup tables */
+
+#include "vpx_mem/vpx_mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Only need this for fixed-size arrays, for structs just assign. */
+
+#define vp8_copy( Dest, Src) { \
+ assert( sizeof( Dest) == sizeof( Src)); \
+ memcpy( Dest, Src, sizeof( Src)); \
+ }
+
+/* Use this for variably-sized arrays. */
+
+#define vp8_copy_array( Dest, Src, N) { \
+ assert( sizeof( *Dest) == sizeof( *Src)); \
+ memcpy( Dest, Src, N * sizeof( *Src)); \
+ }
+
+#define vp8_zero( Dest) memset( &Dest, 0, sizeof( Dest));
+
+#define vp8_zero_array( Dest, N) memset( Dest, 0, N * sizeof( *Dest));
+
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_COMMON_H_
diff --git a/thirdparty/libvpx/vp8/common/copy_c.c b/thirdparty/libvpx/vp8/common/copy_c.c
new file mode 100644
index 0000000000..e3392913f6
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/copy_c.c
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <string.h>
+
+#include "./vp8_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+/* Copy 2 macroblocks to a buffer */
+void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride,
+ unsigned char *dst_ptr, int dst_stride,
+ int height)
+{
+ int r;
+
+ for (r = 0; r < height; r++)
+ {
+ memcpy(dst_ptr, src_ptr, 32);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+
+ }
+}
diff --git a/thirdparty/libvpx/vp8/common/debugmodes.c b/thirdparty/libvpx/vp8/common/debugmodes.c
new file mode 100644
index 0000000000..159fddc6a7
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/debugmodes.c
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdio.h>
+#include "blockd.h"
+
+
+void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int frame)
+{
+
+ int mb_row;
+ int mb_col;
+ int mb_index = 0;
+ FILE *mvs = fopen("mvs.stt", "a");
+
+ /* print out the macroblock Y modes */
+ mb_index = 0;
+ fprintf(mvs, "Mb Modes for Frame %d\n", frame);
+
+ for (mb_row = 0; mb_row < rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cols; mb_col++)
+ {
+
+ fprintf(mvs, "%2d ", mi[mb_index].mbmi.mode);
+
+ mb_index++;
+ }
+
+ fprintf(mvs, "\n");
+ mb_index++;
+ }
+
+ fprintf(mvs, "\n");
+
+ mb_index = 0;
+ fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
+
+ for (mb_row = 0; mb_row < rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cols; mb_col++)
+ {
+
+ fprintf(mvs, "%2d ", mi[mb_index].mbmi.ref_frame);
+
+ mb_index++;
+ }
+
+ fprintf(mvs, "\n");
+ mb_index++;
+ }
+
+ fprintf(mvs, "\n");
+
+ /* print out the macroblock UV modes */
+ mb_index = 0;
+ fprintf(mvs, "UV Modes for Frame %d\n", frame);
+
+ for (mb_row = 0; mb_row < rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cols; mb_col++)
+ {
+
+ fprintf(mvs, "%2d ", mi[mb_index].mbmi.uv_mode);
+
+ mb_index++;
+ }
+
+ mb_index++;
+ fprintf(mvs, "\n");
+ }
+
+ fprintf(mvs, "\n");
+
+ /* print out the block modes */
+ fprintf(mvs, "Mbs for Frame %d\n", frame);
+ {
+ int b_row;
+
+ for (b_row = 0; b_row < 4 * rows; b_row++)
+ {
+ int b_col;
+ int bindex;
+
+ for (b_col = 0; b_col < 4 * cols; b_col++)
+ {
+ mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
+ bindex = (b_row & 3) * 4 + (b_col & 3);
+
+ if (mi[mb_index].mbmi.mode == B_PRED)
+ fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode);
+ else
+ fprintf(mvs, "xx ");
+
+ }
+
+ fprintf(mvs, "\n");
+ }
+ }
+ fprintf(mvs, "\n");
+
+ /* print out the macroblock mvs */
+ mb_index = 0;
+ fprintf(mvs, "MVs for Frame %d\n", frame);
+
+ for (mb_row = 0; mb_row < rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cols; mb_col++)
+ {
+ fprintf(mvs, "%5d:%-5d", mi[mb_index].mbmi.mv.as_mv.row / 2, mi[mb_index].mbmi.mv.as_mv.col / 2);
+
+ mb_index++;
+ }
+
+ mb_index++;
+ fprintf(mvs, "\n");
+ }
+
+ fprintf(mvs, "\n");
+
+
+ /* print out the block modes */
+ fprintf(mvs, "MVs for Frame %d\n", frame);
+ {
+ int b_row;
+
+ for (b_row = 0; b_row < 4 * rows; b_row++)
+ {
+ int b_col;
+ int bindex;
+
+ for (b_col = 0; b_col < 4 * cols; b_col++)
+ {
+ mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
+ bindex = (b_row & 3) * 4 + (b_col & 3);
+ fprintf(mvs, "%3d:%-3d ", mi[mb_index].bmi[bindex].mv.as_mv.row, mi[mb_index].bmi[bindex].mv.as_mv.col);
+
+ }
+
+ fprintf(mvs, "\n");
+ }
+ }
+ fprintf(mvs, "\n");
+
+
+ fclose(mvs);
+}
diff --git a/thirdparty/libvpx/vp8/common/default_coef_probs.h b/thirdparty/libvpx/vp8/common/default_coef_probs.h
new file mode 100644
index 0000000000..4d69e4be66
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/default_coef_probs.h
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+*/
+
+#ifndef VP8_COMMON_DEFAULT_COEF_PROBS_H_
+#define VP8_COMMON_DEFAULT_COEF_PROBS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*Generated file, included by entropy.c*/
+
+
+static const vp8_prob default_coef_probs [BLOCK_TYPES]
+ [COEF_BANDS]
+ [PREV_COEF_CONTEXTS]
+ [ENTROPY_NODES] =
+{
+ { /* Block Type ( 0 ) */
+ { /* Coeff Band ( 0 )*/
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 1 )*/
+ { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
+ { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
+ { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 2 )*/
+ { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
+ { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
+ { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 3 )*/
+ { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
+ { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
+ { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 4 )*/
+ { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
+ { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
+ { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 5 )*/
+ { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
+ { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
+ { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 6 )*/
+ { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
+ { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
+ { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 7 )*/
+ { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+ }
+ },
+ { /* Block Type ( 1 ) */
+ { /* Coeff Band ( 0 )*/
+ { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 },
+ { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 },
+ { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 }
+ },
+ { /* Coeff Band ( 1 )*/
+ { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
+ { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
+ { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 }
+ },
+ { /* Coeff Band ( 2 )*/
+ { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
+ { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
+ { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 }
+ },
+ { /* Coeff Band ( 3 )*/
+ { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
+ { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
+ { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 4 )*/
+ { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
+ { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
+ { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 5 )*/
+ { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
+ { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
+ { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 6 )*/
+ { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
+ { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
+ { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 7 )*/
+ { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
+ { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }
+ }
+ },
+ { /* Block Type ( 2 ) */
+ { /* Coeff Band ( 0 )*/
+ { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
+ { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
+ { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }
+ },
+ { /* Coeff Band ( 1 )*/
+ { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
+ { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
+ { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 2 )*/
+ { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
+ { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
+ { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 3 )*/
+ { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
+ { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 4 )*/
+ { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
+ { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 5 )*/
+ { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 6 )*/
+ { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 7 )*/
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+ }
+ },
+ { /* Block Type ( 3 ) */
+ { /* Coeff Band ( 0 )*/
+ { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
+ { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
+ { 61, 46, 138, 219, 151, 178, 240, 170, 255, 216, 128 }
+ },
+ { /* Coeff Band ( 1 )*/
+ { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
+ { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
+ { 39, 77, 162, 232, 172, 180, 245, 178, 255, 255, 128 }
+ },
+ { /* Coeff Band ( 2 )*/
+ { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
+ { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
+ { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }
+ },
+ { /* Coeff Band ( 3 )*/
+ { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
+ { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
+ { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 }
+ },
+ { /* Coeff Band ( 4 )*/
+ { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
+ { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
+ { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 5 )*/
+ { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
+ { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
+ { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 6 )*/
+ { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
+ { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
+ { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 7 )*/
+ { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+ }
+ }
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_DEFAULT_COEF_PROBS_H_
diff --git a/thirdparty/libvpx/vp8/common/dequantize.c b/thirdparty/libvpx/vp8/common/dequantize.c
new file mode 100644
index 0000000000..f8b04fa4ee
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/dequantize.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp8_dequantize_b_c(BLOCKD *d, short *DQC)
+{
+ int i;
+ short *DQ = d->dqcoeff;
+ short *Q = d->qcoeff;
+
+ for (i = 0; i < 16; i++)
+ {
+ DQ[i] = Q[i] * DQC[i];
+ }
+}
+
+void vp8_dequant_idct_add_c(short *input, short *dq,
+ unsigned char *dest, int stride)
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+ {
+ input[i] = dq[i] * input[i];
+ }
+
+ vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);
+
+ memset(input, 0, 32);
+
+}
diff --git a/thirdparty/libvpx/vp8/common/entropy.c b/thirdparty/libvpx/vp8/common/entropy.c
new file mode 100644
index 0000000000..c00e565f06
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/entropy.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "entropy.h"
+#include "blockd.h"
+#include "onyxc_int.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include "coefupdateprobs.h"
+
+DECLARE_ALIGNED(16, const unsigned char, vp8_norm[256]) =
+{
+ 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]) =
+{ 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7};
+
+DECLARE_ALIGNED(16, const unsigned char,
+ vp8_prev_token_class[MAX_ENTROPY_TOKENS]) =
+{ 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0};
+
+DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) =
+{
+ 0, 1, 4, 8,
+ 5, 2, 3, 6,
+ 9, 12, 13, 10,
+ 7, 11, 14, 15,
+};
+
+DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]) =
+{
+ 1, 2, 6, 7,
+ 3, 5, 8, 13,
+ 4, 9, 12, 14,
+ 10, 11, 15, 16
+};
+
+/* vp8_default_zig_zag_mask generated with:
+
+ void vp8_init_scan_order_mask()
+ {
+ int i;
+
+ for (i = 0; i < 16; i++)
+ {
+ vp8_default_zig_zag_mask[vp8_default_zig_zag1d[i]] = 1 << i;
+ }
+
+ }
+*/
+DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]) =
+{
+ 1, 2, 32, 64,
+ 4, 16, 128, 4096,
+ 8, 256, 2048, 8192,
+ 512, 1024, 16384, -32768
+};
+
+const int vp8_mb_feature_data_bits[MB_LVL_MAX] = {7, 6};
+
+/* Array indices are identical to previously-existing CONTEXT_NODE indices */
+
+const vp8_tree_index vp8_coef_tree[ 22] = /* corresponding _CONTEXT_NODEs */
+{
+ -DCT_EOB_TOKEN, 2, /* 0 = EOB */
+ -ZERO_TOKEN, 4, /* 1 = ZERO */
+ -ONE_TOKEN, 6, /* 2 = ONE */
+ 8, 12, /* 3 = LOW_VAL */
+ -TWO_TOKEN, 10, /* 4 = TWO */
+ -THREE_TOKEN, -FOUR_TOKEN, /* 5 = THREE */
+ 14, 16, /* 6 = HIGH_LOW */
+ -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 7 = CAT_ONE */
+ 18, 20, /* 8 = CAT_THREEFOUR */
+ -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 9 = CAT_THREE */
+ -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */
+};
+
+/* vp8_coef_encodings generated with:
+ vp8_tokens_from_tree(vp8_coef_encodings, vp8_coef_tree);
+*/
+vp8_token vp8_coef_encodings[MAX_ENTROPY_TOKENS] =
+{
+ {2, 2},
+ {6, 3},
+ {28, 5},
+ {58, 6},
+ {59, 6},
+ {60, 6},
+ {61, 6},
+ {124, 7},
+ {125, 7},
+ {126, 7},
+ {127, 7},
+ {0, 1}
+};
+
+/* Trees for extra bits. Probabilities are constant and
+ do not depend on previously encoded bits */
+
+static const vp8_prob Pcat1[] = { 159};
+static const vp8_prob Pcat2[] = { 165, 145};
+static const vp8_prob Pcat3[] = { 173, 148, 140};
+static const vp8_prob Pcat4[] = { 176, 155, 140, 135};
+static const vp8_prob Pcat5[] = { 180, 157, 141, 134, 130};
+static const vp8_prob Pcat6[] =
+{ 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129};
+
+
+/* tree index tables generated with:
+
+ void init_bit_tree(vp8_tree_index *p, int n)
+ {
+ int i = 0;
+
+ while (++i < n)
+ {
+ p[0] = p[1] = i << 1;
+ p += 2;
+ }
+
+ p[0] = p[1] = 0;
+ }
+
+ void init_bit_trees()
+ {
+ init_bit_tree(cat1, 1);
+ init_bit_tree(cat2, 2);
+ init_bit_tree(cat3, 3);
+ init_bit_tree(cat4, 4);
+ init_bit_tree(cat5, 5);
+ init_bit_tree(cat6, 11);
+ }
+*/
+
+static const vp8_tree_index cat1[2] = { 0, 0 };
+static const vp8_tree_index cat2[4] = { 2, 2, 0, 0 };
+static const vp8_tree_index cat3[6] = { 2, 2, 4, 4, 0, 0 };
+static const vp8_tree_index cat4[8] = { 2, 2, 4, 4, 6, 6, 0, 0 };
+static const vp8_tree_index cat5[10] = { 2, 2, 4, 4, 6, 6, 8, 8, 0, 0 };
+static const vp8_tree_index cat6[22] = { 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12,
+ 14, 14, 16, 16, 18, 18, 20, 20, 0, 0 };
+
+const vp8_extra_bit_struct vp8_extra_bits[12] =
+{
+ { 0, 0, 0, 0},
+ { 0, 0, 0, 1},
+ { 0, 0, 0, 2},
+ { 0, 0, 0, 3},
+ { 0, 0, 0, 4},
+ { cat1, Pcat1, 1, 5},
+ { cat2, Pcat2, 2, 7},
+ { cat3, Pcat3, 3, 11},
+ { cat4, Pcat4, 4, 19},
+ { cat5, Pcat5, 5, 35},
+ { cat6, Pcat6, 11, 67},
+ { 0, 0, 0, 0}
+};
+
+#include "default_coef_probs.h"
+
+void vp8_default_coef_probs(VP8_COMMON *pc)
+{
+ memcpy(pc->fc.coef_probs, default_coef_probs, sizeof(default_coef_probs));
+}
+
diff --git a/thirdparty/libvpx/vp8/common/entropy.h b/thirdparty/libvpx/vp8/common/entropy.h
new file mode 100644
index 0000000000..a90bab4bac
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/entropy.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_ENTROPY_H_
+#define VP8_COMMON_ENTROPY_H_
+
+#include "treecoder.h"
+#include "blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Coefficient token alphabet */
+
+#define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */
+#define ONE_TOKEN 1 /* 1 Extra Bits 0+1 */
+#define TWO_TOKEN 2 /* 2 Extra Bits 0+1 */
+#define THREE_TOKEN 3 /* 3 Extra Bits 0+1 */
+#define FOUR_TOKEN 4 /* 4 Extra Bits 0+1 */
+#define DCT_VAL_CATEGORY1 5 /* 5-6 Extra Bits 1+1 */
+#define DCT_VAL_CATEGORY2 6 /* 7-10 Extra Bits 2+1 */
+#define DCT_VAL_CATEGORY3 7 /* 11-18 Extra Bits 3+1 */
+#define DCT_VAL_CATEGORY4 8 /* 19-34 Extra Bits 4+1 */
+#define DCT_VAL_CATEGORY5 9 /* 35-66 Extra Bits 5+1 */
+#define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 11+1 */
+#define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */
+
+#define MAX_ENTROPY_TOKENS 12
+#define ENTROPY_NODES 11
+
+extern const vp8_tree_index vp8_coef_tree[];
+
+extern const struct vp8_token_struct vp8_coef_encodings[MAX_ENTROPY_TOKENS];
+
+typedef struct
+{
+ vp8_tree_p tree;
+ const vp8_prob *prob;
+ int Len;
+ int base_val;
+} vp8_extra_bit_struct;
+
+extern const vp8_extra_bit_struct vp8_extra_bits[12]; /* indexed by token value */
+
+#define PROB_UPDATE_BASELINE_COST 7
+
+#define MAX_PROB 255
+#define DCT_MAX_VALUE 2048
+
+
+/* Coefficients are predicted via a 3-dimensional probability table. */
+
+/* Outside dimension. 0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */
+
+#define BLOCK_TYPES 4
+
+/* Middle dimension is a coarsening of the coefficient's
+ position within the 4x4 DCT. */
+
+#define COEF_BANDS 8
+extern DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]);
+
+/* Inside dimension is 3-valued measure of nearby complexity, that is,
+ the extent to which nearby coefficients are nonzero. For the first
+ coefficient (DC, unless block type is 0), we look at the (already encoded)
+ blocks above and to the left of the current block. The context index is
+ then the number (0,1,or 2) of these blocks having nonzero coefficients.
+ After decoding a coefficient, the measure is roughly the size of the
+ most recently decoded coefficient (0 for 0, 1 for 1, 2 for >1).
+ Note that the intuitive meaning of this measure changes as coefficients
+ are decoded, e.g., prior to the first token, a zero means that my neighbors
+ are empty while, after the first token, because of the use of end-of-block,
+ a zero means we just decoded a zero and hence guarantees that a non-zero
+ coefficient will appear later in this block. However, this shift
+ in meaning is perfectly OK because our context depends also on the
+ coefficient band (and since zigzag positions 0, 1, and 2 are in
+ distinct bands). */
+
+/*# define DC_TOKEN_CONTEXTS 3*/ /* 00, 0!0, !0!0 */
+# define PREV_COEF_CONTEXTS 3
+
+extern DECLARE_ALIGNED(16, const unsigned char, vp8_prev_token_class[MAX_ENTROPY_TOKENS]);
+
+extern const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+
+
+struct VP8Common;
+void vp8_default_coef_probs(struct VP8Common *);
+
+extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]);
+extern DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]);
+extern DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]);
+extern const int vp8_mb_feature_data_bits[MB_LVL_MAX];
+
+void vp8_coef_tree_initialize(void);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_ENTROPY_H_
diff --git a/thirdparty/libvpx/vp8/common/entropymode.c b/thirdparty/libvpx/vp8/common/entropymode.c
new file mode 100644
index 0000000000..8981a8d3c2
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/entropymode.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#define USE_PREBUILT_TABLES
+
+#include "entropymode.h"
+#include "entropy.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp8_entropymodedata.h"
+
+int vp8_mv_cont(const int_mv *l, const int_mv *a)
+{
+ int lez = (l->as_int == 0);
+ int aez = (a->as_int == 0);
+ int lea = (l->as_int == a->as_int);
+
+ if (lea && lez)
+ return SUBMVREF_LEFT_ABOVE_ZED;
+
+ if (lea)
+ return SUBMVREF_LEFT_ABOVE_SAME;
+
+ if (aez)
+ return SUBMVREF_ABOVE_ZED;
+
+ if (lez)
+ return SUBMVREF_LEFT_ZED;
+
+ return SUBMVREF_NORMAL;
+}
+
+static const vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1] = { 180, 162, 25};
+
+const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1] =
+{
+ { 147, 136, 18 },
+ { 106, 145, 1 },
+ { 179, 121, 1 },
+ { 223, 1 , 34 },
+ { 208, 1 , 1 }
+};
+
+
+
+const vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS] =
+{
+ {
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 1, 1, 1, 1,
+ 1, 1, 1, 1,
+ },
+ {
+ 0, 0, 1, 1,
+ 0, 0, 1, 1,
+ 0, 0, 1, 1,
+ 0, 0, 1, 1,
+ },
+ {
+ 0, 0, 1, 1,
+ 0, 0, 1, 1,
+ 2, 2, 3, 3,
+ 2, 2, 3, 3,
+ },
+ {
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, 14, 15,
+ }
+};
+
+const int vp8_mbsplit_count [VP8_NUMMBSPLITS] = { 2, 2, 4, 16};
+
+const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1] = { 110, 111, 150};
+
+
+/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
+
+const vp8_tree_index vp8_bmode_tree[18] = /* INTRAMODECONTEXTNODE value */
+{
+ -B_DC_PRED, 2, /* 0 = DC_NODE */
+ -B_TM_PRED, 4, /* 1 = TM_NODE */
+ -B_VE_PRED, 6, /* 2 = VE_NODE */
+ 8, 12, /* 3 = COM_NODE */
+ -B_HE_PRED, 10, /* 4 = HE_NODE */
+ -B_RD_PRED, -B_VR_PRED, /* 5 = RD_NODE */
+ -B_LD_PRED, 14, /* 6 = LD_NODE */
+ -B_VL_PRED, 16, /* 7 = VL_NODE */
+ -B_HD_PRED, -B_HU_PRED /* 8 = HD_NODE */
+};
+
+/* Again, these trees use the same probability indices as their
+ explicitly-programmed predecessors. */
+
+const vp8_tree_index vp8_ymode_tree[8] =
+{
+ -DC_PRED, 2,
+ 4, 6,
+ -V_PRED, -H_PRED,
+ -TM_PRED, -B_PRED
+};
+
+const vp8_tree_index vp8_kf_ymode_tree[8] =
+{
+ -B_PRED, 2,
+ 4, 6,
+ -DC_PRED, -V_PRED,
+ -H_PRED, -TM_PRED
+};
+
+const vp8_tree_index vp8_uv_mode_tree[6] =
+{
+ -DC_PRED, 2,
+ -V_PRED, 4,
+ -H_PRED, -TM_PRED
+};
+
+const vp8_tree_index vp8_mbsplit_tree[6] =
+{
+ -3, 2,
+ -2, 4,
+ -0, -1
+};
+
+const vp8_tree_index vp8_mv_ref_tree[8] =
+{
+ -ZEROMV, 2,
+ -NEARESTMV, 4,
+ -NEARMV, 6,
+ -NEWMV, -SPLITMV
+};
+
+const vp8_tree_index vp8_sub_mv_ref_tree[6] =
+{
+ -LEFT4X4, 2,
+ -ABOVE4X4, 4,
+ -ZERO4X4, -NEW4X4
+};
+
+const vp8_tree_index vp8_small_mvtree [14] =
+{
+ 2, 8,
+ 4, 6,
+ -0, -1,
+ -2, -3,
+ 10, 12,
+ -4, -5,
+ -6, -7
+};
+
+void vp8_init_mbmode_probs(VP8_COMMON *x)
+{
+ memcpy(x->fc.ymode_prob, vp8_ymode_prob, sizeof(vp8_ymode_prob));
+ memcpy(x->fc.uv_mode_prob, vp8_uv_mode_prob, sizeof(vp8_uv_mode_prob));
+ memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob));
+}
+
+void vp8_default_bmode_probs(vp8_prob p [VP8_BINTRAMODES-1])
+{
+ memcpy(p, vp8_bmode_prob, sizeof(vp8_bmode_prob));
+}
+
diff --git a/thirdparty/libvpx/vp8/common/entropymode.h b/thirdparty/libvpx/vp8/common/entropymode.h
new file mode 100644
index 0000000000..81bdfc4b8b
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/entropymode.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_ENTROPYMODE_H_
+#define VP8_COMMON_ENTROPYMODE_H_
+
+#include "onyxc_int.h"
+#include "treecoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum
+{
+ SUBMVREF_NORMAL,
+ SUBMVREF_LEFT_ZED,
+ SUBMVREF_ABOVE_ZED,
+ SUBMVREF_LEFT_ABOVE_SAME,
+ SUBMVREF_LEFT_ABOVE_ZED
+} sumvfref_t;
+
+typedef int vp8_mbsplit[16];
+
+#define VP8_NUMMBSPLITS 4
+
+extern const vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS];
+
+extern const int vp8_mbsplit_count [VP8_NUMMBSPLITS]; /* # of subsets */
+
+extern const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1];
+
+extern int vp8_mv_cont(const int_mv *l, const int_mv *a);
+#define SUBMVREF_COUNT 5
+extern const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1];
+
+
+extern const unsigned int vp8_kf_default_bmode_counts [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES];
+
+
+extern const vp8_tree_index vp8_bmode_tree[];
+
+extern const vp8_tree_index vp8_ymode_tree[];
+extern const vp8_tree_index vp8_kf_ymode_tree[];
+extern const vp8_tree_index vp8_uv_mode_tree[];
+
+extern const vp8_tree_index vp8_mbsplit_tree[];
+extern const vp8_tree_index vp8_mv_ref_tree[];
+extern const vp8_tree_index vp8_sub_mv_ref_tree[];
+
+extern const struct vp8_token_struct vp8_bmode_encodings[VP8_BINTRAMODES];
+extern const struct vp8_token_struct vp8_ymode_encodings[VP8_YMODES];
+extern const struct vp8_token_struct vp8_kf_ymode_encodings[VP8_YMODES];
+extern const struct vp8_token_struct vp8_uv_mode_encodings[VP8_UV_MODES];
+extern const struct vp8_token_struct vp8_mbsplit_encodings[VP8_NUMMBSPLITS];
+
+/* Inter mode values do not start at zero */
+
+extern const struct vp8_token_struct vp8_mv_ref_encoding_array[VP8_MVREFS];
+extern const struct vp8_token_struct vp8_sub_mv_ref_encoding_array[VP8_SUBMVREFS];
+
+extern const vp8_tree_index vp8_small_mvtree[];
+
+extern const struct vp8_token_struct vp8_small_mvencodings[8];
+
+/* Key frame default mode probs */
+extern const vp8_prob vp8_kf_bmode_prob[VP8_BINTRAMODES][VP8_BINTRAMODES]
+[VP8_BINTRAMODES-1];
+extern const vp8_prob vp8_kf_uv_mode_prob[VP8_UV_MODES-1];
+extern const vp8_prob vp8_kf_ymode_prob[VP8_YMODES-1];
+
+void vp8_init_mbmode_probs(VP8_COMMON *x);
+void vp8_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES-1]);
+void vp8_kf_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1]);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_ENTROPYMODE_H_
diff --git a/thirdparty/libvpx/vp8/common/entropymv.c b/thirdparty/libvpx/vp8/common/entropymv.c
new file mode 100644
index 0000000000..e5df1f0955
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/entropymv.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "entropymv.h"
+
+const MV_CONTEXT vp8_mv_update_probs[2] =
+{
+ {{
+ 237,
+ 246,
+ 253, 253, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 250, 250, 252, 254, 254
+ }},
+ {{
+ 231,
+ 243,
+ 245, 253, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 251, 251, 254, 254, 254
+ }}
+};
+const MV_CONTEXT vp8_default_mv_context[2] =
+{
+ {{
+ /* row */
+ 162, /* is short */
+ 128, /* sign */
+ 225, 146, 172, 147, 214, 39, 156, /* short tree */
+ 128, 129, 132, 75, 145, 178, 206, 239, 254, 254 /* long bits */
+ }},
+
+
+
+ {{
+ /* same for column */
+ 164, /* is short */
+ 128,
+ 204, 170, 119, 235, 140, 230, 228,
+ 128, 130, 130, 74, 148, 180, 203, 236, 254, 254 /* long bits */
+
+ }}
+};
diff --git a/thirdparty/libvpx/vp8/common/entropymv.h b/thirdparty/libvpx/vp8/common/entropymv.h
new file mode 100644
index 0000000000..42840d58ad
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/entropymv.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_ENTROPYMV_H_
+#define VP8_COMMON_ENTROPYMV_H_
+
+#include "treecoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum
+{
+ mv_max = 1023, /* max absolute value of a MV component */
+ MVvals = (2 * mv_max) + 1, /* # possible values "" */
+ mvfp_max = 255, /* max absolute value of a full pixel MV component */
+ MVfpvals = (2 * mvfp_max) +1, /* # possible full pixel MV values */
+
+ mvlong_width = 10, /* Large MVs have 9 bit magnitudes */
+ mvnum_short = 8, /* magnitudes 0 through 7 */
+
+ /* probability offsets for coding each MV component */
+
+ mvpis_short = 0, /* short (<= 7) vs long (>= 8) */
+ MVPsign, /* sign for non-zero */
+ MVPshort, /* 8 short values = 7-position tree */
+
+ MVPbits = MVPshort + mvnum_short - 1, /* mvlong_width long value bits */
+ MVPcount = MVPbits + mvlong_width /* (with independent probabilities) */
+};
+
+typedef struct mv_context
+{
+ vp8_prob prob[MVPcount]; /* often come in row, col pairs */
+} MV_CONTEXT;
+
+extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2];
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_ENTROPYMV_H_
diff --git a/thirdparty/libvpx/vp8/common/extend.c b/thirdparty/libvpx/vp8/common/extend.c
new file mode 100644
index 0000000000..2d938ad782
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/extend.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "extend.h"
+#include "vpx_mem/vpx_mem.h"
+
+
+static void copy_and_extend_plane
+(
+ unsigned char *s, /* source */
+ int sp, /* source pitch */
+ unsigned char *d, /* destination */
+ int dp, /* destination pitch */
+ int h, /* height */
+ int w, /* width */
+ int et, /* extend top border */
+ int el, /* extend left border */
+ int eb, /* extend bottom border */
+ int er /* extend right border */
+)
+{
+ int i;
+ unsigned char *src_ptr1, *src_ptr2;
+ unsigned char *dest_ptr1, *dest_ptr2;
+ int linesize;
+
+ /* copy the left and right most columns out */
+ src_ptr1 = s;
+ src_ptr2 = s + w - 1;
+ dest_ptr1 = d - el;
+ dest_ptr2 = d + w;
+
+ for (i = 0; i < h; i++)
+ {
+ memset(dest_ptr1, src_ptr1[0], el);
+ memcpy(dest_ptr1 + el, src_ptr1, w);
+ memset(dest_ptr2, src_ptr2[0], er);
+ src_ptr1 += sp;
+ src_ptr2 += sp;
+ dest_ptr1 += dp;
+ dest_ptr2 += dp;
+ }
+
+ /* Now copy the top and bottom lines into each line of the respective
+ * borders
+ */
+ src_ptr1 = d - el;
+ src_ptr2 = d + dp * (h - 1) - el;
+ dest_ptr1 = d + dp * (-et) - el;
+ dest_ptr2 = d + dp * (h) - el;
+ linesize = el + er + w;
+
+ for (i = 0; i < et; i++)
+ {
+ memcpy(dest_ptr1, src_ptr1, linesize);
+ dest_ptr1 += dp;
+ }
+
+ for (i = 0; i < eb; i++)
+ {
+ memcpy(dest_ptr2, src_ptr2, linesize);
+ dest_ptr2 += dp;
+ }
+}
+
+
+void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst)
+{
+ int et = dst->border;
+ int el = dst->border;
+ int eb = dst->border + dst->y_height - src->y_height;
+ int er = dst->border + dst->y_width - src->y_width;
+
+ copy_and_extend_plane(src->y_buffer, src->y_stride,
+ dst->y_buffer, dst->y_stride,
+ src->y_height, src->y_width,
+ et, el, eb, er);
+
+ et = dst->border >> 1;
+ el = dst->border >> 1;
+ eb = (dst->border >> 1) + dst->uv_height - src->uv_height;
+ er = (dst->border >> 1) + dst->uv_width - src->uv_width;
+
+ copy_and_extend_plane(src->u_buffer, src->uv_stride,
+ dst->u_buffer, dst->uv_stride,
+ src->uv_height, src->uv_width,
+ et, el, eb, er);
+
+ copy_and_extend_plane(src->v_buffer, src->uv_stride,
+ dst->v_buffer, dst->uv_stride,
+ src->uv_height, src->uv_width,
+ et, el, eb, er);
+}
+
+
+void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst,
+ int srcy, int srcx,
+ int srch, int srcw)
+{
+ int et = dst->border;
+ int el = dst->border;
+ int eb = dst->border + dst->y_height - src->y_height;
+ int er = dst->border + dst->y_width - src->y_width;
+ int src_y_offset = srcy * src->y_stride + srcx;
+ int dst_y_offset = srcy * dst->y_stride + srcx;
+ int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
+ int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
+
+ /* If the side is not touching the bounder then don't extend. */
+ if (srcy)
+ et = 0;
+ if (srcx)
+ el = 0;
+ if (srcy + srch != src->y_height)
+ eb = 0;
+ if (srcx + srcw != src->y_width)
+ er = 0;
+
+ copy_and_extend_plane(src->y_buffer + src_y_offset,
+ src->y_stride,
+ dst->y_buffer + dst_y_offset,
+ dst->y_stride,
+ srch, srcw,
+ et, el, eb, er);
+
+ et = (et + 1) >> 1;
+ el = (el + 1) >> 1;
+ eb = (eb + 1) >> 1;
+ er = (er + 1) >> 1;
+ srch = (srch + 1) >> 1;
+ srcw = (srcw + 1) >> 1;
+
+ copy_and_extend_plane(src->u_buffer + src_uv_offset,
+ src->uv_stride,
+ dst->u_buffer + dst_uv_offset,
+ dst->uv_stride,
+ srch, srcw,
+ et, el, eb, er);
+
+ copy_and_extend_plane(src->v_buffer + src_uv_offset,
+ src->uv_stride,
+ dst->v_buffer + dst_uv_offset,
+ dst->uv_stride,
+ srch, srcw,
+ et, el, eb, er);
+}
+
+
+/* note the extension is only for the last row, for intra prediction purpose */
+void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf,
+ unsigned char *YPtr,
+ unsigned char *UPtr,
+ unsigned char *VPtr)
+{
+ int i;
+
+ YPtr += ybf->y_stride * 14;
+ UPtr += ybf->uv_stride * 6;
+ VPtr += ybf->uv_stride * 6;
+
+ for (i = 0; i < 4; i++)
+ {
+ YPtr[i] = YPtr[-1];
+ UPtr[i] = UPtr[-1];
+ VPtr[i] = VPtr[-1];
+ }
+
+ YPtr += ybf->y_stride;
+ UPtr += ybf->uv_stride;
+ VPtr += ybf->uv_stride;
+
+ for (i = 0; i < 4; i++)
+ {
+ YPtr[i] = YPtr[-1];
+ UPtr[i] = UPtr[-1];
+ VPtr[i] = VPtr[-1];
+ }
+}
diff --git a/thirdparty/libvpx/vp8/common/extend.h b/thirdparty/libvpx/vp8/common/extend.h
new file mode 100644
index 0000000000..068f4ac523
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/extend.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_EXTEND_H_
+#define VP8_COMMON_EXTEND_H_
+
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr);
+void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst);
+void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst,
+ int srcy, int srcx,
+ int srch, int srcw);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_EXTEND_H_
diff --git a/thirdparty/libvpx/vp8/common/filter.c b/thirdparty/libvpx/vp8/common/filter.c
new file mode 100644
index 0000000000..84c608effa
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/filter.c
@@ -0,0 +1,493 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "filter.h"
+#include "./vp8_rtcd.h"
+
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) =
+{
+ { 128, 0 },
+ { 112, 16 },
+ { 96, 32 },
+ { 80, 48 },
+ { 64, 64 },
+ { 48, 80 },
+ { 32, 96 },
+ { 16, 112 }
+};
+
+DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) =
+{
+
+ { 0, 0, 128, 0, 0, 0 }, /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
+ { 0, -6, 123, 12, -1, 0 },
+ { 2, -11, 108, 36, -8, 1 }, /* New 1/4 pel 6 tap filter */
+ { 0, -9, 93, 50, -6, 0 },
+ { 3, -16, 77, 77, -16, 3 }, /* New 1/2 pel 6 tap filter */
+ { 0, -6, 50, 93, -9, 0 },
+ { 1, -8, 36, 108, -11, 2 }, /* New 1/4 pel 6 tap filter */
+ { 0, -1, 12, 123, -6, 0 },
+};
+
+static void filter_block2d_first_pass
+(
+ unsigned char *src_ptr,
+ int *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+)
+{
+ unsigned int i, j;
+ int Temp;
+
+ for (i = 0; i < output_height; i++)
+ {
+ for (j = 0; j < output_width; j++)
+ {
+ Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
+ ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
+ ((int)src_ptr[0] * vp8_filter[2]) +
+ ((int)src_ptr[pixel_step] * vp8_filter[3]) +
+ ((int)src_ptr[2*pixel_step] * vp8_filter[4]) +
+ ((int)src_ptr[3*pixel_step] * vp8_filter[5]) +
+ (VP8_FILTER_WEIGHT >> 1); /* Rounding */
+
+ /* Normalize back to 0-255 */
+ Temp = Temp >> VP8_FILTER_SHIFT;
+
+ if (Temp < 0)
+ Temp = 0;
+ else if (Temp > 255)
+ Temp = 255;
+
+ output_ptr[j] = Temp;
+ src_ptr++;
+ }
+
+ /* Next row... */
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+static void filter_block2d_second_pass
+(
+ int *src_ptr,
+ unsigned char *output_ptr,
+ int output_pitch,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+)
+{
+ unsigned int i, j;
+ int Temp;
+
+ for (i = 0; i < output_height; i++)
+ {
+ for (j = 0; j < output_width; j++)
+ {
+ /* Apply filter */
+ Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
+ ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
+ ((int)src_ptr[0] * vp8_filter[2]) +
+ ((int)src_ptr[pixel_step] * vp8_filter[3]) +
+ ((int)src_ptr[2*pixel_step] * vp8_filter[4]) +
+ ((int)src_ptr[3*pixel_step] * vp8_filter[5]) +
+ (VP8_FILTER_WEIGHT >> 1); /* Rounding */
+
+ /* Normalize back to 0-255 */
+ Temp = Temp >> VP8_FILTER_SHIFT;
+
+ if (Temp < 0)
+ Temp = 0;
+ else if (Temp > 255)
+ Temp = 255;
+
+ output_ptr[j] = (unsigned char)Temp;
+ src_ptr++;
+ }
+
+ /* Start next row */
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_pitch;
+ }
+}
+
+
+static void filter_block2d
+(
+ unsigned char *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int src_pixels_per_line,
+ int output_pitch,
+ const short *HFilter,
+ const short *VFilter
+)
+{
+ int FData[9*4]; /* Temp data buffer used in filtering */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);
+
+ /* then filter verticaly... */
+ filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
+}
+
+
+void vp8_sixtap_predict4x4_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+ filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
+}
+void vp8_sixtap_predict8x8_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+ int FData[13*16]; /* Temp data buffer used in filtering */
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);
+
+
+ /* then filter verticaly... */
+ filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
+
+}
+
+void vp8_sixtap_predict8x4_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+ int FData[13*16]; /* Temp data buffer used in filtering */
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);
+
+
+ /* then filter verticaly... */
+ filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
+
+}
+
+void vp8_sixtap_predict16x16_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+ int FData[21*24]; /* Temp data buffer used in filtering */
+
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);
+
+ /* then filter verticaly... */
+ filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
+
+}
+
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil_first_pass
+ *
+ * INPUTS : UINT8 *src_ptr : Pointer to source block.
+ * UINT32 src_stride : Stride of source block.
+ * UINT32 height : Block height.
+ * UINT32 width : Block width.
+ * INT32 *vp8_filter : Array of 2 bi-linear filter taps.
+ *
+ * OUTPUTS : INT32 *dst_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block
+ * in the horizontal direction to produce the filtered output
+ * block. Used to implement first-pass of 2-D separable filter.
+ *
+ * SPECIAL NOTES : Produces INT32 output to retain precision for next pass.
+ * Two filter taps should sum to VP8_FILTER_WEIGHT.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil_first_pass
+(
+ unsigned char *src_ptr,
+ unsigned short *dst_ptr,
+ unsigned int src_stride,
+ unsigned int height,
+ unsigned int width,
+ const short *vp8_filter
+)
+{
+ unsigned int i, j;
+
+ for (i = 0; i < height; i++)
+ {
+ for (j = 0; j < width; j++)
+ {
+ /* Apply bilinear filter */
+ dst_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
+ ((int)src_ptr[1] * vp8_filter[1]) +
+ (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
+ src_ptr++;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride - width;
+ dst_ptr += width;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil_second_pass
+ *
+ * INPUTS : INT32 *src_ptr : Pointer to source block.
+ * UINT32 dst_pitch : Destination block pitch.
+ * UINT32 height : Block height.
+ * UINT32 width : Block width.
+ * INT32 *vp8_filter : Array of 2 bi-linear filter taps.
+ *
+ * OUTPUTS : UINT16 *dst_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block
+ * in the vertical direction to produce the filtered output
+ * block. Used to implement second-pass of 2-D separable filter.
+ *
+ * SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.
+ * Two filter taps should sum to VP8_FILTER_WEIGHT.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil_second_pass
+(
+ unsigned short *src_ptr,
+ unsigned char *dst_ptr,
+ int dst_pitch,
+ unsigned int height,
+ unsigned int width,
+ const short *vp8_filter
+)
+{
+ unsigned int i, j;
+ int Temp;
+
+ for (i = 0; i < height; i++)
+ {
+ for (j = 0; j < width; j++)
+ {
+ /* Apply filter */
+ Temp = ((int)src_ptr[0] * vp8_filter[0]) +
+ ((int)src_ptr[width] * vp8_filter[1]) +
+ (VP8_FILTER_WEIGHT / 2);
+ dst_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
+ src_ptr++;
+ }
+
+ /* Next row... */
+ dst_ptr += dst_pitch;
+ }
+}
+
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil
+ *
+ * INPUTS : UINT8 *src_ptr : Pointer to source block.
+ * UINT32 src_pitch : Stride of source block.
+ * UINT32 dst_pitch : Stride of destination block.
+ * INT32 *HFilter : Array of 2 horizontal filter taps.
+ * INT32 *VFilter : Array of 2 vertical filter taps.
+ * INT32 Width : Block width
+ * INT32 Height : Block height
+ *
+ * OUTPUTS : UINT16 *dst_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 2-D filters an input block by applying a 2-tap
+ * bi-linear filter horizontally followed by a 2-tap
+ * bi-linear filter vertically on the result.
+ *
+ * SPECIAL NOTES : The largest block size can be handled here is 16x16
+ *
+ ****************************************************************************/
+static void filter_block2d_bil
+(
+ unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ unsigned int src_pitch,
+ unsigned int dst_pitch,
+ const short *HFilter,
+ const short *VFilter,
+ int Width,
+ int Height
+)
+{
+
+ unsigned short FData[17*16]; /* Temp data buffer used in filtering */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
+
+ /* then 1-D vertically... */
+ filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
+}
+
+
+void vp8_bilinear_predict4x4_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+#if 0
+ {
+ int i;
+ unsigned char temp1[16];
+ unsigned char temp2[16];
+
+ bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
+ filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
+
+ for (i = 0; i < 16; i++)
+ {
+ if (temp1[i] != temp2[i])
+ {
+ bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
+ filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
+ }
+ }
+ }
+#endif
+ filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
+
+}
+
+void vp8_bilinear_predict8x8_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
+
+}
+
+void vp8_bilinear_predict8x4_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
+
+}
+
+void vp8_bilinear_predict16x16_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
+}
diff --git a/thirdparty/libvpx/vp8/common/filter.h b/thirdparty/libvpx/vp8/common/filter.h
new file mode 100644
index 0000000000..cfba775fce
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/filter.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_FILTER_H_
+#define VP8_COMMON_FILTER_H_
+
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BLOCK_HEIGHT_WIDTH 4
+#define VP8_FILTER_WEIGHT 128
+#define VP8_FILTER_SHIFT 7
+
+extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]);
+extern DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_FILTER_H_
diff --git a/thirdparty/libvpx/vp8/common/findnearmv.c b/thirdparty/libvpx/vp8/common/findnearmv.c
new file mode 100644
index 0000000000..e8ee40f56c
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/findnearmv.c
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "findnearmv.h"
+
+const unsigned char vp8_mbsplit_offset[4][16] = {
+ { 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
+};
+
+/* Predict motion vectors using those from already-decoded nearby blocks.
+ Note that we only consider one 4x4 subblock from each candidate 16x16
+ macroblock. */
+void vp8_find_near_mvs
+(
+ MACROBLOCKD *xd,
+ const MODE_INFO *here,
+ int_mv *nearest,
+ int_mv *nearby,
+ int_mv *best_mv,
+ int cnt[4],
+ int refframe,
+ int *ref_frame_sign_bias
+)
+{
+ const MODE_INFO *above = here - xd->mode_info_stride;
+ const MODE_INFO *left = here - 1;
+ const MODE_INFO *aboveleft = above - 1;
+ int_mv near_mvs[4];
+ int_mv *mv = near_mvs;
+ int *cntx = cnt;
+ enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV};
+
+ /* Zero accumulators */
+ mv[0].as_int = mv[1].as_int = mv[2].as_int = 0;
+ cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0;
+
+ /* Process above */
+ if (above->mbmi.ref_frame != INTRA_FRAME)
+ {
+ if (above->mbmi.mv.as_int)
+ {
+ (++mv)->as_int = above->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe, mv, ref_frame_sign_bias);
+ ++cntx;
+ }
+
+ *cntx += 2;
+ }
+
+ /* Process left */
+ if (left->mbmi.ref_frame != INTRA_FRAME)
+ {
+ if (left->mbmi.mv.as_int)
+ {
+ int_mv this_mv;
+
+ this_mv.as_int = left->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias);
+
+ if (this_mv.as_int != mv->as_int)
+ {
+ (++mv)->as_int = this_mv.as_int;
+ ++cntx;
+ }
+
+ *cntx += 2;
+ }
+ else
+ cnt[CNT_INTRA] += 2;
+ }
+
+ /* Process above left */
+ if (aboveleft->mbmi.ref_frame != INTRA_FRAME)
+ {
+ if (aboveleft->mbmi.mv.as_int)
+ {
+ int_mv this_mv;
+
+ this_mv.as_int = aboveleft->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias);
+
+ if (this_mv.as_int != mv->as_int)
+ {
+ (++mv)->as_int = this_mv.as_int;
+ ++cntx;
+ }
+
+ *cntx += 1;
+ }
+ else
+ cnt[CNT_INTRA] += 1;
+ }
+
+ /* If we have three distinct MV's ... */
+ if (cnt[CNT_SPLITMV])
+ {
+ /* See if above-left MV can be merged with NEAREST */
+ if (mv->as_int == near_mvs[CNT_NEAREST].as_int)
+ cnt[CNT_NEAREST] += 1;
+ }
+
+ cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV)
+ + (left->mbmi.mode == SPLITMV)) * 2
+ + (aboveleft->mbmi.mode == SPLITMV);
+
+ /* Swap near and nearest if necessary */
+ if (cnt[CNT_NEAR] > cnt[CNT_NEAREST])
+ {
+ int tmp;
+ tmp = cnt[CNT_NEAREST];
+ cnt[CNT_NEAREST] = cnt[CNT_NEAR];
+ cnt[CNT_NEAR] = tmp;
+ tmp = near_mvs[CNT_NEAREST].as_int;
+ near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
+ near_mvs[CNT_NEAR].as_int = tmp;
+ }
+
+ /* Use near_mvs[0] to store the "best" MV */
+ if (cnt[CNT_NEAREST] >= cnt[CNT_INTRA])
+ near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST];
+
+ /* Set up return values */
+ best_mv->as_int = near_mvs[0].as_int;
+ nearest->as_int = near_mvs[CNT_NEAREST].as_int;
+ nearby->as_int = near_mvs[CNT_NEAR].as_int;
+}
+
+
+static void invert_and_clamp_mvs(int_mv *inv, int_mv *src, MACROBLOCKD *xd)
+{
+ inv->as_mv.row = src->as_mv.row * -1;
+ inv->as_mv.col = src->as_mv.col * -1;
+ vp8_clamp_mv2(inv, xd);
+ vp8_clamp_mv2(src, xd);
+}
+
+
+int vp8_find_near_mvs_bias
+(
+ MACROBLOCKD *xd,
+ const MODE_INFO *here,
+ int_mv mode_mv_sb[2][MB_MODE_COUNT],
+ int_mv best_mv_sb[2],
+ int cnt[4],
+ int refframe,
+ int *ref_frame_sign_bias
+)
+{
+ int sign_bias = ref_frame_sign_bias[refframe];
+
+ vp8_find_near_mvs(xd,
+ here,
+ &mode_mv_sb[sign_bias][NEARESTMV],
+ &mode_mv_sb[sign_bias][NEARMV],
+ &best_mv_sb[sign_bias],
+ cnt,
+ refframe,
+ ref_frame_sign_bias);
+
+ invert_and_clamp_mvs(&mode_mv_sb[!sign_bias][NEARESTMV],
+ &mode_mv_sb[sign_bias][NEARESTMV], xd);
+ invert_and_clamp_mvs(&mode_mv_sb[!sign_bias][NEARMV],
+ &mode_mv_sb[sign_bias][NEARMV], xd);
+ invert_and_clamp_mvs(&best_mv_sb[!sign_bias],
+ &best_mv_sb[sign_bias], xd);
+
+ return sign_bias;
+}
+
+
+vp8_prob *vp8_mv_ref_probs(
+ vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4]
+)
+{
+ p[0] = vp8_mode_contexts [near_mv_ref_ct[0]] [0];
+ p[1] = vp8_mode_contexts [near_mv_ref_ct[1]] [1];
+ p[2] = vp8_mode_contexts [near_mv_ref_ct[2]] [2];
+ p[3] = vp8_mode_contexts [near_mv_ref_ct[3]] [3];
+ /*p[3] = vp8_mode_contexts [near_mv_ref_ct[1] + near_mv_ref_ct[2] + near_mv_ref_ct[3]] [3];*/
+ return p;
+}
+
diff --git a/thirdparty/libvpx/vp8/common/findnearmv.h b/thirdparty/libvpx/vp8/common/findnearmv.h
new file mode 100644
index 0000000000..472a7b5d8d
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/findnearmv.h
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_FINDNEARMV_H_
+#define VP8_COMMON_FINDNEARMV_H_
+
+#include "./vpx_config.h"
+#include "mv.h"
+#include "blockd.h"
+#include "modecont.h"
+#include "treecoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+static INLINE void mv_bias(int refmb_ref_frame_sign_bias, int refframe,
+ int_mv *mvp, const int *ref_frame_sign_bias)
+{
+ if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe])
+ {
+ mvp->as_mv.row *= -1;
+ mvp->as_mv.col *= -1;
+ }
+}
+
+#define LEFT_TOP_MARGIN (16 << 3)
+#define RIGHT_BOTTOM_MARGIN (16 << 3)
+static INLINE void vp8_clamp_mv2(int_mv *mv, const MACROBLOCKD *xd)
+{
+ if (mv->as_mv.col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
+ mv->as_mv.col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
+ else if (mv->as_mv.col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
+ mv->as_mv.col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+
+ if (mv->as_mv.row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
+ mv->as_mv.row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
+ else if (mv->as_mv.row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
+ mv->as_mv.row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+}
+
+static INLINE void vp8_clamp_mv(int_mv *mv, int mb_to_left_edge,
+ int mb_to_right_edge, int mb_to_top_edge,
+ int mb_to_bottom_edge)
+{
+ mv->as_mv.col = (mv->as_mv.col < mb_to_left_edge) ?
+ mb_to_left_edge : mv->as_mv.col;
+ mv->as_mv.col = (mv->as_mv.col > mb_to_right_edge) ?
+ mb_to_right_edge : mv->as_mv.col;
+ mv->as_mv.row = (mv->as_mv.row < mb_to_top_edge) ?
+ mb_to_top_edge : mv->as_mv.row;
+ mv->as_mv.row = (mv->as_mv.row > mb_to_bottom_edge) ?
+ mb_to_bottom_edge : mv->as_mv.row;
+}
+static INLINE unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge,
+ int mb_to_right_edge,
+ int mb_to_top_edge,
+ int mb_to_bottom_edge)
+{
+ unsigned int need_to_clamp;
+ need_to_clamp = (mv->as_mv.col < mb_to_left_edge);
+ need_to_clamp |= (mv->as_mv.col > mb_to_right_edge);
+ need_to_clamp |= (mv->as_mv.row < mb_to_top_edge);
+ need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge);
+ return need_to_clamp;
+}
+
+void vp8_find_near_mvs
+(
+ MACROBLOCKD *xd,
+ const MODE_INFO *here,
+ int_mv *nearest, int_mv *nearby, int_mv *best,
+ int near_mv_ref_cts[4],
+ int refframe,
+ int *ref_frame_sign_bias
+);
+
+
+int vp8_find_near_mvs_bias
+(
+ MACROBLOCKD *xd,
+ const MODE_INFO *here,
+ int_mv mode_mv_sb[2][MB_MODE_COUNT],
+ int_mv best_mv_sb[2],
+ int cnt[4],
+ int refframe,
+ int *ref_frame_sign_bias
+);
+
+
+vp8_prob *vp8_mv_ref_probs(
+ vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4]
+);
+
+extern const unsigned char vp8_mbsplit_offset[4][16];
+
+
+static INLINE uint32_t left_block_mv(const MODE_INFO *cur_mb, int b)
+{
+ if (!(b & 3))
+ {
+ /* On L edge, get from MB to left of us */
+ --cur_mb;
+
+ if(cur_mb->mbmi.mode != SPLITMV)
+ return cur_mb->mbmi.mv.as_int;
+ b += 4;
+ }
+
+ return (cur_mb->bmi + b - 1)->mv.as_int;
+}
+
+static INLINE uint32_t above_block_mv(const MODE_INFO *cur_mb, int b,
+ int mi_stride)
+{
+ if (!(b >> 2))
+ {
+ /* On top edge, get from MB above us */
+ cur_mb -= mi_stride;
+
+ if(cur_mb->mbmi.mode != SPLITMV)
+ return cur_mb->mbmi.mv.as_int;
+ b += 16;
+ }
+
+ return (cur_mb->bmi + (b - 4))->mv.as_int;
+}
+static INLINE B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b)
+{
+ if (!(b & 3))
+ {
+ /* On L edge, get from MB to left of us */
+ --cur_mb;
+ switch (cur_mb->mbmi.mode)
+ {
+ case B_PRED:
+ return (cur_mb->bmi + b + 3)->as_mode;
+ case DC_PRED:
+ return B_DC_PRED;
+ case V_PRED:
+ return B_VE_PRED;
+ case H_PRED:
+ return B_HE_PRED;
+ case TM_PRED:
+ return B_TM_PRED;
+ default:
+ return B_DC_PRED;
+ }
+ }
+
+ return (cur_mb->bmi + b - 1)->as_mode;
+}
+
+static INLINE B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b,
+ int mi_stride)
+{
+ if (!(b >> 2))
+ {
+ /* On top edge, get from MB above us */
+ cur_mb -= mi_stride;
+
+ switch (cur_mb->mbmi.mode)
+ {
+ case B_PRED:
+ return (cur_mb->bmi + b + 12)->as_mode;
+ case DC_PRED:
+ return B_DC_PRED;
+ case V_PRED:
+ return B_VE_PRED;
+ case H_PRED:
+ return B_HE_PRED;
+ case TM_PRED:
+ return B_TM_PRED;
+ default:
+ return B_DC_PRED;
+ }
+ }
+
+ return (cur_mb->bmi + b - 4)->as_mode;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_FINDNEARMV_H_
diff --git a/thirdparty/libvpx/vp8/common/generic/systemdependent.c b/thirdparty/libvpx/vp8/common/generic/systemdependent.c
new file mode 100644
index 0000000000..6d5f302d7a
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/generic/systemdependent.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#elif ARCH_X86 || ARCH_X86_64
+#include "vpx_ports/x86.h"
+#endif
+#include "vp8/common/onyxc_int.h"
+#include "vp8/common/systemdependent.h"
+
+#if CONFIG_MULTITHREAD
+#if HAVE_UNISTD_H && !defined(__OS2__)
+#include <unistd.h>
+#elif defined(_WIN32)
+#include <windows.h>
+typedef void (WINAPI *PGNSI)(LPSYSTEM_INFO);
+#elif defined(__OS2__)
+#define INCL_DOS
+#define INCL_DOSSPINLOCK
+#include <os2.h>
+#endif
+#endif
+
+#if CONFIG_MULTITHREAD
+static int get_cpu_count()
+{
+ int core_count = 16;
+
+#if HAVE_UNISTD_H && !defined(__OS2__)
+#if defined(_SC_NPROCESSORS_ONLN)
+ core_count = sysconf(_SC_NPROCESSORS_ONLN);
+#elif defined(_SC_NPROC_ONLN)
+ core_count = sysconf(_SC_NPROC_ONLN);
+#endif
+#elif defined(_WIN32)
+ {
+#if _WIN32_WINNT >= 0x0501
+ SYSTEM_INFO sysinfo;
+ GetNativeSystemInfo(&sysinfo);
+#else
+ PGNSI pGNSI;
+ SYSTEM_INFO sysinfo;
+
+ /* Call GetNativeSystemInfo if supported or
+ * GetSystemInfo otherwise. */
+
+ pGNSI = (PGNSI) GetProcAddress(
+ GetModuleHandle(TEXT("kernel32.dll")), "GetNativeSystemInfo");
+ if (pGNSI != NULL)
+ pGNSI(&sysinfo);
+ else
+ GetSystemInfo(&sysinfo);
+#endif
+
+ core_count = sysinfo.dwNumberOfProcessors;
+ }
+#elif defined(__OS2__)
+ {
+ ULONG proc_id;
+ ULONG status;
+
+ core_count = 0;
+ for (proc_id = 1; ; proc_id++)
+ {
+ if (DosGetProcessorStatus(proc_id, &status))
+ break;
+
+ if (status == PROC_ONLINE)
+ core_count++;
+ }
+ }
+#else
+ /* other platforms */
+#endif
+
+ return core_count > 0 ? core_count : 1;
+}
+#endif
+
+void vp8_clear_system_state_c() {};
+
+void vp8_machine_specific_config(VP8_COMMON *ctx)
+{
+#if CONFIG_MULTITHREAD
+ ctx->processor_core_count = get_cpu_count();
+#else
+ (void)ctx;
+#endif /* CONFIG_MULTITHREAD */
+
+#if ARCH_ARM
+ ctx->cpu_caps = arm_cpu_caps();
+#elif ARCH_X86 || ARCH_X86_64
+ ctx->cpu_caps = x86_simd_caps();
+#endif
+}
diff --git a/thirdparty/libvpx/vp8/common/header.h b/thirdparty/libvpx/vp8/common/header.h
new file mode 100644
index 0000000000..e27bca16bd
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/header.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_HEADER_H_
+#define VP8_COMMON_HEADER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* 24 bits total */
+typedef struct
+{
+ unsigned int type: 1;
+ unsigned int version: 3;
+ unsigned int show_frame: 1;
+
+ /* Allow 2^20 bytes = 8 megabits for first partition */
+
+ unsigned int first_partition_length_in_bytes: 19;
+
+#ifdef PACKET_TESTING
+ unsigned int frame_number;
+ unsigned int update_gold: 1;
+ unsigned int uses_gold: 1;
+ unsigned int update_last: 1;
+ unsigned int uses_last: 1;
+#endif
+
+} VP8_HEADER;
+
+#ifdef PACKET_TESTING
+#define VP8_HEADER_SIZE 8
+#else
+#define VP8_HEADER_SIZE 3
+#endif
+
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_HEADER_H_
diff --git a/thirdparty/libvpx/vp8/common/idct_blk.c b/thirdparty/libvpx/vp8/common/idct_blk.c
new file mode 100644
index 0000000000..8aa7d9bf0f
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/idct_blk.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp8_dequant_idct_add_c(short *input, short *dq,
+ unsigned char *dest, int stride);
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char * pred,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride);
+
+void vp8_dequant_idct_add_y_block_c
+ (short *q, short *dq,
+ unsigned char *dst, int stride, char *eobs)
+{
+ int i, j;
+
+ for (i = 0; i < 4; i++)
+ {
+ for (j = 0; j < 4; j++)
+ {
+ if (*eobs++ > 1)
+ vp8_dequant_idct_add_c (q, dq, dst, stride);
+ else
+ {
+ vp8_dc_only_idct_add_c (q[0]*dq[0], dst, stride, dst, stride);
+ memset(q, 0, 2 * sizeof(q[0]));
+ }
+
+ q += 16;
+ dst += 4;
+ }
+
+ dst += 4*stride - 16;
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_c
+ (short *q, short *dq,
+ unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+ int i, j;
+
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ if (*eobs++ > 1)
+ vp8_dequant_idct_add_c (q, dq, dstu, stride);
+ else
+ {
+ vp8_dc_only_idct_add_c (q[0]*dq[0], dstu, stride, dstu, stride);
+ memset(q, 0, 2 * sizeof(q[0]));
+ }
+
+ q += 16;
+ dstu += 4;
+ }
+
+ dstu += 4*stride - 8;
+ }
+
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ if (*eobs++ > 1)
+ vp8_dequant_idct_add_c (q, dq, dstv, stride);
+ else
+ {
+ vp8_dc_only_idct_add_c (q[0]*dq[0], dstv, stride, dstv, stride);
+ memset(q, 0, 2 * sizeof(q[0]));
+ }
+
+ q += 16;
+ dstv += 4;
+ }
+
+ dstv += 4*stride - 8;
+ }
+}
diff --git a/thirdparty/libvpx/vp8/common/idctllm.c b/thirdparty/libvpx/vp8/common/idctllm.c
new file mode 100644
index 0000000000..f5403c5aaf
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/idctllm.c
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+
+/****************************************************************************
+ * Notes:
+ *
+ * This implementation makes use of 16 bit fixed point verio of two multiply
+ * constants:
+ * 1. sqrt(2) * cos (pi/8)
+ * 2. sqrt(2) * sin (pi/8)
+ * Becuase the first constant is bigger than 1, to maintain the same 16 bit
+ * fixed point precision as the second one, we use a trick of
+ * x * a = x + x*(a-1)
+ * so
+ * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
+ **************************************************************************/
+static const int cospi8sqrt2minus1 = 20091;
+static const int sinpi8sqrt2 = 35468;
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride)
+{
+ int i;
+ int r, c;
+ int a1, b1, c1, d1;
+ short output[16];
+ short *ip = input;
+ short *op = output;
+ int temp1, temp2;
+ int shortpitch = 4;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[8];
+ b1 = ip[0] - ip[8];
+
+ temp1 = (ip[4] * sinpi8sqrt2) >> 16;
+ temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
+ temp2 = (ip[12] * sinpi8sqrt2) >> 16;
+ d1 = temp1 + temp2;
+
+ op[shortpitch*0] = a1 + d1;
+ op[shortpitch*3] = a1 - d1;
+
+ op[shortpitch*1] = b1 + c1;
+ op[shortpitch*2] = b1 - c1;
+
+ ip++;
+ op++;
+ }
+
+ ip = output;
+ op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[2];
+ b1 = ip[0] - ip[2];
+
+ temp1 = (ip[1] * sinpi8sqrt2) >> 16;
+ temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
+ temp2 = (ip[3] * sinpi8sqrt2) >> 16;
+ d1 = temp1 + temp2;
+
+
+ op[0] = (a1 + d1 + 4) >> 3;
+ op[3] = (a1 - d1 + 4) >> 3;
+
+ op[1] = (b1 + c1 + 4) >> 3;
+ op[2] = (b1 - c1 + 4) >> 3;
+
+ ip += shortpitch;
+ op += shortpitch;
+ }
+
+ ip = output;
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ int a = ip[c] + pred_ptr[c] ;
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dst_ptr[c] = (unsigned char) a ;
+ }
+ ip += 4;
+ dst_ptr += dst_stride;
+ pred_ptr += pred_stride;
+ }
+}
+
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride)
+{
+ int a1 = ((input_dc + 4) >> 3);
+ int r, c;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ int a = a1 + pred_ptr[c] ;
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dst_ptr[c] = (unsigned char) a ;
+ }
+
+ dst_ptr += dst_stride;
+ pred_ptr += pred_stride;
+ }
+
+}
+
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff)
+{
+ short output[16];
+ int i;
+ int a1, b1, c1, d1;
+ int a2, b2, c2, d2;
+ short *ip = input;
+ short *op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[12];
+ b1 = ip[4] + ip[8];
+ c1 = ip[4] - ip[8];
+ d1 = ip[0] - ip[12];
+
+ op[0] = a1 + b1;
+ op[4] = c1 + d1;
+ op[8] = a1 - b1;
+ op[12] = d1 - c1;
+ ip++;
+ op++;
+ }
+
+ ip = output;
+ op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[3];
+ b1 = ip[1] + ip[2];
+ c1 = ip[1] - ip[2];
+ d1 = ip[0] - ip[3];
+
+ a2 = a1 + b1;
+ b2 = c1 + d1;
+ c2 = a1 - b1;
+ d2 = d1 - c1;
+
+ op[0] = (a2 + 3) >> 3;
+ op[1] = (b2 + 3) >> 3;
+ op[2] = (c2 + 3) >> 3;
+ op[3] = (d2 + 3) >> 3;
+
+ ip += 4;
+ op += 4;
+ }
+
+ for(i = 0; i < 16; i++)
+ {
+ mb_dqcoeff[i * 16] = output[i];
+ }
+}
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff)
+{
+ int i;
+ int a1;
+
+ a1 = ((input[0] + 3) >> 3);
+ for(i = 0; i < 16; i++)
+ {
+ mb_dqcoeff[i * 16] = a1;
+ }
+}
diff --git a/thirdparty/libvpx/vp8/common/invtrans.h b/thirdparty/libvpx/vp8/common/invtrans.h
new file mode 100644
index 0000000000..9cfea8d513
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/invtrans.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_INVTRANS_H_
+#define VP8_COMMON_INVTRANS_H_
+
+#include "./vpx_config.h"
+#include "vp8_rtcd.h"
+#include "blockd.h"
+#include "onyxc_int.h"
+
+#if CONFIG_MULTITHREAD
+#include "vpx_mem/vpx_mem.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static void eob_adjust(char *eobs, short *diff)
+{
+ /* eob adjust.... the idct can only skip if both the dc and eob are zero */
+ int js;
+ for(js = 0; js < 16; js++)
+ {
+ if((eobs[js] == 0) && (diff[0] != 0))
+ eobs[js]++;
+ diff+=16;
+ }
+}
+
+static INLINE void vp8_inverse_transform_mby(MACROBLOCKD *xd)
+{
+ short *DQC = xd->dequant_y1;
+
+ if (xd->mode_info_context->mbmi.mode != SPLITMV)
+ {
+ /* do 2nd order transform on the dc block */
+ if (xd->eobs[24] > 1)
+ {
+ vp8_short_inv_walsh4x4
+ (&xd->block[24].dqcoeff[0], xd->qcoeff);
+ }
+ else
+ {
+ vp8_short_inv_walsh4x4_1
+ (&xd->block[24].dqcoeff[0], xd->qcoeff);
+ }
+ eob_adjust(xd->eobs, xd->qcoeff);
+
+ DQC = xd->dequant_y1_dc;
+ }
+ vp8_dequant_idct_add_y_block
+ (xd->qcoeff, DQC,
+ xd->dst.y_buffer,
+ xd->dst.y_stride, xd->eobs);
+}
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_INVTRANS_H_
diff --git a/thirdparty/libvpx/vp8/common/loopfilter.h b/thirdparty/libvpx/vp8/common/loopfilter.h
new file mode 100644
index 0000000000..20a6bd375b
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/loopfilter.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_LOOPFILTER_H_
+#define VP8_COMMON_LOOPFILTER_H_
+
+#include "vpx_ports/mem.h"
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_LOOP_FILTER 63
+/* fraction of total macroblock rows to be used in fast filter level picking */
+/* has to be > 2 */
+#define PARTIAL_FRAME_FRACTION 8
+
+typedef enum
+{
+ NORMAL_LOOPFILTER = 0,
+ SIMPLE_LOOPFILTER = 1
+} LOOPFILTERTYPE;
+
+#if ARCH_ARM
+#define SIMD_WIDTH 1
+#else
+#define SIMD_WIDTH 16
+#endif
+
+/* Need to align this structure so when it is declared and
+ * passed it can be loaded into vector registers.
+ */
+typedef struct
+{
+ DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, hev_thr[4][SIMD_WIDTH]);
+ unsigned char lvl[4][4][4];
+ unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1];
+ unsigned char mode_lf_lut[10];
+} loop_filter_info_n;
+
+typedef struct loop_filter_info
+{
+ const unsigned char * mblim;
+ const unsigned char * blim;
+ const unsigned char * lim;
+ const unsigned char * hev_thr;
+} loop_filter_info;
+
+
+typedef void loop_filter_uvfunction
+(
+ unsigned char *u, /* source pointer */
+ int p, /* pitch */
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ unsigned char *v
+);
+
+/* assorted loopfilter functions which get used elsewhere */
+struct VP8Common;
+struct macroblockd;
+struct modeinfo;
+
+void vp8_loop_filter_init(struct VP8Common *cm);
+
+void vp8_loop_filter_frame_init(struct VP8Common *cm,
+ struct macroblockd *mbd,
+ int default_filt_lvl);
+
+void vp8_loop_filter_frame(struct VP8Common *cm, struct macroblockd *mbd,
+ int frame_type);
+
+void vp8_loop_filter_partial_frame(struct VP8Common *cm,
+ struct macroblockd *mbd,
+ int default_filt_lvl);
+
+void vp8_loop_filter_frame_yonly(struct VP8Common *cm,
+ struct macroblockd *mbd,
+ int default_filt_lvl);
+
+void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+ int sharpness_lvl);
+
+void vp8_loop_filter_row_normal(struct VP8Common *cm,
+ struct modeinfo *mode_info_context,
+ int mb_row, int post_ystride, int post_uvstride,
+ unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr);
+
+void vp8_loop_filter_row_simple(struct VP8Common *cm,
+ struct modeinfo *mode_info_context,
+ int mb_row, int post_ystride, int post_uvstride,
+ unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_LOOPFILTER_H_
diff --git a/thirdparty/libvpx/vp8/common/loopfilter_filters.c b/thirdparty/libvpx/vp8/common/loopfilter_filters.c
new file mode 100644
index 0000000000..1d51696ff7
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/loopfilter_filters.c
@@ -0,0 +1,430 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "loopfilter.h"
+#include "onyxc_int.h"
+
+typedef unsigned char uc;
+
+static signed char vp8_signed_char_clamp(int t)
+{
+ t = (t < -128 ? -128 : t);
+ t = (t > 127 ? 127 : t);
+ return (signed char) t;
+}
+
+
+/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
+static signed char vp8_filter_mask(uc limit, uc blimit,
+ uc p3, uc p2, uc p1, uc p0,
+ uc q0, uc q1, uc q2, uc q3)
+{
+ signed char mask = 0;
+ mask |= (abs(p3 - p2) > limit);
+ mask |= (abs(p2 - p1) > limit);
+ mask |= (abs(p1 - p0) > limit);
+ mask |= (abs(q1 - q0) > limit);
+ mask |= (abs(q2 - q1) > limit);
+ mask |= (abs(q3 - q2) > limit);
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit);
+ return mask - 1;
+}
+
+/* is there high variance internal edge ( 11111111 yes, 00000000 no) */
+static signed char vp8_hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1)
+{
+ signed char hev = 0;
+ hev |= (abs(p1 - p0) > thresh) * -1;
+ hev |= (abs(q1 - q0) > thresh) * -1;
+ return hev;
+}
+
+static void vp8_filter(signed char mask, uc hev, uc *op1,
+ uc *op0, uc *oq0, uc *oq1)
+
+{
+ signed char ps0, qs0;
+ signed char ps1, qs1;
+ signed char filter_value, Filter1, Filter2;
+ signed char u;
+
+ ps1 = (signed char) * op1 ^ 0x80;
+ ps0 = (signed char) * op0 ^ 0x80;
+ qs0 = (signed char) * oq0 ^ 0x80;
+ qs1 = (signed char) * oq1 ^ 0x80;
+
+ /* add outer taps if we have high edge variance */
+ filter_value = vp8_signed_char_clamp(ps1 - qs1);
+ filter_value &= hev;
+
+ /* inner taps */
+ filter_value = vp8_signed_char_clamp(filter_value + 3 * (qs0 - ps0));
+ filter_value &= mask;
+
+ /* save bottom 3 bits so that we round one side +4 and the other +3
+ * if it equals 4 we'll set to adjust by -1 to account for the fact
+ * we'd round 3 the other way
+ */
+ Filter1 = vp8_signed_char_clamp(filter_value + 4);
+ Filter2 = vp8_signed_char_clamp(filter_value + 3);
+ Filter1 >>= 3;
+ Filter2 >>= 3;
+ u = vp8_signed_char_clamp(qs0 - Filter1);
+ *oq0 = u ^ 0x80;
+ u = vp8_signed_char_clamp(ps0 + Filter2);
+ *op0 = u ^ 0x80;
+ filter_value = Filter1;
+
+ /* outer tap adjustments */
+ filter_value += 1;
+ filter_value >>= 1;
+ filter_value &= ~hev;
+
+ u = vp8_signed_char_clamp(qs1 - filter_value);
+ *oq1 = u ^ 0x80;
+ u = vp8_signed_char_clamp(ps1 + filter_value);
+ *op1 = u ^ 0x80;
+
+}
+void vp8_loop_filter_horizontal_edge_c
+(
+ unsigned char *s,
+ int p, /* pitch */
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count
+)
+{
+ int hev = 0; /* high edge variance */
+ signed char mask = 0;
+ int i = 0;
+
+ /* loop filter designed to work using chars so that we can make maximum use
+ * of 8 bit simd instructions.
+ */
+ do
+ {
+ mask = vp8_filter_mask(limit[0], blimit[0],
+ s[-4*p], s[-3*p], s[-2*p], s[-1*p],
+ s[0*p], s[1*p], s[2*p], s[3*p]);
+
+ hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
+
+ vp8_filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
+
+ ++s;
+ }
+ while (++i < count * 8);
+}
+
+void vp8_loop_filter_vertical_edge_c
+(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count
+)
+{
+ int hev = 0; /* high edge variance */
+ signed char mask = 0;
+ int i = 0;
+
+ /* loop filter designed to work using chars so that we can make maximum use
+ * of 8 bit simd instructions.
+ */
+ do
+ {
+ mask = vp8_filter_mask(limit[0], blimit[0],
+ s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);
+
+ hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
+
+ vp8_filter(mask, hev, s - 2, s - 1, s, s + 1);
+
+ s += p;
+ }
+ while (++i < count * 8);
+}
+
+static void vp8_mbfilter(signed char mask, uc hev,
+ uc *op2, uc *op1, uc *op0, uc *oq0, uc *oq1, uc *oq2)
+{
+ signed char s, u;
+ signed char filter_value, Filter1, Filter2;
+ signed char ps2 = (signed char) * op2 ^ 0x80;
+ signed char ps1 = (signed char) * op1 ^ 0x80;
+ signed char ps0 = (signed char) * op0 ^ 0x80;
+ signed char qs0 = (signed char) * oq0 ^ 0x80;
+ signed char qs1 = (signed char) * oq1 ^ 0x80;
+ signed char qs2 = (signed char) * oq2 ^ 0x80;
+
+ /* add outer taps if we have high edge variance */
+ filter_value = vp8_signed_char_clamp(ps1 - qs1);
+ filter_value = vp8_signed_char_clamp(filter_value + 3 * (qs0 - ps0));
+ filter_value &= mask;
+
+ Filter2 = filter_value;
+ Filter2 &= hev;
+
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
+ Filter1 = vp8_signed_char_clamp(Filter2 + 4);
+ Filter2 = vp8_signed_char_clamp(Filter2 + 3);
+ Filter1 >>= 3;
+ Filter2 >>= 3;
+ qs0 = vp8_signed_char_clamp(qs0 - Filter1);
+ ps0 = vp8_signed_char_clamp(ps0 + Filter2);
+
+
+ /* only apply wider filter if not high edge variance */
+ filter_value &= ~hev;
+ Filter2 = filter_value;
+
+ /* roughly 3/7th difference across boundary */
+ u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7);
+ s = vp8_signed_char_clamp(qs0 - u);
+ *oq0 = s ^ 0x80;
+ s = vp8_signed_char_clamp(ps0 + u);
+ *op0 = s ^ 0x80;
+
+ /* roughly 2/7th difference across boundary */
+ u = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7);
+ s = vp8_signed_char_clamp(qs1 - u);
+ *oq1 = s ^ 0x80;
+ s = vp8_signed_char_clamp(ps1 + u);
+ *op1 = s ^ 0x80;
+
+ /* roughly 1/7th difference across boundary */
+ u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7);
+ s = vp8_signed_char_clamp(qs2 - u);
+ *oq2 = s ^ 0x80;
+ s = vp8_signed_char_clamp(ps2 + u);
+ *op2 = s ^ 0x80;
+}
+
+void vp8_mbloop_filter_horizontal_edge_c
+(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count
+)
+{
+ signed char hev = 0; /* high edge variance */
+ signed char mask = 0;
+ int i = 0;
+
+ /* loop filter designed to work using chars so that we can make maximum use
+ * of 8 bit simd instructions.
+ */
+ do
+ {
+
+ mask = vp8_filter_mask(limit[0], blimit[0],
+ s[-4*p], s[-3*p], s[-2*p], s[-1*p],
+ s[0*p], s[1*p], s[2*p], s[3*p]);
+
+ hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
+
+ vp8_mbfilter(mask, hev, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p);
+
+ ++s;
+ }
+ while (++i < count * 8);
+
+}
+
+
+void vp8_mbloop_filter_vertical_edge_c
+(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count
+)
+{
+ signed char hev = 0; /* high edge variance */
+ signed char mask = 0;
+ int i = 0;
+
+ do
+ {
+
+ mask = vp8_filter_mask(limit[0], blimit[0],
+ s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);
+
+ hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
+
+ vp8_mbfilter(mask, hev, s - 3, s - 2, s - 1, s, s + 1, s + 2);
+
+ s += p;
+ }
+ while (++i < count * 8);
+
+}
+
+/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
+static signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q0, uc q1)
+{
+/* Why does this cause problems for win32?
+ * error C2143: syntax error : missing ';' before 'type'
+ * (void) limit;
+ */
+ signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) * -1;
+ return mask;
+}
+
+static void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0, uc *oq1)
+{
+ signed char filter_value, Filter1, Filter2;
+ signed char p1 = (signed char) * op1 ^ 0x80;
+ signed char p0 = (signed char) * op0 ^ 0x80;
+ signed char q0 = (signed char) * oq0 ^ 0x80;
+ signed char q1 = (signed char) * oq1 ^ 0x80;
+ signed char u;
+
+ filter_value = vp8_signed_char_clamp(p1 - q1);
+ filter_value = vp8_signed_char_clamp(filter_value + 3 * (q0 - p0));
+ filter_value &= mask;
+
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
+ Filter1 = vp8_signed_char_clamp(filter_value + 4);
+ Filter1 >>= 3;
+ u = vp8_signed_char_clamp(q0 - Filter1);
+ *oq0 = u ^ 0x80;
+
+ Filter2 = vp8_signed_char_clamp(filter_value + 3);
+ Filter2 >>= 3;
+ u = vp8_signed_char_clamp(p0 + Filter2);
+ *op0 = u ^ 0x80;
+}
+
+void vp8_loop_filter_simple_horizontal_edge_c
+(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit
+)
+{
+ signed char mask = 0;
+ int i = 0;
+
+ do
+ {
+ mask = vp8_simple_filter_mask(blimit[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
+ vp8_simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p);
+ ++s;
+ }
+ while (++i < 16);
+}
+
+void vp8_loop_filter_simple_vertical_edge_c
+(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit
+)
+{
+ signed char mask = 0;
+ int i = 0;
+
+ do
+ {
+ mask = vp8_simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
+ vp8_simple_filter(mask, s - 2, s - 1, s, s + 1);
+ s += p;
+ }
+ while (++i < 16);
+
+}
+
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ loop_filter_info *lfi)
+{
+ vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ loop_filter_info *lfi)
+{
+ vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ loop_filter_info *lfi)
+{
+ vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit)
+{
+ vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, blimit);
+}
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ loop_filter_info *lfi)
+{
+ vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit)
+{
+ vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);
+}
diff --git a/thirdparty/libvpx/vp8/common/mbpitch.c b/thirdparty/libvpx/vp8/common/mbpitch.c
new file mode 100644
index 0000000000..32e1b66409
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/mbpitch.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "blockd.h"
+
+void vp8_setup_block_dptrs(MACROBLOCKD *x)
+{
+ int r, c;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ x->block[r*4+c].predictor = x->predictor + r * 4 * 16 + c * 4;
+ }
+ }
+
+ for (r = 0; r < 2; r++)
+ {
+ for (c = 0; c < 2; c++)
+ {
+ x->block[16+r*2+c].predictor = x->predictor + 256 + r * 4 * 8 + c * 4;
+
+ }
+ }
+
+ for (r = 0; r < 2; r++)
+ {
+ for (c = 0; c < 2; c++)
+ {
+ x->block[20+r*2+c].predictor = x->predictor + 320 + r * 4 * 8 + c * 4;
+
+ }
+ }
+
+ for (r = 0; r < 25; r++)
+ {
+ x->block[r].qcoeff = x->qcoeff + r * 16;
+ x->block[r].dqcoeff = x->dqcoeff + r * 16;
+ x->block[r].eob = x->eobs + r;
+ }
+}
+
+void vp8_build_block_doffsets(MACROBLOCKD *x)
+{
+ int block;
+
+ for (block = 0; block < 16; block++) /* y blocks */
+ {
+ x->block[block].offset =
+ (block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4;
+ }
+
+ for (block = 16; block < 20; block++) /* U and V blocks */
+ {
+ x->block[block+4].offset =
+ x->block[block].offset =
+ ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4;
+ }
+}
diff --git a/thirdparty/libvpx/vp8/common/modecont.c b/thirdparty/libvpx/vp8/common/modecont.c
new file mode 100644
index 0000000000..86a74bc0ff
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/modecont.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "entropy.h"
+
+const int vp8_mode_contexts[6][4] =
+{
+ {
+ /* 0 */
+ 7, 1, 1, 143,
+ },
+ {
+ /* 1 */
+ 14, 18, 14, 107,
+ },
+ {
+ /* 2 */
+ 135, 64, 57, 68,
+ },
+ {
+ /* 3 */
+ 60, 56, 128, 65,
+ },
+ {
+ /* 4 */
+ 159, 134, 128, 34,
+ },
+ {
+ /* 5 */
+ 234, 188, 128, 28,
+ },
+};
diff --git a/thirdparty/libvpx/vp8/common/modecont.h b/thirdparty/libvpx/vp8/common/modecont.h
new file mode 100644
index 0000000000..ff34c33c55
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/modecont.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_MODECONT_H_
+#define VP8_COMMON_MODECONT_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern const int vp8_mode_contexts[6][4];
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_MODECONT_H_
diff --git a/thirdparty/libvpx/vp8/common/mv.h b/thirdparty/libvpx/vp8/common/mv.h
new file mode 100644
index 0000000000..111ccd63c7
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/mv.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_MV_H_
+#define VP8_COMMON_MV_H_
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct
+{
+ short row;
+ short col;
+} MV;
+
+typedef union int_mv
+{
+ uint32_t as_int;
+ MV as_mv;
+} int_mv; /* facilitates faster equality tests and copies */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_MV_H_
diff --git a/thirdparty/libvpx/vp8/common/onyxc_int.h b/thirdparty/libvpx/vp8/common/onyxc_int.h
new file mode 100644
index 0000000000..6d89865c60
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/onyxc_int.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_ONYXC_INT_H_
+#define VP8_COMMON_ONYXC_INT_H_
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "loopfilter.h"
+#include "entropymv.h"
+#include "entropy.h"
+#if CONFIG_POSTPROC
+#include "postproc.h"
+#endif
+
+/*#ifdef PACKET_TESTING*/
+#include "header.h"
+/*#endif*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MINQ 0
+#define MAXQ 127
+#define QINDEX_RANGE (MAXQ + 1)
+
+#define NUM_YV12_BUFFERS 4
+
+#define MAX_PARTITIONS 9
+
+typedef struct frame_contexts
+{
+ vp8_prob bmode_prob [VP8_BINTRAMODES-1];
+ vp8_prob ymode_prob [VP8_YMODES-1]; /* interframe intra mode probs */
+ vp8_prob uv_mode_prob [VP8_UV_MODES-1];
+ vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1];
+ vp8_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+ MV_CONTEXT mvc[2];
+} FRAME_CONTEXT;
+
+typedef enum
+{
+ ONE_PARTITION = 0,
+ TWO_PARTITION = 1,
+ FOUR_PARTITION = 2,
+ EIGHT_PARTITION = 3
+} TOKEN_PARTITION;
+
+typedef enum
+{
+ RECON_CLAMP_REQUIRED = 0,
+ RECON_CLAMP_NOTREQUIRED = 1
+} CLAMP_TYPE;
+
+typedef struct VP8Common
+
+{
+ struct vpx_internal_error_info error;
+
+ DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][2]);
+ DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][2]);
+ DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][2]);
+
+ int Width;
+ int Height;
+ int horiz_scale;
+ int vert_scale;
+
+ CLAMP_TYPE clamp_type;
+
+ YV12_BUFFER_CONFIG *frame_to_show;
+
+ YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS];
+ int fb_idx_ref_cnt[NUM_YV12_BUFFERS];
+ int new_fb_idx, lst_fb_idx, gld_fb_idx, alt_fb_idx;
+
+ YV12_BUFFER_CONFIG temp_scale_frame;
+
+#if CONFIG_POSTPROC
+ YV12_BUFFER_CONFIG post_proc_buffer;
+ YV12_BUFFER_CONFIG post_proc_buffer_int;
+ int post_proc_buffer_int_used;
+ unsigned char *pp_limits_buffer; /* post-processing filter coefficients */
+#endif
+
+ FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */
+ FRAME_TYPE frame_type;
+
+ int show_frame;
+
+ int frame_flags;
+ int MBs;
+ int mb_rows;
+ int mb_cols;
+ int mode_info_stride;
+
+ /* profile settings */
+ int mb_no_coeff_skip;
+ int no_lpf;
+ int use_bilinear_mc_filter;
+ int full_pixel;
+
+ int base_qindex;
+
+ int y1dc_delta_q;
+ int y2dc_delta_q;
+ int y2ac_delta_q;
+ int uvdc_delta_q;
+ int uvac_delta_q;
+
+ /* We allocate a MODE_INFO struct for each macroblock, together with
+ an extra row on top and column on the left to simplify prediction. */
+
+ MODE_INFO *mip; /* Base of allocated array */
+ MODE_INFO *mi; /* Corresponds to upper left visible macroblock */
+#if CONFIG_ERROR_CONCEALMENT
+ MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
+ MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */
+#endif
+ MODE_INFO *show_frame_mi; /* MODE_INFO for the last decoded frame
+ to show */
+ LOOPFILTERTYPE filter_type;
+
+ loop_filter_info_n lf_info;
+
+ int filter_level;
+ int last_sharpness_level;
+ int sharpness_level;
+
+ int refresh_last_frame; /* Two state 0 = NO, 1 = YES */
+ int refresh_golden_frame; /* Two state 0 = NO, 1 = YES */
+ int refresh_alt_ref_frame; /* Two state 0 = NO, 1 = YES */
+
+ int copy_buffer_to_gf; /* 0 none, 1 Last to GF, 2 ARF to GF */
+ int copy_buffer_to_arf; /* 0 none, 1 Last to ARF, 2 GF to ARF */
+
+ int refresh_entropy_probs; /* Two state 0 = NO, 1 = YES */
+
+ int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */
+
+ /* Y,U,V,Y2 */
+ ENTROPY_CONTEXT_PLANES *above_context; /* row of context for each plane */
+ ENTROPY_CONTEXT_PLANES left_context; /* (up to) 4 contexts "" */
+
+ FRAME_CONTEXT lfc; /* last frame entropy */
+ FRAME_CONTEXT fc; /* this frame entropy */
+
+ unsigned int current_video_frame;
+
+ int version;
+
+ TOKEN_PARTITION multi_token_partition;
+
+#ifdef PACKET_TESTING
+ VP8_HEADER oh;
+#endif
+#if CONFIG_POSTPROC_VISUALIZER
+ double bitrate;
+ double framerate;
+#endif
+
+#if CONFIG_MULTITHREAD
+ int processor_core_count;
+#endif
+#if CONFIG_POSTPROC
+ struct postproc_state postproc_state;
+#endif
+ int cpu_caps;
+} VP8_COMMON;
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_ONYXC_INT_H_
diff --git a/thirdparty/libvpx/vp8/common/onyxd.h b/thirdparty/libvpx/vp8/common/onyxd.h
new file mode 100644
index 0000000000..e37b29f32c
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/onyxd.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_ONYXD_H_
+#define VP8_COMMON_ONYXD_H_
+
+
+/* Create/destroy static data structures. */
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include "vpx_scale/yv12config.h"
+#include "ppflags.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vp8.h"
+
+ struct VP8D_COMP;
+
+ typedef struct
+ {
+ int Width;
+ int Height;
+ int Version;
+ int postprocess;
+ int max_threads;
+ int error_concealment;
+ } VP8D_CONFIG;
+
+ typedef enum
+ {
+ VP8D_OK = 0
+ } VP8D_SETTING;
+
+ void vp8dx_initialize(void);
+
+ void vp8dx_set_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst, int x);
+
+ int vp8dx_get_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst);
+
+ int vp8dx_receive_compressed_data(struct VP8D_COMP* comp,
+ size_t size, const uint8_t *dest,
+ int64_t time_stamp);
+ int vp8dx_get_raw_frame(struct VP8D_COMP* comp, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags);
+
+ vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+ vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif // VP8_COMMON_ONYXD_H_
diff --git a/thirdparty/libvpx/vp8/common/ppflags.h b/thirdparty/libvpx/vp8/common/ppflags.h
new file mode 100644
index 0000000000..768224aad5
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/ppflags.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_PPFLAGS_H_
+#define VP8_COMMON_PPFLAGS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+enum
+{
+ VP8D_NOFILTERING = 0,
+ VP8D_DEBLOCK = 1<<0,
+ VP8D_DEMACROBLOCK = 1<<1,
+ VP8D_ADDNOISE = 1<<2,
+ VP8D_DEBUG_TXT_FRAME_INFO = 1<<3,
+ VP8D_DEBUG_TXT_MBLK_MODES = 1<<4,
+ VP8D_DEBUG_TXT_DC_DIFF = 1<<5,
+ VP8D_DEBUG_TXT_RATE_INFO = 1<<6,
+ VP8D_DEBUG_DRAW_MV = 1<<7,
+ VP8D_DEBUG_CLR_BLK_MODES = 1<<8,
+ VP8D_DEBUG_CLR_FRM_REF_BLKS = 1<<9,
+ VP8D_MFQE = 1<<10
+};
+
+typedef struct
+{
+ int post_proc_flag;
+ int deblocking_level;
+ int noise_level;
+ int display_ref_frame_flag;
+ int display_mb_modes_flag;
+ int display_b_modes_flag;
+ int display_mv_flag;
+} vp8_ppflags_t;
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_PPFLAGS_H_
diff --git a/thirdparty/libvpx/vp8/common/quant_common.c b/thirdparty/libvpx/vp8/common/quant_common.c
new file mode 100644
index 0000000000..05f9210702
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/quant_common.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "quant_common.h"
+
+static const int dc_qlookup[QINDEX_RANGE] =
+{
+ 4, 5, 6, 7, 8, 9, 10, 10, 11, 12, 13, 14, 15, 16, 17, 17,
+ 18, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 25, 25, 26, 27, 28,
+ 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43,
+ 44, 45, 46, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
+ 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 91, 93, 95, 96, 98, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118,
+ 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 143, 145, 148, 151, 154, 157,
+};
+
+static const int ac_qlookup[QINDEX_RANGE] =
+{
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
+ 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
+ 52, 53, 54, 55, 56, 57, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76,
+ 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108,
+ 110, 112, 114, 116, 119, 122, 125, 128, 131, 134, 137, 140, 143, 146, 149, 152,
+ 155, 158, 161, 164, 167, 170, 173, 177, 181, 185, 189, 193, 197, 201, 205, 209,
+ 213, 217, 221, 225, 229, 234, 239, 245, 249, 254, 259, 264, 269, 274, 279, 284,
+};
+
+
+int vp8_dc_quant(int QIndex, int Delta)
+{
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127)
+ QIndex = 127;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = dc_qlookup[ QIndex ];
+ return retval;
+}
+
+int vp8_dc2quant(int QIndex, int Delta)
+{
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127)
+ QIndex = 127;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = dc_qlookup[ QIndex ] * 2;
+ return retval;
+
+}
+int vp8_dc_uv_quant(int QIndex, int Delta)
+{
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127)
+ QIndex = 127;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = dc_qlookup[ QIndex ];
+
+ if (retval > 132)
+ retval = 132;
+
+ return retval;
+}
+
+int vp8_ac_yquant(int QIndex)
+{
+ int retval;
+
+ if (QIndex > 127)
+ QIndex = 127;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = ac_qlookup[ QIndex ];
+ return retval;
+}
+
+int vp8_ac2quant(int QIndex, int Delta)
+{
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127)
+ QIndex = 127;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ /* For all x in [0..284], x*155/100 is bitwise equal to (x*101581) >> 16.
+ * The smallest precision for that is '(x*6349) >> 12' but 16 is a good
+ * word size. */
+ retval = (ac_qlookup[ QIndex ] * 101581) >> 16;
+
+ if (retval < 8)
+ retval = 8;
+
+ return retval;
+}
+int vp8_ac_uv_quant(int QIndex, int Delta)
+{
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127)
+ QIndex = 127;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = ac_qlookup[ QIndex ];
+ return retval;
+}
diff --git a/thirdparty/libvpx/vp8/common/quant_common.h b/thirdparty/libvpx/vp8/common/quant_common.h
new file mode 100644
index 0000000000..700b5e6d72
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/quant_common.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_COMMON_QUANT_COMMON_H_
+#define VP8_COMMON_QUANT_COMMON_H_
+
+
+#include "string.h"
+#include "blockd.h"
+#include "onyxc_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int vp8_ac_yquant(int QIndex);
+extern int vp8_dc_quant(int QIndex, int Delta);
+extern int vp8_dc2quant(int QIndex, int Delta);
+extern int vp8_ac2quant(int QIndex, int Delta);
+extern int vp8_dc_uv_quant(int QIndex, int Delta);
+extern int vp8_ac_uv_quant(int QIndex, int Delta);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_QUANT_COMMON_H_
diff --git a/thirdparty/libvpx/vp8/common/reconinter.c b/thirdparty/libvpx/vp8/common/reconinter.c
new file mode 100644
index 0000000000..e302595587
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/reconinter.c
@@ -0,0 +1,544 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <limits.h>
+#include <string.h>
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "blockd.h"
+#include "reconinter.h"
+#if CONFIG_RUNTIME_CPU_DETECT
+#include "onyxc_int.h"
+#endif
+
+void vp8_copy_mem16x16_c(
+ unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride)
+{
+
+ int r;
+
+ for (r = 0; r < 16; r++)
+ {
+ memcpy(dst, src, 16);
+
+ src += src_stride;
+ dst += dst_stride;
+
+ }
+
+}
+
+void vp8_copy_mem8x8_c(
+ unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride)
+{
+ int r;
+
+ for (r = 0; r < 8; r++)
+ {
+ memcpy(dst, src, 8);
+
+ src += src_stride;
+ dst += dst_stride;
+
+ }
+
+}
+
+void vp8_copy_mem8x4_c(
+ unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride)
+{
+ int r;
+
+ for (r = 0; r < 4; r++)
+ {
+ memcpy(dst, src, 8);
+
+ src += src_stride;
+ dst += dst_stride;
+
+ }
+
+}
+
+
+void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, unsigned char *base_pre, int pre_stride, vp8_subpix_fn_t sppf)
+{
+ int r;
+ unsigned char *pred_ptr = d->predictor;
+ unsigned char *ptr;
+ ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+ if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
+ {
+ sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch);
+ }
+ else
+ {
+ for (r = 0; r < 4; r++)
+ {
+ pred_ptr[0] = ptr[0];
+ pred_ptr[1] = ptr[1];
+ pred_ptr[2] = ptr[2];
+ pred_ptr[3] = ptr[3];
+ pred_ptr += pitch;
+ ptr += pre_stride;
+ }
+ }
+}
+
+static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride)
+{
+ unsigned char *ptr;
+ ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+ if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
+ {
+ x->subpixel_predict8x8(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
+ }
+ else
+ {
+ vp8_copy_mem8x8(ptr, pre_stride, dst, dst_stride);
+ }
+}
+
+static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride)
+{
+ unsigned char *ptr;
+ ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+ if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
+ {
+ x->subpixel_predict8x4(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
+ }
+ else
+ {
+ vp8_copy_mem8x4(ptr, pre_stride, dst, dst_stride);
+ }
+}
+
+static void build_inter_predictors_b(BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride, vp8_subpix_fn_t sppf)
+{
+ int r;
+ unsigned char *ptr;
+ ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+ if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
+ {
+ sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
+ }
+ else
+ {
+ for (r = 0; r < 4; r++)
+ {
+ dst[0] = ptr[0];
+ dst[1] = ptr[1];
+ dst[2] = ptr[2];
+ dst[3] = ptr[3];
+ dst += dst_stride;
+ ptr += pre_stride;
+ }
+ }
+}
+
+
+/*encoder only*/
+void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x)
+{
+ unsigned char *uptr, *vptr;
+ unsigned char *upred_ptr = &x->predictor[256];
+ unsigned char *vpred_ptr = &x->predictor[320];
+
+ int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
+ int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
+ int offset;
+ int pre_stride = x->pre.uv_stride;
+
+ /* calc uv motion vectors */
+ mv_row += 1 | (mv_row >> (sizeof(int) * CHAR_BIT - 1));
+ mv_col += 1 | (mv_col >> (sizeof(int) * CHAR_BIT - 1));
+ mv_row /= 2;
+ mv_col /= 2;
+ mv_row &= x->fullpixel_mask;
+ mv_col &= x->fullpixel_mask;
+
+ offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+ uptr = x->pre.u_buffer + offset;
+ vptr = x->pre.v_buffer + offset;
+
+ if ((mv_row | mv_col) & 7)
+ {
+ x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8);
+ x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8);
+ }
+ else
+ {
+ vp8_copy_mem8x8(uptr, pre_stride, upred_ptr, 8);
+ vp8_copy_mem8x8(vptr, pre_stride, vpred_ptr, 8);
+ }
+}
+
+/*encoder only*/
+void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x)
+{
+ int i, j;
+ int pre_stride = x->pre.uv_stride;
+ unsigned char *base_pre;
+
+ /* build uv mvs */
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ int yoffset = i * 8 + j * 2;
+ int uoffset = 16 + i * 2 + j;
+ int voffset = 20 + i * 2 + j;
+
+ int temp;
+
+ temp = x->block[yoffset ].bmi.mv.as_mv.row
+ + x->block[yoffset+1].bmi.mv.as_mv.row
+ + x->block[yoffset+4].bmi.mv.as_mv.row
+ + x->block[yoffset+5].bmi.mv.as_mv.row;
+
+ temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8);
+
+ x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask;
+
+ temp = x->block[yoffset ].bmi.mv.as_mv.col
+ + x->block[yoffset+1].bmi.mv.as_mv.col
+ + x->block[yoffset+4].bmi.mv.as_mv.col
+ + x->block[yoffset+5].bmi.mv.as_mv.col;
+
+ temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8);
+
+ x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask;
+
+ x->block[voffset].bmi.mv.as_int = x->block[uoffset].bmi.mv.as_int;
+ }
+ }
+
+ base_pre = x->pre.u_buffer;
+ for (i = 16; i < 20; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ build_inter_predictors2b(x, d0, d0->predictor, 8, base_pre, pre_stride);
+ else
+ {
+ vp8_build_inter_predictors_b(d0, 8, base_pre, pre_stride, x->subpixel_predict);
+ vp8_build_inter_predictors_b(d1, 8, base_pre, pre_stride, x->subpixel_predict);
+ }
+ }
+
+ base_pre = x->pre.v_buffer;
+ for (i = 20; i < 24; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ build_inter_predictors2b(x, d0, d0->predictor, 8, base_pre, pre_stride);
+ else
+ {
+ vp8_build_inter_predictors_b(d0, 8, base_pre, pre_stride, x->subpixel_predict);
+ vp8_build_inter_predictors_b(d1, 8, base_pre, pre_stride, x->subpixel_predict);
+ }
+ }
+}
+
+
+/*encoder only*/
+void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,
+ unsigned char *dst_y,
+ int dst_ystride)
+{
+ unsigned char *ptr_base;
+ unsigned char *ptr;
+ int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
+ int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
+ int pre_stride = x->pre.y_stride;
+
+ ptr_base = x->pre.y_buffer;
+ ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
+
+ if ((mv_row | mv_col) & 7)
+ {
+ x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7,
+ dst_y, dst_ystride);
+ }
+ else
+ {
+ vp8_copy_mem16x16(ptr, pre_stride, dst_y,
+ dst_ystride);
+ }
+}
+
+static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd)
+{
+ /* If the MV points so far into the UMV border that no visible pixels
+ * are used for reconstruction, the subpel part of the MV can be
+ * discarded and the MV limited to 16 pixels with equivalent results.
+ *
+ * This limit kicks in at 19 pixels for the top and left edges, for
+ * the 16 pixels plus 3 taps right of the central pixel when subpel
+ * filtering. The bottom and right edges use 16 pixels plus 2 pixels
+ * left of the central pixel when filtering.
+ */
+ if (mv->col < (xd->mb_to_left_edge - (19 << 3)))
+ mv->col = xd->mb_to_left_edge - (16 << 3);
+ else if (mv->col > xd->mb_to_right_edge + (18 << 3))
+ mv->col = xd->mb_to_right_edge + (16 << 3);
+
+ if (mv->row < (xd->mb_to_top_edge - (19 << 3)))
+ mv->row = xd->mb_to_top_edge - (16 << 3);
+ else if (mv->row > xd->mb_to_bottom_edge + (18 << 3))
+ mv->row = xd->mb_to_bottom_edge + (16 << 3);
+}
+
+/* A version of the above function for chroma block MVs.*/
+static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd)
+{
+ mv->col = (2*mv->col < (xd->mb_to_left_edge - (19 << 3))) ?
+ (xd->mb_to_left_edge - (16 << 3)) >> 1 : mv->col;
+ mv->col = (2*mv->col > xd->mb_to_right_edge + (18 << 3)) ?
+ (xd->mb_to_right_edge + (16 << 3)) >> 1 : mv->col;
+
+ mv->row = (2*mv->row < (xd->mb_to_top_edge - (19 << 3))) ?
+ (xd->mb_to_top_edge - (16 << 3)) >> 1 : mv->row;
+ mv->row = (2*mv->row > xd->mb_to_bottom_edge + (18 << 3)) ?
+ (xd->mb_to_bottom_edge + (16 << 3)) >> 1 : mv->row;
+}
+
+void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
+ unsigned char *dst_y,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int dst_ystride,
+ int dst_uvstride)
+{
+ int offset;
+ unsigned char *ptr;
+ unsigned char *uptr, *vptr;
+
+ int_mv _16x16mv;
+
+ unsigned char *ptr_base = x->pre.y_buffer;
+ int pre_stride = x->pre.y_stride;
+
+ _16x16mv.as_int = x->mode_info_context->mbmi.mv.as_int;
+
+ if (x->mode_info_context->mbmi.need_to_clamp_mvs)
+ {
+ clamp_mv_to_umv_border(&_16x16mv.as_mv, x);
+ }
+
+ ptr = ptr_base + ( _16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3);
+
+ if ( _16x16mv.as_int & 0x00070007)
+ {
+ x->subpixel_predict16x16(ptr, pre_stride, _16x16mv.as_mv.col & 7, _16x16mv.as_mv.row & 7, dst_y, dst_ystride);
+ }
+ else
+ {
+ vp8_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
+ }
+
+ /* calc uv motion vectors */
+ _16x16mv.as_mv.row += 1 | (_16x16mv.as_mv.row >> (sizeof(int) * CHAR_BIT - 1));
+ _16x16mv.as_mv.col += 1 | (_16x16mv.as_mv.col >> (sizeof(int) * CHAR_BIT - 1));
+ _16x16mv.as_mv.row /= 2;
+ _16x16mv.as_mv.col /= 2;
+ _16x16mv.as_mv.row &= x->fullpixel_mask;
+ _16x16mv.as_mv.col &= x->fullpixel_mask;
+
+ pre_stride >>= 1;
+ offset = ( _16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3);
+ uptr = x->pre.u_buffer + offset;
+ vptr = x->pre.v_buffer + offset;
+
+ if ( _16x16mv.as_int & 0x00070007)
+ {
+ x->subpixel_predict8x8(uptr, pre_stride, _16x16mv.as_mv.col & 7, _16x16mv.as_mv.row & 7, dst_u, dst_uvstride);
+ x->subpixel_predict8x8(vptr, pre_stride, _16x16mv.as_mv.col & 7, _16x16mv.as_mv.row & 7, dst_v, dst_uvstride);
+ }
+ else
+ {
+ vp8_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
+ vp8_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
+ }
+}
+
+static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
+{
+ int i;
+ unsigned char *base_dst = x->dst.y_buffer;
+ unsigned char *base_pre = x->pre.y_buffer;
+
+ if (x->mode_info_context->mbmi.partitioning < 3)
+ {
+ BLOCKD *b;
+ int dst_stride = x->dst.y_stride;
+
+ x->block[ 0].bmi = x->mode_info_context->bmi[ 0];
+ x->block[ 2].bmi = x->mode_info_context->bmi[ 2];
+ x->block[ 8].bmi = x->mode_info_context->bmi[ 8];
+ x->block[10].bmi = x->mode_info_context->bmi[10];
+ if (x->mode_info_context->mbmi.need_to_clamp_mvs)
+ {
+ clamp_mv_to_umv_border(&x->block[ 0].bmi.mv.as_mv, x);
+ clamp_mv_to_umv_border(&x->block[ 2].bmi.mv.as_mv, x);
+ clamp_mv_to_umv_border(&x->block[ 8].bmi.mv.as_mv, x);
+ clamp_mv_to_umv_border(&x->block[10].bmi.mv.as_mv, x);
+ }
+
+ b = &x->block[ 0];
+ build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
+ b = &x->block[ 2];
+ build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
+ b = &x->block[ 8];
+ build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
+ b = &x->block[10];
+ build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
+ }
+ else
+ {
+ for (i = 0; i < 16; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+ int dst_stride = x->dst.y_stride;
+
+ x->block[i+0].bmi = x->mode_info_context->bmi[i+0];
+ x->block[i+1].bmi = x->mode_info_context->bmi[i+1];
+ if (x->mode_info_context->mbmi.need_to_clamp_mvs)
+ {
+ clamp_mv_to_umv_border(&x->block[i+0].bmi.mv.as_mv, x);
+ clamp_mv_to_umv_border(&x->block[i+1].bmi.mv.as_mv, x);
+ }
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride);
+ else
+ {
+ build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+ build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+ }
+
+ }
+
+ }
+ base_dst = x->dst.u_buffer;
+ base_pre = x->pre.u_buffer;
+ for (i = 16; i < 20; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+ int dst_stride = x->dst.uv_stride;
+
+ /* Note: uv mvs already clamped in build_4x4uvmvs() */
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride);
+ else
+ {
+ build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+ build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+ }
+ }
+
+ base_dst = x->dst.v_buffer;
+ base_pre = x->pre.v_buffer;
+ for (i = 20; i < 24; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+ int dst_stride = x->dst.uv_stride;
+
+ /* Note: uv mvs already clamped in build_4x4uvmvs() */
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride);
+ else
+ {
+ build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+ build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+ }
+ }
+}
+
+static
+void build_4x4uvmvs(MACROBLOCKD *x)
+{
+ int i, j;
+
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ int yoffset = i * 8 + j * 2;
+ int uoffset = 16 + i * 2 + j;
+ int voffset = 20 + i * 2 + j;
+
+ int temp;
+
+ temp = x->mode_info_context->bmi[yoffset + 0].mv.as_mv.row
+ + x->mode_info_context->bmi[yoffset + 1].mv.as_mv.row
+ + x->mode_info_context->bmi[yoffset + 4].mv.as_mv.row
+ + x->mode_info_context->bmi[yoffset + 5].mv.as_mv.row;
+
+ temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8);
+
+ x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask;
+
+ temp = x->mode_info_context->bmi[yoffset + 0].mv.as_mv.col
+ + x->mode_info_context->bmi[yoffset + 1].mv.as_mv.col
+ + x->mode_info_context->bmi[yoffset + 4].mv.as_mv.col
+ + x->mode_info_context->bmi[yoffset + 5].mv.as_mv.col;
+
+ temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8);
+
+ x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask;
+
+ if (x->mode_info_context->mbmi.need_to_clamp_mvs)
+ clamp_uvmv_to_umv_border(&x->block[uoffset].bmi.mv.as_mv, x);
+
+ x->block[voffset].bmi.mv.as_int = x->block[uoffset].bmi.mv.as_int;
+ }
+ }
+}
+
+void vp8_build_inter_predictors_mb(MACROBLOCKD *xd)
+{
+ if (xd->mode_info_context->mbmi.mode != SPLITMV)
+ {
+ vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.y_stride, xd->dst.uv_stride);
+ }
+ else
+ {
+ build_4x4uvmvs(xd);
+ build_inter4x4_predictors_mb(xd);
+ }
+}
diff --git a/thirdparty/libvpx/vp8/common/reconinter.h b/thirdparty/libvpx/vp8/common/reconinter.h
new file mode 100644
index 0000000000..ba979b9664
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/reconinter.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_RECONINTER_H_
+#define VP8_COMMON_RECONINTER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x);
+extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
+ unsigned char *dst_y,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int dst_ystride,
+ int dst_uvstride);
+
+
+extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,
+ unsigned char *dst_y,
+ int dst_ystride);
+extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch,
+ unsigned char *base_pre,
+ int pre_stride,
+ vp8_subpix_fn_t sppf);
+
+extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x);
+extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_RECONINTER_H_
diff --git a/thirdparty/libvpx/vp8/common/reconintra.c b/thirdparty/libvpx/vp8/common/reconintra.c
new file mode 100644
index 0000000000..356655dac7
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/reconintra.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vp8_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_once.h"
+#include "blockd.h"
+#include "vp8/common/reconintra.h"
+#include "vp8/common/reconintra4x4.h"
+
+enum {
+ SIZE_16,
+ SIZE_8,
+ NUM_SIZES,
+};
+
+typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left);
+
+static intra_pred_fn pred[4][NUM_SIZES];
+static intra_pred_fn dc_pred[2][2][NUM_SIZES];
+
+static void vp8_init_intra_predictors_internal(void)
+{
+#define INIT_SIZE(sz) \
+ pred[V_PRED][SIZE_##sz] = vpx_v_predictor_##sz##x##sz; \
+ pred[H_PRED][SIZE_##sz] = vpx_h_predictor_##sz##x##sz; \
+ pred[TM_PRED][SIZE_##sz] = vpx_tm_predictor_##sz##x##sz; \
+ \
+ dc_pred[0][0][SIZE_##sz] = vpx_dc_128_predictor_##sz##x##sz; \
+ dc_pred[0][1][SIZE_##sz] = vpx_dc_top_predictor_##sz##x##sz; \
+ dc_pred[1][0][SIZE_##sz] = vpx_dc_left_predictor_##sz##x##sz; \
+ dc_pred[1][1][SIZE_##sz] = vpx_dc_predictor_##sz##x##sz
+
+ INIT_SIZE(16);
+ INIT_SIZE(8);
+ vp8_init_intra4x4_predictors_internal();
+}
+
+void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x,
+ unsigned char * yabove_row,
+ unsigned char * yleft,
+ int left_stride,
+ unsigned char * ypred_ptr,
+ int y_stride)
+{
+ MB_PREDICTION_MODE mode = x->mode_info_context->mbmi.mode;
+ DECLARE_ALIGNED(16, uint8_t, yleft_col[16]);
+ int i;
+ intra_pred_fn fn;
+
+ for (i = 0; i < 16; i++)
+ {
+ yleft_col[i] = yleft[i* left_stride];
+ }
+
+ if (mode == DC_PRED)
+ {
+ fn = dc_pred[x->left_available][x->up_available][SIZE_16];
+ }
+ else
+ {
+ fn = pred[mode][SIZE_16];
+ }
+
+ fn(ypred_ptr, y_stride, yabove_row, yleft_col);
+}
+
+void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x,
+ unsigned char * uabove_row,
+ unsigned char * vabove_row,
+ unsigned char * uleft,
+ unsigned char * vleft,
+ int left_stride,
+ unsigned char * upred_ptr,
+ unsigned char * vpred_ptr,
+ int pred_stride)
+{
+ MB_PREDICTION_MODE uvmode = x->mode_info_context->mbmi.uv_mode;
+ unsigned char uleft_col[8];
+ unsigned char vleft_col[8];
+ int i;
+ intra_pred_fn fn;
+
+ for (i = 0; i < 8; i++)
+ {
+ uleft_col[i] = uleft[i * left_stride];
+ vleft_col[i] = vleft[i * left_stride];
+ }
+
+ if (uvmode == DC_PRED)
+ {
+ fn = dc_pred[x->left_available][x->up_available][SIZE_8];
+ }
+ else
+ {
+ fn = pred[uvmode][SIZE_8];
+ }
+
+ fn(upred_ptr, pred_stride, uabove_row, uleft_col);
+ fn(vpred_ptr, pred_stride, vabove_row, vleft_col);
+}
+
+void vp8_init_intra_predictors(void)
+{
+ once(vp8_init_intra_predictors_internal);
+}
diff --git a/thirdparty/libvpx/vp8/common/reconintra.h b/thirdparty/libvpx/vp8/common/reconintra.h
new file mode 100644
index 0000000000..b6225a6637
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/reconintra.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_RECONINTRA_H_
+#define VP8_COMMON_RECONINTRA_H_
+
+#include "vp8/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x,
+ unsigned char *yabove_row,
+ unsigned char *yleft,
+ int left_stride,
+ unsigned char *ypred_ptr,
+ int y_stride);
+
+void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x,
+ unsigned char * uabove_row,
+ unsigned char * vabove_row,
+ unsigned char * uleft,
+ unsigned char * vleft,
+ int left_stride,
+ unsigned char * upred_ptr,
+ unsigned char * vpred_ptr,
+ int pred_stride);
+
+void vp8_init_intra_predictors(void);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_RECONINTRA_H_
diff --git a/thirdparty/libvpx/vp8/common/reconintra4x4.c b/thirdparty/libvpx/vp8/common/reconintra4x4.c
new file mode 100644
index 0000000000..35ad891eff
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/reconintra4x4.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+
+#include "vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vp8_rtcd.h"
+#include "blockd.h"
+
+typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left);
+
+static intra_pred_fn pred[10];
+
+void vp8_init_intra4x4_predictors_internal(void)
+{
+ pred[B_DC_PRED] = vpx_dc_predictor_4x4;
+ pred[B_TM_PRED] = vpx_tm_predictor_4x4;
+ pred[B_VE_PRED] = vpx_ve_predictor_4x4;
+ pred[B_HE_PRED] = vpx_he_predictor_4x4;
+ pred[B_LD_PRED] = vpx_d45e_predictor_4x4;
+ pred[B_RD_PRED] = vpx_d135_predictor_4x4;
+ pred[B_VR_PRED] = vpx_d117_predictor_4x4;
+ pred[B_VL_PRED] = vpx_d63f_predictor_4x4;
+ pred[B_HD_PRED] = vpx_d153_predictor_4x4;
+ pred[B_HU_PRED] = vpx_d207_predictor_4x4;
+}
+
+void vp8_intra4x4_predict(unsigned char *above,
+ unsigned char *yleft, int left_stride,
+ B_PREDICTION_MODE b_mode,
+ unsigned char *dst, int dst_stride,
+ unsigned char top_left)
+{
+ unsigned char Left[4];
+ unsigned char Aboveb[12], *Above = Aboveb + 4;
+
+ Left[0] = yleft[0];
+ Left[1] = yleft[left_stride];
+ Left[2] = yleft[2 * left_stride];
+ Left[3] = yleft[3 * left_stride];
+ memcpy(Above, above, 8);
+ Above[-1] = top_left;
+
+ pred[b_mode](dst, dst_stride, Above, Left);
+}
diff --git a/thirdparty/libvpx/vp8/common/reconintra4x4.h b/thirdparty/libvpx/vp8/common/reconintra4x4.h
new file mode 100644
index 0000000000..5dc5d13a5c
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/reconintra4x4.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_RECONINTRA4X4_H_
+#define VP8_COMMON_RECONINTRA4X4_H_
+#include "vp8/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE void intra_prediction_down_copy(MACROBLOCKD *xd,
+ unsigned char *above_right_src)
+{
+ int dst_stride = xd->dst.y_stride;
+ unsigned char *above_right_dst = xd->dst.y_buffer - dst_stride + 16;
+
+ unsigned int *src_ptr = (unsigned int *)above_right_src;
+ unsigned int *dst_ptr0 = (unsigned int *)(above_right_dst + 4 * dst_stride);
+ unsigned int *dst_ptr1 = (unsigned int *)(above_right_dst + 8 * dst_stride);
+ unsigned int *dst_ptr2 = (unsigned int *)(above_right_dst + 12 * dst_stride);
+
+ *dst_ptr0 = *src_ptr;
+ *dst_ptr1 = *src_ptr;
+ *dst_ptr2 = *src_ptr;
+}
+
+void vp8_intra4x4_predict(unsigned char *Above,
+ unsigned char *yleft, int left_stride,
+ B_PREDICTION_MODE b_mode,
+ unsigned char *dst, int dst_stride,
+ unsigned char top_left);
+
+void vp8_init_intra4x4_predictors_internal(void);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_RECONINTRA4X4_H_
diff --git a/thirdparty/libvpx/vp8/common/rtcd.c b/thirdparty/libvpx/vp8/common/rtcd.c
new file mode 100644
index 0000000000..ab0e9b47fe
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/rtcd.c
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#define RTCD_C
+#include "./vp8_rtcd.h"
+#include "vpx_ports/vpx_once.h"
+
+
+void vp8_rtcd()
+{
+ once(setup_rtcd_internal);
+}
diff --git a/thirdparty/libvpx/vp8/common/setupintrarecon.c b/thirdparty/libvpx/vp8/common/setupintrarecon.c
new file mode 100644
index 0000000000..669564db42
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/setupintrarecon.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "setupintrarecon.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf)
+{
+ int i;
+
+ /* set up frame new frame for intra coded blocks */
+ memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
+ for (i = 0; i < ybf->y_height; i++)
+ ybf->y_buffer[ybf->y_stride *i - 1] = (unsigned char) 129;
+
+ memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+ for (i = 0; i < ybf->uv_height; i++)
+ ybf->u_buffer[ybf->uv_stride *i - 1] = (unsigned char) 129;
+
+ memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+ for (i = 0; i < ybf->uv_height; i++)
+ ybf->v_buffer[ybf->uv_stride *i - 1] = (unsigned char) 129;
+
+}
+
+void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf)
+{
+ memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
+ memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+ memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+}
diff --git a/thirdparty/libvpx/vp8/common/setupintrarecon.h b/thirdparty/libvpx/vp8/common/setupintrarecon.h
new file mode 100644
index 0000000000..1857c4e26a
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/setupintrarecon.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_COMMON_SETUPINTRARECON_H_
+#define VP8_COMMON_SETUPINTRARECON_H_
+
+#include "./vpx_config.h"
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
+extern void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf);
+
+static INLINE void setup_intra_recon_left(unsigned char *y_buffer,
+ unsigned char *u_buffer,
+ unsigned char *v_buffer,
+ int y_stride,
+ int uv_stride)
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+ y_buffer[y_stride *i] = (unsigned char) 129;
+
+ for (i = 0; i < 8; i++)
+ u_buffer[uv_stride *i] = (unsigned char) 129;
+
+ for (i = 0; i < 8; i++)
+ v_buffer[uv_stride *i] = (unsigned char) 129;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_SETUPINTRARECON_H_
diff --git a/thirdparty/libvpx/vp8/common/swapyv12buffer.c b/thirdparty/libvpx/vp8/common/swapyv12buffer.c
new file mode 100644
index 0000000000..73656b3d72
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/swapyv12buffer.c
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "swapyv12buffer.h"
+
+void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame)
+{
+ unsigned char *temp;
+
+ temp = last_frame->buffer_alloc;
+ last_frame->buffer_alloc = new_frame->buffer_alloc;
+ new_frame->buffer_alloc = temp;
+
+ temp = last_frame->y_buffer;
+ last_frame->y_buffer = new_frame->y_buffer;
+ new_frame->y_buffer = temp;
+
+ temp = last_frame->u_buffer;
+ last_frame->u_buffer = new_frame->u_buffer;
+ new_frame->u_buffer = temp;
+
+ temp = last_frame->v_buffer;
+ last_frame->v_buffer = new_frame->v_buffer;
+ new_frame->v_buffer = temp;
+
+}
diff --git a/thirdparty/libvpx/vp8/common/swapyv12buffer.h b/thirdparty/libvpx/vp8/common/swapyv12buffer.h
new file mode 100644
index 0000000000..1d66cd3d62
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/swapyv12buffer.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_SWAPYV12BUFFER_H_
+#define VP8_COMMON_SWAPYV12BUFFER_H_
+
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_SWAPYV12BUFFER_H_
diff --git a/thirdparty/libvpx/vp8/common/systemdependent.h b/thirdparty/libvpx/vp8/common/systemdependent.h
new file mode 100644
index 0000000000..3d44e37cf2
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/systemdependent.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_COMMON_SYSTEMDEPENDENT_H_
+#define VP8_COMMON_SYSTEMDEPENDENT_H_
+
+#include "vpx_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8Common;
+void vp8_machine_specific_config(struct VP8Common *);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_SYSTEMDEPENDENT_H_
diff --git a/thirdparty/libvpx/vp8/common/threading.h b/thirdparty/libvpx/vp8/common/threading.h
new file mode 100644
index 0000000000..183b49b8ff
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/threading.h
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_THREADING_H_
+#define VP8_COMMON_THREADING_H_
+
+#include "./vpx_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
+
+/* Thread management macros */
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+/* Win32 */
+#include <process.h>
+#include <windows.h>
+#define THREAD_FUNCTION unsigned int __stdcall
+#define THREAD_FUNCTION_RETURN DWORD
+#define THREAD_SPECIFIC_INDEX DWORD
+#define pthread_t HANDLE
+#define pthread_attr_t DWORD
+#define pthread_detach(thread) if(thread!=NULL)CloseHandle(thread)
+#define thread_sleep(nms) Sleep(nms)
+#define pthread_cancel(thread) terminate_thread(thread,0)
+#define ts_key_create(ts_key, destructor) {ts_key = TlsAlloc();};
+#define pthread_getspecific(ts_key) TlsGetValue(ts_key)
+#define pthread_setspecific(ts_key, value) TlsSetValue(ts_key, (void *)value)
+#define pthread_self() GetCurrentThreadId()
+
+#elif defined(__OS2__)
+/* OS/2 */
+#define INCL_DOS
+#include <os2.h>
+
+#include <stdlib.h>
+#define THREAD_FUNCTION void *
+#define THREAD_FUNCTION_RETURN void *
+#define THREAD_SPECIFIC_INDEX PULONG
+#define pthread_t TID
+#define pthread_attr_t ULONG
+#define pthread_detach(thread) 0
+#define thread_sleep(nms) DosSleep(nms)
+#define pthread_cancel(thread) DosKillThread(thread)
+#define ts_key_create(ts_key, destructor) \
+ DosAllocThreadLocalMemory(1, &(ts_key));
+#define pthread_getspecific(ts_key) ((void *)(*(ts_key)))
+#define pthread_setspecific(ts_key, value) (*(ts_key)=(ULONG)(value))
+#define pthread_self() _gettid()
+#else
+#ifdef __APPLE__
+#include <mach/mach_init.h>
+#include <mach/semaphore.h>
+#include <mach/task.h>
+#include <time.h>
+#include <unistd.h>
+
+#else
+#include <semaphore.h>
+#endif
+
+#include <pthread.h>
+/* pthreads */
+/* Nearly everything is already defined */
+#define THREAD_FUNCTION void *
+#define THREAD_FUNCTION_RETURN void *
+#define THREAD_SPECIFIC_INDEX pthread_key_t
+#define ts_key_create(ts_key, destructor) pthread_key_create (&(ts_key), destructor);
+#endif
+
+/* Synchronization macros: Win32 and Pthreads */
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+#define sem_t HANDLE
+#define pause(voidpara) __asm PAUSE
+#define sem_init(sem, sem_attr1, sem_init_value) (int)((*sem = CreateSemaphore(NULL,0,32768,NULL))==NULL)
+#define sem_wait(sem) (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem,INFINITE))
+#define sem_post(sem) ReleaseSemaphore(*sem,1,NULL)
+#define sem_destroy(sem) if(*sem)((int)(CloseHandle(*sem))==TRUE)
+#define thread_sleep(nms) Sleep(nms)
+
+#elif defined(__OS2__)
+typedef struct
+{
+ HEV event;
+ HMTX wait_mutex;
+ HMTX count_mutex;
+ int count;
+} sem_t;
+
+static inline int sem_init(sem_t *sem, int pshared, unsigned int value)
+{
+ DosCreateEventSem(NULL, &sem->event, pshared ? DC_SEM_SHARED : 0,
+ value > 0 ? TRUE : FALSE);
+ DosCreateMutexSem(NULL, &sem->wait_mutex, 0, FALSE);
+ DosCreateMutexSem(NULL, &sem->count_mutex, 0, FALSE);
+
+ sem->count = value;
+
+ return 0;
+}
+
+static inline int sem_wait(sem_t * sem)
+{
+ DosRequestMutexSem(sem->wait_mutex, -1);
+
+ DosWaitEventSem(sem->event, -1);
+
+ DosRequestMutexSem(sem->count_mutex, -1);
+
+ sem->count--;
+ if (sem->count == 0)
+ {
+ ULONG post_count;
+
+ DosResetEventSem(sem->event, &post_count);
+ }
+
+ DosReleaseMutexSem(sem->count_mutex);
+
+ DosReleaseMutexSem(sem->wait_mutex);
+
+ return 0;
+}
+
+static inline int sem_post(sem_t * sem)
+{
+ DosRequestMutexSem(sem->count_mutex, -1);
+
+ if (sem->count < 32768)
+ {
+ sem->count++;
+ DosPostEventSem(sem->event);
+ }
+
+ DosReleaseMutexSem(sem->count_mutex);
+
+ return 0;
+}
+
+static inline int sem_destroy(sem_t * sem)
+{
+ DosCloseEventSem(sem->event);
+ DosCloseMutexSem(sem->wait_mutex);
+ DosCloseMutexSem(sem->count_mutex);
+
+ return 0;
+}
+
+#define thread_sleep(nms) DosSleep(nms)
+
+#else
+
+#ifdef __APPLE__
+#define sem_t semaphore_t
+#define sem_init(X,Y,Z) semaphore_create(mach_task_self(), X, SYNC_POLICY_FIFO, Z)
+#define sem_wait(sem) (semaphore_wait(*sem) )
+#define sem_post(sem) semaphore_signal(*sem)
+#define sem_destroy(sem) semaphore_destroy(mach_task_self(),*sem)
+#define thread_sleep(nms) /* { struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */
+#else
+#include <unistd.h>
+#include <sched.h>
+#define thread_sleep(nms) sched_yield();/* {struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */
+#endif
+/* Not Windows. Assume pthreads */
+
+#endif
+
+#if ARCH_X86 || ARCH_X86_64
+#include "vpx_ports/x86.h"
+#else
+#define x86_pause_hint()
+#endif
+
+#include "vpx_util/vpx_thread.h"
+
+static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
+ const int kMaxTryLocks = 4000;
+ int locked = 0;
+ int i;
+
+ for (i = 0; i < kMaxTryLocks; ++i) {
+ if (!pthread_mutex_trylock(mutex)) {
+ locked = 1;
+ break;
+ }
+ }
+
+ if (!locked)
+ pthread_mutex_lock(mutex);
+}
+
+static INLINE int protected_read(pthread_mutex_t *const mutex, const int *p) {
+ int ret;
+ mutex_lock(mutex);
+ ret = *p;
+ pthread_mutex_unlock(mutex);
+ return ret;
+}
+
+static INLINE void sync_read(pthread_mutex_t *const mutex, int mb_col,
+ const int *last_row_current_mb_col,
+ const int nsync) {
+ while (mb_col > (protected_read(mutex, last_row_current_mb_col) - nsync)) {
+ x86_pause_hint();
+ thread_sleep(0);
+ }
+}
+
+static INLINE void protected_write(pthread_mutex_t *mutex, int *p, int v) {
+ mutex_lock(mutex);
+ *p = v;
+ pthread_mutex_unlock(mutex);
+}
+
+#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_THREADING_H_
diff --git a/thirdparty/libvpx/vp8/common/treecoder.c b/thirdparty/libvpx/vp8/common/treecoder.c
new file mode 100644
index 0000000000..d80c64bdfa
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/treecoder.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#if CONFIG_DEBUG
+#include <assert.h>
+#endif
+#include <stdio.h>
+
+#include "treecoder.h"
+
+static void tree2tok(
+ struct vp8_token_struct *const p,
+ vp8_tree t,
+ int i,
+ int v,
+ int L
+)
+{
+ v += v;
+ ++L;
+
+ do
+ {
+ const vp8_tree_index j = t[i++];
+
+ if (j <= 0)
+ {
+ p[-j].value = v;
+ p[-j].Len = L;
+ }
+ else
+ tree2tok(p, t, j, v, L);
+ }
+ while (++v & 1);
+}
+
+void vp8_tokens_from_tree(struct vp8_token_struct *p, vp8_tree t)
+{
+ tree2tok(p, t, 0, 0, 0);
+}
+
+void vp8_tokens_from_tree_offset(struct vp8_token_struct *p, vp8_tree t,
+ int offset)
+{
+ tree2tok(p - offset, t, 0, 0, 0);
+}
+
+static void branch_counts(
+ int n, /* n = size of alphabet */
+ vp8_token tok [ /* n */ ],
+ vp8_tree tree,
+ unsigned int branch_ct [ /* n-1 */ ] [2],
+ const unsigned int num_events[ /* n */ ]
+)
+{
+ const int tree_len = n - 1;
+ int t = 0;
+
+#if CONFIG_DEBUG
+ assert(tree_len);
+#endif
+
+ do
+ {
+ branch_ct[t][0] = branch_ct[t][1] = 0;
+ }
+ while (++t < tree_len);
+
+ t = 0;
+
+ do
+ {
+ int L = tok[t].Len;
+ const int enc = tok[t].value;
+ const unsigned int ct = num_events[t];
+
+ vp8_tree_index i = 0;
+
+ do
+ {
+ const int b = (enc >> --L) & 1;
+ const int j = i >> 1;
+#if CONFIG_DEBUG
+ assert(j < tree_len && 0 <= L);
+#endif
+
+ branch_ct [j] [b] += ct;
+ i = tree[ i + b];
+ }
+ while (i > 0);
+
+#if CONFIG_DEBUG
+ assert(!L);
+#endif
+ }
+ while (++t < n);
+
+}
+
+
+void vp8_tree_probs_from_distribution(
+ int n, /* n = size of alphabet */
+ vp8_token tok [ /* n */ ],
+ vp8_tree tree,
+ vp8_prob probs [ /* n-1 */ ],
+ unsigned int branch_ct [ /* n-1 */ ] [2],
+ const unsigned int num_events[ /* n */ ],
+ unsigned int Pfac,
+ int rd
+)
+{
+ const int tree_len = n - 1;
+ int t = 0;
+
+ branch_counts(n, tok, tree, branch_ct, num_events);
+
+ do
+ {
+ const unsigned int *const c = branch_ct[t];
+ const unsigned int tot = c[0] + c[1];
+
+#if CONFIG_DEBUG
+ assert(tot < (1 << 24)); /* no overflow below */
+#endif
+
+ if (tot)
+ {
+ const unsigned int p = ((c[0] * Pfac) + (rd ? tot >> 1 : 0)) / tot;
+ probs[t] = p < 256 ? (p ? p : 1) : 255; /* agree w/old version for now */
+ }
+ else
+ probs[t] = vp8_prob_half;
+ }
+ while (++t < tree_len);
+}
diff --git a/thirdparty/libvpx/vp8/common/treecoder.h b/thirdparty/libvpx/vp8/common/treecoder.h
new file mode 100644
index 0000000000..d22b7c570c
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/treecoder.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_TREECODER_H_
+#define VP8_COMMON_TREECODER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef unsigned char vp8bc_index_t; /* probability index */
+
+
+typedef unsigned char vp8_prob;
+
+#define vp8_prob_half ( (vp8_prob) 128)
+
+typedef signed char vp8_tree_index;
+struct bool_coder_spec;
+
+typedef struct bool_coder_spec bool_coder_spec;
+typedef struct bool_writer bool_writer;
+typedef struct bool_reader bool_reader;
+
+typedef const bool_coder_spec c_bool_coder_spec;
+typedef const bool_writer c_bool_writer;
+typedef const bool_reader c_bool_reader;
+
+
+
+# define vp8_complement( x) (255 - x)
+
+
+/* We build coding trees compactly in arrays.
+ Each node of the tree is a pair of vp8_tree_indices.
+ Array index often references a corresponding probability table.
+ Index <= 0 means done encoding/decoding and value = -Index,
+ Index > 0 means need another bit, specification at index.
+ Nonnegative indices are always even; processing begins at node 0. */
+
+typedef const vp8_tree_index vp8_tree[], *vp8_tree_p;
+
+
+typedef const struct vp8_token_struct
+{
+ int value;
+ int Len;
+} vp8_token;
+
+/* Construct encoding array from tree. */
+
+void vp8_tokens_from_tree(struct vp8_token_struct *, vp8_tree);
+void vp8_tokens_from_tree_offset(struct vp8_token_struct *, vp8_tree,
+ int offset);
+
+
+/* Convert array of token occurrence counts into a table of probabilities
+ for the associated binary encoding tree. Also writes count of branches
+ taken for each node on the tree; this facilitiates decisions as to
+ probability updates. */
+
+void vp8_tree_probs_from_distribution(
+ int n, /* n = size of alphabet */
+ vp8_token tok [ /* n */ ],
+ vp8_tree tree,
+ vp8_prob probs [ /* n-1 */ ],
+ unsigned int branch_ct [ /* n-1 */ ] [2],
+ const unsigned int num_events[ /* n */ ],
+ unsigned int Pfactor,
+ int Round
+);
+
+/* Variant of above using coder spec rather than hardwired 8-bit probs. */
+
+void vp8bc_tree_probs_from_distribution(
+ int n, /* n = size of alphabet */
+ vp8_token tok [ /* n */ ],
+ vp8_tree tree,
+ vp8_prob probs [ /* n-1 */ ],
+ unsigned int branch_ct [ /* n-1 */ ] [2],
+ const unsigned int num_events[ /* n */ ],
+ c_bool_coder_spec *s
+);
+
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_TREECODER_H_
diff --git a/thirdparty/libvpx/vp8/common/vp8_entropymodedata.h b/thirdparty/libvpx/vp8/common/vp8_entropymodedata.h
new file mode 100644
index 0000000000..c4aed49897
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/vp8_entropymodedata.h
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+*/
+
+#ifndef VP8_COMMON_VP8_ENTROPYMODEDATA_H_
+#define VP8_COMMON_VP8_ENTROPYMODEDATA_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*Generated file, included by entropymode.c*/
+
+
+const struct vp8_token_struct vp8_bmode_encodings[VP8_BINTRAMODES] =
+{
+ { 0, 1 },
+ { 2, 2 },
+ { 6, 3 },
+ { 28, 5 },
+ { 30, 5 },
+ { 58, 6 },
+ { 59, 6 },
+ { 62, 6 },
+ { 126, 7 },
+ { 127, 7 }
+};
+
+const struct vp8_token_struct vp8_ymode_encodings[VP8_YMODES] =
+{
+ { 0, 1 },
+ { 4, 3 },
+ { 5, 3 },
+ { 6, 3 },
+ { 7, 3 }
+};
+
+const struct vp8_token_struct vp8_kf_ymode_encodings[VP8_YMODES] =
+{
+ { 4, 3 },
+ { 5, 3 },
+ { 6, 3 },
+ { 7, 3 },
+ { 0, 1 }
+};
+
+const struct vp8_token_struct vp8_uv_mode_encodings[VP8_UV_MODES] =
+{
+ { 0, 1 },
+ { 2, 2 },
+ { 6, 3 },
+ { 7, 3 }
+};
+
+const struct vp8_token_struct vp8_mbsplit_encodings[VP8_NUMMBSPLITS] =
+{
+ { 6, 3 },
+ { 7, 3 },
+ { 2, 2 },
+ { 0, 1 }
+};
+
+const struct vp8_token_struct vp8_mv_ref_encoding_array[VP8_MVREFS] =
+{
+ { 2, 2 },
+ { 6, 3 },
+ { 0, 1 },
+ { 14, 4 },
+ { 15, 4 }
+};
+
+const struct vp8_token_struct vp8_sub_mv_ref_encoding_array[VP8_SUBMVREFS] =
+{
+ { 0, 1 },
+ { 2, 2 },
+ { 6, 3 },
+ { 7, 3 }
+};
+
+const struct vp8_token_struct vp8_small_mvencodings[8] =
+{
+ { 0, 3 },
+ { 1, 3 },
+ { 2, 3 },
+ { 3, 3 },
+ { 4, 3 },
+ { 5, 3 },
+ { 6, 3 },
+ { 7, 3 }
+};
+
+const vp8_prob vp8_ymode_prob[VP8_YMODES-1] =
+{
+ 112, 86, 140, 37
+};
+
+const vp8_prob vp8_kf_ymode_prob[VP8_YMODES-1] =
+{
+ 145, 156, 163, 128
+};
+
+const vp8_prob vp8_uv_mode_prob[VP8_UV_MODES-1] =
+{
+ 162, 101, 204
+};
+
+const vp8_prob vp8_kf_uv_mode_prob[VP8_UV_MODES-1] =
+{
+ 142, 114, 183
+};
+
+const vp8_prob vp8_bmode_prob[VP8_BINTRAMODES-1] =
+{
+ 120, 90, 79, 133, 87, 85, 80, 111, 151
+};
+
+
+
+const vp8_prob vp8_kf_bmode_prob
+[VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1] =
+{
+ {
+ { 231, 120, 48, 89, 115, 113, 120, 152, 112 },
+ { 152, 179, 64, 126, 170, 118, 46, 70, 95 },
+ { 175, 69, 143, 80, 85, 82, 72, 155, 103 },
+ { 56, 58, 10, 171, 218, 189, 17, 13, 152 },
+ { 144, 71, 10, 38, 171, 213, 144, 34, 26 },
+ { 114, 26, 17, 163, 44, 195, 21, 10, 173 },
+ { 121, 24, 80, 195, 26, 62, 44, 64, 85 },
+ { 170, 46, 55, 19, 136, 160, 33, 206, 71 },
+ { 63, 20, 8, 114, 114, 208, 12, 9, 226 },
+ { 81, 40, 11, 96, 182, 84, 29, 16, 36 }
+ },
+ {
+ { 134, 183, 89, 137, 98, 101, 106, 165, 148 },
+ { 72, 187, 100, 130, 157, 111, 32, 75, 80 },
+ { 66, 102, 167, 99, 74, 62, 40, 234, 128 },
+ { 41, 53, 9, 178, 241, 141, 26, 8, 107 },
+ { 104, 79, 12, 27, 217, 255, 87, 17, 7 },
+ { 74, 43, 26, 146, 73, 166, 49, 23, 157 },
+ { 65, 38, 105, 160, 51, 52, 31, 115, 128 },
+ { 87, 68, 71, 44, 114, 51, 15, 186, 23 },
+ { 47, 41, 14, 110, 182, 183, 21, 17, 194 },
+ { 66, 45, 25, 102, 197, 189, 23, 18, 22 }
+ },
+ {
+ { 88, 88, 147, 150, 42, 46, 45, 196, 205 },
+ { 43, 97, 183, 117, 85, 38, 35, 179, 61 },
+ { 39, 53, 200, 87, 26, 21, 43, 232, 171 },
+ { 56, 34, 51, 104, 114, 102, 29, 93, 77 },
+ { 107, 54, 32, 26, 51, 1, 81, 43, 31 },
+ { 39, 28, 85, 171, 58, 165, 90, 98, 64 },
+ { 34, 22, 116, 206, 23, 34, 43, 166, 73 },
+ { 68, 25, 106, 22, 64, 171, 36, 225, 114 },
+ { 34, 19, 21, 102, 132, 188, 16, 76, 124 },
+ { 62, 18, 78, 95, 85, 57, 50, 48, 51 }
+ },
+ {
+ { 193, 101, 35, 159, 215, 111, 89, 46, 111 },
+ { 60, 148, 31, 172, 219, 228, 21, 18, 111 },
+ { 112, 113, 77, 85, 179, 255, 38, 120, 114 },
+ { 40, 42, 1, 196, 245, 209, 10, 25, 109 },
+ { 100, 80, 8, 43, 154, 1, 51, 26, 71 },
+ { 88, 43, 29, 140, 166, 213, 37, 43, 154 },
+ { 61, 63, 30, 155, 67, 45, 68, 1, 209 },
+ { 142, 78, 78, 16, 255, 128, 34, 197, 171 },
+ { 41, 40, 5, 102, 211, 183, 4, 1, 221 },
+ { 51, 50, 17, 168, 209, 192, 23, 25, 82 }
+ },
+ {
+ { 125, 98, 42, 88, 104, 85, 117, 175, 82 },
+ { 95, 84, 53, 89, 128, 100, 113, 101, 45 },
+ { 75, 79, 123, 47, 51, 128, 81, 171, 1 },
+ { 57, 17, 5, 71, 102, 57, 53, 41, 49 },
+ { 115, 21, 2, 10, 102, 255, 166, 23, 6 },
+ { 38, 33, 13, 121, 57, 73, 26, 1, 85 },
+ { 41, 10, 67, 138, 77, 110, 90, 47, 114 },
+ { 101, 29, 16, 10, 85, 128, 101, 196, 26 },
+ { 57, 18, 10, 102, 102, 213, 34, 20, 43 },
+ { 117, 20, 15, 36, 163, 128, 68, 1, 26 }
+ },
+ {
+ { 138, 31, 36, 171, 27, 166, 38, 44, 229 },
+ { 67, 87, 58, 169, 82, 115, 26, 59, 179 },
+ { 63, 59, 90, 180, 59, 166, 93, 73, 154 },
+ { 40, 40, 21, 116, 143, 209, 34, 39, 175 },
+ { 57, 46, 22, 24, 128, 1, 54, 17, 37 },
+ { 47, 15, 16, 183, 34, 223, 49, 45, 183 },
+ { 46, 17, 33, 183, 6, 98, 15, 32, 183 },
+ { 65, 32, 73, 115, 28, 128, 23, 128, 205 },
+ { 40, 3, 9, 115, 51, 192, 18, 6, 223 },
+ { 87, 37, 9, 115, 59, 77, 64, 21, 47 }
+ },
+ {
+ { 104, 55, 44, 218, 9, 54, 53, 130, 226 },
+ { 64, 90, 70, 205, 40, 41, 23, 26, 57 },
+ { 54, 57, 112, 184, 5, 41, 38, 166, 213 },
+ { 30, 34, 26, 133, 152, 116, 10, 32, 134 },
+ { 75, 32, 12, 51, 192, 255, 160, 43, 51 },
+ { 39, 19, 53, 221, 26, 114, 32, 73, 255 },
+ { 31, 9, 65, 234, 2, 15, 1, 118, 73 },
+ { 88, 31, 35, 67, 102, 85, 55, 186, 85 },
+ { 56, 21, 23, 111, 59, 205, 45, 37, 192 },
+ { 55, 38, 70, 124, 73, 102, 1, 34, 98 }
+ },
+ {
+ { 102, 61, 71, 37, 34, 53, 31, 243, 192 },
+ { 69, 60, 71, 38, 73, 119, 28, 222, 37 },
+ { 68, 45, 128, 34, 1, 47, 11, 245, 171 },
+ { 62, 17, 19, 70, 146, 85, 55, 62, 70 },
+ { 75, 15, 9, 9, 64, 255, 184, 119, 16 },
+ { 37, 43, 37, 154, 100, 163, 85, 160, 1 },
+ { 63, 9, 92, 136, 28, 64, 32, 201, 85 },
+ { 86, 6, 28, 5, 64, 255, 25, 248, 1 },
+ { 56, 8, 17, 132, 137, 255, 55, 116, 128 },
+ { 58, 15, 20, 82, 135, 57, 26, 121, 40 }
+ },
+ {
+ { 164, 50, 31, 137, 154, 133, 25, 35, 218 },
+ { 51, 103, 44, 131, 131, 123, 31, 6, 158 },
+ { 86, 40, 64, 135, 148, 224, 45, 183, 128 },
+ { 22, 26, 17, 131, 240, 154, 14, 1, 209 },
+ { 83, 12, 13, 54, 192, 255, 68, 47, 28 },
+ { 45, 16, 21, 91, 64, 222, 7, 1, 197 },
+ { 56, 21, 39, 155, 60, 138, 23, 102, 213 },
+ { 85, 26, 85, 85, 128, 128, 32, 146, 171 },
+ { 18, 11, 7, 63, 144, 171, 4, 4, 246 },
+ { 35, 27, 10, 146, 174, 171, 12, 26, 128 }
+ },
+ {
+ { 190, 80, 35, 99, 180, 80, 126, 54, 45 },
+ { 85, 126, 47, 87, 176, 51, 41, 20, 32 },
+ { 101, 75, 128, 139, 118, 146, 116, 128, 85 },
+ { 56, 41, 15, 176, 236, 85, 37, 9, 62 },
+ { 146, 36, 19, 30, 171, 255, 97, 27, 20 },
+ { 71, 30, 17, 119, 118, 255, 17, 18, 138 },
+ { 101, 38, 60, 138, 55, 70, 43, 26, 142 },
+ { 138, 45, 61, 62, 219, 1, 81, 188, 64 },
+ { 32, 41, 20, 117, 151, 142, 20, 21, 163 },
+ { 112, 19, 12, 61, 195, 128, 48, 4, 24 }
+ }
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_VP8_ENTROPYMODEDATA_H_
diff --git a/thirdparty/libvpx/vp8/common/vp8_loopfilter.c b/thirdparty/libvpx/vp8/common/vp8_loopfilter.c
new file mode 100644
index 0000000000..756ad488f9
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/vp8_loopfilter.c
@@ -0,0 +1,661 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "loopfilter.h"
+#include "onyxc_int.h"
+#include "vpx_mem/vpx_mem.h"
+
+
+static void lf_init_lut(loop_filter_info_n *lfi)
+{
+ int filt_lvl;
+
+ for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++)
+ {
+ if (filt_lvl >= 40)
+ {
+ lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2;
+ lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3;
+ }
+ else if (filt_lvl >= 20)
+ {
+ lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+ lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2;
+ }
+ else if (filt_lvl >= 15)
+ {
+ lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+ lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1;
+ }
+ else
+ {
+ lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0;
+ lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0;
+ }
+ }
+
+ lfi->mode_lf_lut[DC_PRED] = 1;
+ lfi->mode_lf_lut[V_PRED] = 1;
+ lfi->mode_lf_lut[H_PRED] = 1;
+ lfi->mode_lf_lut[TM_PRED] = 1;
+ lfi->mode_lf_lut[B_PRED] = 0;
+
+ lfi->mode_lf_lut[ZEROMV] = 1;
+ lfi->mode_lf_lut[NEARESTMV] = 2;
+ lfi->mode_lf_lut[NEARMV] = 2;
+ lfi->mode_lf_lut[NEWMV] = 2;
+ lfi->mode_lf_lut[SPLITMV] = 3;
+
+}
+
+void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+ int sharpness_lvl)
+{
+ int i;
+
+ /* For each possible value for the loop filter fill out limits */
+ for (i = 0; i <= MAX_LOOP_FILTER; i++)
+ {
+ int filt_lvl = i;
+ int block_inside_limit = 0;
+
+ /* Set loop filter paramaeters that control sharpness. */
+ block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
+ block_inside_limit = block_inside_limit >> (sharpness_lvl > 4);
+
+ if (sharpness_lvl > 0)
+ {
+ if (block_inside_limit > (9 - sharpness_lvl))
+ block_inside_limit = (9 - sharpness_lvl);
+ }
+
+ if (block_inside_limit < 1)
+ block_inside_limit = 1;
+
+ memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
+ memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit), SIMD_WIDTH);
+ memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
+ SIMD_WIDTH);
+ }
+}
+
+void vp8_loop_filter_init(VP8_COMMON *cm)
+{
+ loop_filter_info_n *lfi = &cm->lf_info;
+ int i;
+
+ /* init limits for given sharpness*/
+ vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+ cm->last_sharpness_level = cm->sharpness_level;
+
+ /* init LUT for lvl and hev thr picking */
+ lf_init_lut(lfi);
+
+ /* init hev threshold const vectors */
+ for(i = 0; i < 4 ; i++)
+ {
+ memset(lfi->hev_thr[i], i, SIMD_WIDTH);
+ }
+}
+
+void vp8_loop_filter_frame_init(VP8_COMMON *cm,
+ MACROBLOCKD *mbd,
+ int default_filt_lvl)
+{
+ int seg, /* segment number */
+ ref, /* index in ref_lf_deltas */
+ mode; /* index in mode_lf_deltas */
+
+ loop_filter_info_n *lfi = &cm->lf_info;
+
+ /* update limits if sharpness has changed */
+ if(cm->last_sharpness_level != cm->sharpness_level)
+ {
+ vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+ cm->last_sharpness_level = cm->sharpness_level;
+ }
+
+ for(seg = 0; seg < MAX_MB_SEGMENTS; seg++)
+ {
+ int lvl_seg = default_filt_lvl;
+ int lvl_ref, lvl_mode;
+
+ /* Note the baseline filter values for each segment */
+ if (mbd->segmentation_enabled)
+ {
+ /* Abs value */
+ if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+ {
+ lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
+ }
+ else /* Delta Value */
+ {
+ lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
+ }
+ lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63: lvl_seg) : 0;
+ }
+
+ if (!mbd->mode_ref_lf_delta_enabled)
+ {
+ /* we could get rid of this if we assume that deltas are set to
+ * zero when not in use; encoder always uses deltas
+ */
+ memset(lfi->lvl[seg][0], lvl_seg, 4 * 4 );
+ continue;
+ }
+
+ /* INTRA_FRAME */
+ ref = INTRA_FRAME;
+
+ /* Apply delta for reference frame */
+ lvl_ref = lvl_seg + mbd->ref_lf_deltas[ref];
+
+ /* Apply delta for Intra modes */
+ mode = 0; /* B_PRED */
+ /* Only the split mode BPRED has a further special case */
+ lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode];
+ /* clamp */
+ lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0;
+
+ lfi->lvl[seg][ref][mode] = lvl_mode;
+
+ mode = 1; /* all the rest of Intra modes */
+ /* clamp */
+ lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref) : 0;
+ lfi->lvl[seg][ref][mode] = lvl_mode;
+
+ /* LAST, GOLDEN, ALT */
+ for(ref = 1; ref < MAX_REF_FRAMES; ref++)
+ {
+ /* Apply delta for reference frame */
+ lvl_ref = lvl_seg + mbd->ref_lf_deltas[ref];
+
+ /* Apply delta for Inter modes */
+ for (mode = 1; mode < 4; mode++)
+ {
+ lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode];
+ /* clamp */
+ lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0;
+
+ lfi->lvl[seg][ref][mode] = lvl_mode;
+ }
+ }
+ }
+}
+
+
+void vp8_loop_filter_row_normal(VP8_COMMON *cm, MODE_INFO *mode_info_context,
+ int mb_row, int post_ystride, int post_uvstride,
+ unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr)
+{
+ int mb_col;
+ int filter_level;
+ loop_filter_info_n *lfi_n = &cm->lf_info;
+ loop_filter_info lfi;
+ FRAME_TYPE frame_type = cm->frame_type;
+
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+ if (filter_level)
+ {
+ const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+ lfi.mblim = lfi_n->mblim[filter_level];
+ lfi.blim = lfi_n->blim[filter_level];
+ lfi.lim = lfi_n->lim[filter_level];
+ lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+ if (mb_col > 0)
+ vp8_loop_filter_mbv
+ (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bv
+ (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_mbh
+ (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bh
+ (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+ }
+
+ y_ptr += 16;
+ u_ptr += 8;
+ v_ptr += 8;
+
+ mode_info_context++; /* step to next MB */
+ }
+
+}
+
+void vp8_loop_filter_row_simple(VP8_COMMON *cm, MODE_INFO *mode_info_context,
+ int mb_row, int post_ystride, int post_uvstride,
+ unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr)
+{
+ int mb_col;
+ int filter_level;
+ loop_filter_info_n *lfi_n = &cm->lf_info;
+ (void)post_uvstride;
+
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+ if (filter_level)
+ {
+ if (mb_col > 0)
+ vp8_loop_filter_simple_mbv
+ (y_ptr, post_ystride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bv
+ (y_ptr, post_ystride, lfi_n->blim[filter_level]);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_simple_mbh
+ (y_ptr, post_ystride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bh
+ (y_ptr, post_ystride, lfi_n->blim[filter_level]);
+ }
+
+ y_ptr += 16;
+ u_ptr += 8;
+ v_ptr += 8;
+
+ mode_info_context++; /* step to next MB */
+ }
+
+}
+void vp8_loop_filter_frame(VP8_COMMON *cm,
+ MACROBLOCKD *mbd,
+ int frame_type)
+{
+ YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+ loop_filter_info_n *lfi_n = &cm->lf_info;
+ loop_filter_info lfi;
+
+ int mb_row;
+ int mb_col;
+ int mb_rows = cm->mb_rows;
+ int mb_cols = cm->mb_cols;
+
+ int filter_level;
+
+ unsigned char *y_ptr, *u_ptr, *v_ptr;
+
+ /* Point at base of Mb MODE_INFO list */
+ const MODE_INFO *mode_info_context = cm->mi;
+ int post_y_stride = post->y_stride;
+ int post_uv_stride = post->uv_stride;
+
+ /* Initialize the loop filter for this frame. */
+ vp8_loop_filter_frame_init(cm, mbd, cm->filter_level);
+
+ /* Set up the buffer pointers */
+ y_ptr = post->y_buffer;
+ u_ptr = post->u_buffer;
+ v_ptr = post->v_buffer;
+
+ /* vp8_filter each macro block */
+ if (cm->filter_type == NORMAL_LOOPFILTER)
+ {
+ for (mb_row = 0; mb_row < mb_rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < mb_cols; mb_col++)
+ {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+ if (filter_level)
+ {
+ const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+ lfi.mblim = lfi_n->mblim[filter_level];
+ lfi.blim = lfi_n->blim[filter_level];
+ lfi.lim = lfi_n->lim[filter_level];
+ lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+ if (mb_col > 0)
+ vp8_loop_filter_mbv
+ (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bv
+ (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_mbh
+ (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bh
+ (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
+ }
+
+ y_ptr += 16;
+ u_ptr += 8;
+ v_ptr += 8;
+
+ mode_info_context++; /* step to next MB */
+ }
+ y_ptr += post_y_stride * 16 - post->y_width;
+ u_ptr += post_uv_stride * 8 - post->uv_width;
+ v_ptr += post_uv_stride * 8 - post->uv_width;
+
+ mode_info_context++; /* Skip border mb */
+
+ }
+ }
+ else /* SIMPLE_LOOPFILTER */
+ {
+ for (mb_row = 0; mb_row < mb_rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < mb_cols; mb_col++)
+ {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+ if (filter_level)
+ {
+ const unsigned char * mblim = lfi_n->mblim[filter_level];
+ const unsigned char * blim = lfi_n->blim[filter_level];
+
+ if (mb_col > 0)
+ vp8_loop_filter_simple_mbv
+ (y_ptr, post_y_stride, mblim);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bv
+ (y_ptr, post_y_stride, blim);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_simple_mbh
+ (y_ptr, post_y_stride, mblim);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bh
+ (y_ptr, post_y_stride, blim);
+ }
+
+ y_ptr += 16;
+ u_ptr += 8;
+ v_ptr += 8;
+
+ mode_info_context++; /* step to next MB */
+ }
+ y_ptr += post_y_stride * 16 - post->y_width;
+ u_ptr += post_uv_stride * 8 - post->uv_width;
+ v_ptr += post_uv_stride * 8 - post->uv_width;
+
+ mode_info_context++; /* Skip border mb */
+
+ }
+ }
+}
+
+void vp8_loop_filter_frame_yonly
+(
+ VP8_COMMON *cm,
+ MACROBLOCKD *mbd,
+ int default_filt_lvl
+)
+{
+ YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+
+ unsigned char *y_ptr;
+ int mb_row;
+ int mb_col;
+
+ loop_filter_info_n *lfi_n = &cm->lf_info;
+ loop_filter_info lfi;
+
+ int filter_level;
+ FRAME_TYPE frame_type = cm->frame_type;
+
+ /* Point at base of Mb MODE_INFO list */
+ const MODE_INFO *mode_info_context = cm->mi;
+
+#if 0
+ if(default_filt_lvl == 0) /* no filter applied */
+ return;
+#endif
+
+ /* Initialize the loop filter for this frame. */
+ vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl);
+
+ /* Set up the buffer pointers */
+ y_ptr = post->y_buffer;
+
+ /* vp8_filter each macro block */
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+ if (filter_level)
+ {
+ if (cm->filter_type == NORMAL_LOOPFILTER)
+ {
+ const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+ lfi.mblim = lfi_n->mblim[filter_level];
+ lfi.blim = lfi_n->blim[filter_level];
+ lfi.lim = lfi_n->lim[filter_level];
+ lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+ if (mb_col > 0)
+ vp8_loop_filter_mbv
+ (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bv
+ (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_mbh
+ (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bh
+ (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+ }
+ else
+ {
+ if (mb_col > 0)
+ vp8_loop_filter_simple_mbv
+ (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bv
+ (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_simple_mbh
+ (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bh
+ (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+ }
+ }
+
+ y_ptr += 16;
+ mode_info_context ++; /* step to next MB */
+
+ }
+
+ y_ptr += post->y_stride * 16 - post->y_width;
+ mode_info_context ++; /* Skip border mb */
+ }
+
+}
+
+void vp8_loop_filter_partial_frame
+(
+ VP8_COMMON *cm,
+ MACROBLOCKD *mbd,
+ int default_filt_lvl
+)
+{
+ YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+
+ unsigned char *y_ptr;
+ int mb_row;
+ int mb_col;
+ int mb_cols = post->y_width >> 4;
+ int mb_rows = post->y_height >> 4;
+
+ int linestocopy;
+
+ loop_filter_info_n *lfi_n = &cm->lf_info;
+ loop_filter_info lfi;
+
+ int filter_level;
+ FRAME_TYPE frame_type = cm->frame_type;
+
+ const MODE_INFO *mode_info_context;
+
+#if 0
+ if(default_filt_lvl == 0) /* no filter applied */
+ return;
+#endif
+
+ /* Initialize the loop filter for this frame. */
+ vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl);
+
+ /* number of MB rows to use in partial filtering */
+ linestocopy = mb_rows / PARTIAL_FRAME_FRACTION;
+ linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */
+
+ /* Set up the buffer pointers; partial image starts at ~middle of frame */
+ y_ptr = post->y_buffer + ((post->y_height >> 5) * 16) * post->y_stride;
+ mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1);
+
+ /* vp8_filter each macro block */
+ for (mb_row = 0; mb_row<(linestocopy >> 4); mb_row++)
+ {
+ for (mb_col = 0; mb_col < mb_cols; mb_col++)
+ {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index =
+ lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+ if (filter_level)
+ {
+ if (cm->filter_type == NORMAL_LOOPFILTER)
+ {
+ const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+ lfi.mblim = lfi_n->mblim[filter_level];
+ lfi.blim = lfi_n->blim[filter_level];
+ lfi.lim = lfi_n->lim[filter_level];
+ lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+ if (mb_col > 0)
+ vp8_loop_filter_mbv
+ (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bv
+ (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ vp8_loop_filter_mbh
+ (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bh
+ (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+ }
+ else
+ {
+ if (mb_col > 0)
+ vp8_loop_filter_simple_mbv
+ (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bv
+ (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+
+ vp8_loop_filter_simple_mbh
+ (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bh
+ (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+ }
+ }
+
+ y_ptr += 16;
+ mode_info_context += 1; /* step to next MB */
+ }
+
+ y_ptr += post->y_stride * 16 - post->y_width;
+ mode_info_context += 1; /* Skip border mb */
+ }
+}
diff --git a/thirdparty/libvpx/vp8/common/x86/copy_sse2.asm b/thirdparty/libvpx/vp8/common/x86/copy_sse2.asm
new file mode 100644
index 0000000000..86fae26956
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/x86/copy_sse2.asm
@@ -0,0 +1,93 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp8_copy32xn_sse2(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *dst_ptr,
+; int dst_stride,
+; int height);
+global sym(vp8_copy32xn_sse2) PRIVATE
+sym(vp8_copy32xn_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;dst_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;dst_stride
+ movsxd rcx, dword ptr arg(4) ;height
+
+.block_copy_sse2_loopx4:
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi + 16]
+ movdqu xmm2, XMMWORD PTR [rsi + rax]
+ movdqu xmm3, XMMWORD PTR [rsi + rax + 16]
+
+ lea rsi, [rsi+rax*2]
+
+ movdqu xmm4, XMMWORD PTR [rsi]
+ movdqu xmm5, XMMWORD PTR [rsi + 16]
+ movdqu xmm6, XMMWORD PTR [rsi + rax]
+ movdqu xmm7, XMMWORD PTR [rsi + rax + 16]
+
+ lea rsi, [rsi+rax*2]
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi + 16], xmm1
+ movdqa XMMWORD PTR [rdi + rdx], xmm2
+ movdqa XMMWORD PTR [rdi + rdx + 16], xmm3
+
+ lea rdi, [rdi+rdx*2]
+
+ movdqa XMMWORD PTR [rdi], xmm4
+ movdqa XMMWORD PTR [rdi + 16], xmm5
+ movdqa XMMWORD PTR [rdi + rdx], xmm6
+ movdqa XMMWORD PTR [rdi + rdx + 16], xmm7
+
+ lea rdi, [rdi+rdx*2]
+
+ sub rcx, 4
+ cmp rcx, 4
+ jge .block_copy_sse2_loopx4
+
+ cmp rcx, 0
+ je .copy_is_done
+
+.block_copy_sse2_loop:
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi + 16]
+ lea rsi, [rsi+rax]
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi + 16], xmm1
+ lea rdi, [rdi+rdx]
+
+ sub rcx, 1
+ jne .block_copy_sse2_loop
+
+.copy_is_done:
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/thirdparty/libvpx/vp8/common/x86/copy_sse3.asm b/thirdparty/libvpx/vp8/common/x86/copy_sse3.asm
new file mode 100644
index 0000000000..d789a40ccf
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/x86/copy_sse3.asm
@@ -0,0 +1,146 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro STACK_FRAME_CREATE_X3 0
+%if ABI_IS_32BIT
+ %define src_ptr rsi
+ %define src_stride rax
+ %define ref_ptr rdi
+ %define ref_stride rdx
+ %define end_ptr rcx
+ %define ret_var rbx
+ %define result_ptr arg(4)
+ %define max_sad arg(4)
+ %define height dword ptr arg(4)
+ push rbp
+ mov rbp, rsp
+ push rsi
+ push rdi
+ push rbx
+
+ mov rsi, arg(0) ; src_ptr
+ mov rdi, arg(2) ; ref_ptr
+
+ movsxd rax, dword ptr arg(1) ; src_stride
+ movsxd rdx, dword ptr arg(3) ; ref_stride
+%else
+ %if LIBVPX_YASM_WIN64
+ SAVE_XMM 7, u
+ %define src_ptr rcx
+ %define src_stride rdx
+ %define ref_ptr r8
+ %define ref_stride r9
+ %define end_ptr r10
+ %define ret_var r11
+ %define result_ptr [rsp+xmm_stack_space+8+4*8]
+ %define max_sad [rsp+xmm_stack_space+8+4*8]
+ %define height dword ptr [rsp+xmm_stack_space+8+4*8]
+ %else
+ %define src_ptr rdi
+ %define src_stride rsi
+ %define ref_ptr rdx
+ %define ref_stride rcx
+ %define end_ptr r9
+ %define ret_var r10
+ %define result_ptr r8
+ %define max_sad r8
+ %define height r8
+ %endif
+%endif
+
+%endmacro
+
+%macro STACK_FRAME_DESTROY_X3 0
+ %define src_ptr
+ %define src_stride
+ %define ref_ptr
+ %define ref_stride
+ %define end_ptr
+ %define ret_var
+ %define result_ptr
+ %define max_sad
+ %define height
+
+%if ABI_IS_32BIT
+ pop rbx
+ pop rdi
+ pop rsi
+ pop rbp
+%else
+ %if LIBVPX_YASM_WIN64
+ RESTORE_XMM
+ %endif
+%endif
+ ret
+%endmacro
+
+
+;void vp8_copy32xn_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *dst_ptr,
+; int dst_stride,
+; int height);
+global sym(vp8_copy32xn_sse3) PRIVATE
+sym(vp8_copy32xn_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+.block_copy_sse3_loopx4:
+ lea end_ptr, [src_ptr+src_stride*2]
+
+ movdqu xmm0, XMMWORD PTR [src_ptr]
+ movdqu xmm1, XMMWORD PTR [src_ptr + 16]
+ movdqu xmm2, XMMWORD PTR [src_ptr + src_stride]
+ movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
+ movdqu xmm4, XMMWORD PTR [end_ptr]
+ movdqu xmm5, XMMWORD PTR [end_ptr + 16]
+ movdqu xmm6, XMMWORD PTR [end_ptr + src_stride]
+ movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16]
+
+ lea src_ptr, [src_ptr+src_stride*4]
+
+ lea end_ptr, [ref_ptr+ref_stride*2]
+
+ movdqa XMMWORD PTR [ref_ptr], xmm0
+ movdqa XMMWORD PTR [ref_ptr + 16], xmm1
+ movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2
+ movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
+ movdqa XMMWORD PTR [end_ptr], xmm4
+ movdqa XMMWORD PTR [end_ptr + 16], xmm5
+ movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6
+ movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
+
+ lea ref_ptr, [ref_ptr+ref_stride*4]
+
+ sub height, 4
+ cmp height, 4
+ jge .block_copy_sse3_loopx4
+
+ ;Check to see if there is more rows need to be copied.
+ cmp height, 0
+ je .copy_is_done
+
+.block_copy_sse3_loop:
+ movdqu xmm0, XMMWORD PTR [src_ptr]
+ movdqu xmm1, XMMWORD PTR [src_ptr + 16]
+ lea src_ptr, [src_ptr+src_stride]
+
+ movdqa XMMWORD PTR [ref_ptr], xmm0
+ movdqa XMMWORD PTR [ref_ptr + 16], xmm1
+ lea ref_ptr, [ref_ptr+ref_stride]
+
+ sub height, 1
+ jne .block_copy_sse3_loop
+
+.copy_is_done:
+ STACK_FRAME_DESTROY_X3
diff --git a/thirdparty/libvpx/vp8/common/x86/dequantize_mmx.asm b/thirdparty/libvpx/vp8/common/x86/dequantize_mmx.asm
new file mode 100644
index 0000000000..4e551f00aa
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/x86/dequantize_mmx.asm
@@ -0,0 +1,258 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q)
+global sym(vp8_dequantize_b_impl_mmx) PRIVATE
+sym(vp8_dequantize_b_impl_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;sq
+ mov rdi, arg(1) ;dq
+ mov rax, arg(2) ;q
+
+ movq mm1, [rsi]
+ pmullw mm1, [rax+0] ; mm4 *= kernel 0 modifiers.
+ movq [rdi], mm1
+
+ movq mm1, [rsi+8]
+ pmullw mm1, [rax+8] ; mm4 *= kernel 0 modifiers.
+ movq [rdi+8], mm1
+
+ movq mm1, [rsi+16]
+ pmullw mm1, [rax+16] ; mm4 *= kernel 0 modifiers.
+ movq [rdi+16], mm1
+
+ movq mm1, [rsi+24]
+ pmullw mm1, [rax+24] ; mm4 *= kernel 0 modifiers.
+ movq [rdi+24], mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void dequant_idct_add_mmx(
+;short *input, 0
+;short *dq, 1
+;unsigned char *dest, 2
+;int stride) 3
+global sym(vp8_dequant_idct_add_mmx) PRIVATE
+sym(vp8_dequant_idct_add_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ GET_GOT rbx
+ push rdi
+ ; end prolog
+
+ mov rax, arg(0) ;input
+ mov rdx, arg(1) ;dq
+
+
+ movq mm0, [rax ]
+ pmullw mm0, [rdx]
+
+ movq mm1, [rax +8]
+ pmullw mm1, [rdx +8]
+
+ movq mm2, [rax+16]
+ pmullw mm2, [rdx+16]
+
+ movq mm3, [rax+24]
+ pmullw mm3, [rdx+24]
+
+ mov rdx, arg(2) ;dest
+
+ pxor mm7, mm7
+
+
+ movq [rax], mm7
+ movq [rax+8], mm7
+
+ movq [rax+16],mm7
+ movq [rax+24],mm7
+
+
+ movsxd rdi, dword ptr arg(3) ;stride
+
+ psubw mm0, mm2 ; b1= 0-2
+ paddw mm2, mm2 ;
+
+ movq mm5, mm1
+ paddw mm2, mm0 ; a1 =0+2
+
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
+ paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movq mm7, mm3 ;
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
+
+ paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw mm7, mm5 ; c1
+
+ movq mm5, mm1
+ movq mm4, mm3
+
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
+ paddw mm5, mm1
+
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
+ paddw mm3, mm4
+
+ paddw mm3, mm5 ; d1
+ movq mm6, mm2 ; a1
+
+ movq mm4, mm0 ; b1
+ paddw mm2, mm3 ;0
+
+ paddw mm4, mm7 ;1
+ psubw mm0, mm7 ;2
+
+ psubw mm6, mm3 ;3
+
+ movq mm1, mm2 ; 03 02 01 00
+ movq mm3, mm4 ; 23 22 21 20
+
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm2, mm0 ; 13 03 12 02
+
+ punpcklwd mm3, mm6 ; 31 21 30 20
+ punpckhwd mm4, mm6 ; 33 23 32 22
+
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm5, mm2 ; 13 03 12 02
+
+ punpckldq mm0, mm3 ; 30 20 10 00
+ punpckhdq mm1, mm3 ; 31 21 11 01
+
+ punpckldq mm2, mm4 ; 32 22 12 02
+ punpckhdq mm5, mm4 ; 33 23 13 03
+
+ movq mm3, mm5 ; 33 23 13 03
+
+ psubw mm0, mm2 ; b1= 0-2
+ paddw mm2, mm2 ;
+
+ movq mm5, mm1
+ paddw mm2, mm0 ; a1 =0+2
+
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
+ paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movq mm7, mm3 ;
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
+
+ paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw mm7, mm5 ; c1
+
+ movq mm5, mm1
+ movq mm4, mm3
+
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
+ paddw mm5, mm1
+
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
+ paddw mm3, mm4
+
+ paddw mm3, mm5 ; d1
+ paddw mm0, [GLOBAL(fours)]
+
+ paddw mm2, [GLOBAL(fours)]
+ movq mm6, mm2 ; a1
+
+ movq mm4, mm0 ; b1
+ paddw mm2, mm3 ;0
+
+ paddw mm4, mm7 ;1
+ psubw mm0, mm7 ;2
+
+ psubw mm6, mm3 ;3
+ psraw mm2, 3
+
+ psraw mm0, 3
+ psraw mm4, 3
+
+ psraw mm6, 3
+
+ movq mm1, mm2 ; 03 02 01 00
+ movq mm3, mm4 ; 23 22 21 20
+
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm2, mm0 ; 13 03 12 02
+
+ punpcklwd mm3, mm6 ; 31 21 30 20
+ punpckhwd mm4, mm6 ; 33 23 32 22
+
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm5, mm2 ; 13 03 12 02
+
+ punpckldq mm0, mm3 ; 30 20 10 00
+ punpckhdq mm1, mm3 ; 31 21 11 01
+
+ punpckldq mm2, mm4 ; 32 22 12 02
+ punpckhdq mm5, mm4 ; 33 23 13 03
+
+ pxor mm7, mm7
+
+ movd mm4, [rdx]
+ punpcklbw mm4, mm7
+ paddsw mm0, mm4
+ packuswb mm0, mm7
+ movd [rdx], mm0
+
+ movd mm4, [rdx+rdi]
+ punpcklbw mm4, mm7
+ paddsw mm1, mm4
+ packuswb mm1, mm7
+ movd [rdx+rdi], mm1
+
+ movd mm4, [rdx+2*rdi]
+ punpcklbw mm4, mm7
+ paddsw mm2, mm4
+ packuswb mm2, mm7
+ movd [rdx+rdi*2], mm2
+
+ add rdx, rdi
+
+ movd mm4, [rdx+2*rdi]
+ punpcklbw mm4, mm7
+ paddsw mm5, mm4
+ packuswb mm5, mm7
+ movd [rdx+rdi*2], mm5
+
+ ; begin epilog
+ pop rdi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+x_s1sqr2:
+ times 4 dw 0x8A8C
+align 16
+x_c1sqr2less1:
+ times 4 dw 0x4E7B
+align 16
+fours:
+ times 4 dw 0x0004
diff --git a/thirdparty/libvpx/vp8/common/x86/filter_x86.c b/thirdparty/libvpx/vp8/common/x86/filter_x86.c
new file mode 100644
index 0000000000..7f496ed7db
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/x86/filter_x86.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8/common/x86/filter_x86.h"
+
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) =
+{
+ { 128, 128, 128, 128, 0, 0, 0, 0 },
+ { 112, 112, 112, 112, 16, 16, 16, 16 },
+ { 96, 96, 96, 96, 32, 32, 32, 32 },
+ { 80, 80, 80, 80, 48, 48, 48, 48 },
+ { 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 48, 48, 48, 48, 80, 80, 80, 80 },
+ { 32, 32, 32, 32, 96, 96, 96, 96 },
+ { 16, 16, 16, 16, 112, 112, 112, 112 }
+};
+
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) =
+{
+ { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
+ { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
+ { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
+ { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
+ { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 }
+};
diff --git a/thirdparty/libvpx/vp8/common/x86/filter_x86.h b/thirdparty/libvpx/vp8/common/x86/filter_x86.h
new file mode 100644
index 0000000000..d282841bee
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/x86/filter_x86.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_COMMON_X86_FILTER_X86_H_
+#define VP8_COMMON_X86_FILTER_X86_H_
+
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with
+ * duplicated values */
+
+/* duplicated 4x */
+extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]);
+
+/* duplicated 8x */
+extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_X86_FILTER_X86_H_
diff --git a/thirdparty/libvpx/vp8/common/x86/idct_blk_mmx.c b/thirdparty/libvpx/vp8/common/x86/idct_blk_mmx.c
new file mode 100644
index 0000000000..f2532b34da
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/x86/idct_blk_mmx.c
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vpx_mem/vpx_mem.h"
+
+extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
+
+void vp8_dequantize_b_mmx(BLOCKD *d, short *DQC)
+{
+ short *sq = (short *) d->qcoeff;
+ short *dq = (short *) d->dqcoeff;
+
+ vp8_dequantize_b_impl_mmx(sq, dq, DQC);
+}
+
+void vp8_dequant_idct_add_y_block_mmx
+ (short *q, short *dq,
+ unsigned char *dst, int stride, char *eobs)
+{
+ int i;
+
+ for (i = 0; i < 4; i++)
+ {
+ if (eobs[0] > 1)
+ vp8_dequant_idct_add_mmx (q, dq, dst, stride);
+ else if (eobs[0] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[0]*dq[0], dst, stride, dst, stride);
+ memset(q, 0, 2 * sizeof(q[0]));
+ }
+
+ if (eobs[1] > 1)
+ vp8_dequant_idct_add_mmx (q+16, dq, dst+4, stride);
+ else if (eobs[1] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[16]*dq[0], dst+4, stride,
+ dst+4, stride);
+ memset(q + 16, 0, 2 * sizeof(q[0]));
+ }
+
+ if (eobs[2] > 1)
+ vp8_dequant_idct_add_mmx (q+32, dq, dst+8, stride);
+ else if (eobs[2] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[32]*dq[0], dst+8, stride,
+ dst+8, stride);
+ memset(q + 32, 0, 2 * sizeof(q[0]));
+ }
+
+ if (eobs[3] > 1)
+ vp8_dequant_idct_add_mmx (q+48, dq, dst+12, stride);
+ else if (eobs[3] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[48]*dq[0], dst+12, stride,
+ dst+12, stride);
+ memset(q + 48, 0, 2 * sizeof(q[0]));
+ }
+
+ q += 64;
+ dst += 4*stride;
+ eobs += 4;
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_mmx
+ (short *q, short *dq,
+ unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+ int i;
+
+ for (i = 0; i < 2; i++)
+ {
+ if (eobs[0] > 1)
+ vp8_dequant_idct_add_mmx (q, dq, dstu, stride);
+ else if (eobs[0] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstu, stride, dstu, stride);
+ memset(q, 0, 2 * sizeof(q[0]));
+ }
+
+ if (eobs[1] > 1)
+ vp8_dequant_idct_add_mmx (q+16, dq, dstu+4, stride);
+ else if (eobs[1] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstu+4, stride,
+ dstu+4, stride);
+ memset(q + 16, 0, 2 * sizeof(q[0]));
+ }
+
+ q += 32;
+ dstu += 4*stride;
+ eobs += 2;
+ }
+
+ for (i = 0; i < 2; i++)
+ {
+ if (eobs[0] > 1)
+ vp8_dequant_idct_add_mmx (q, dq, dstv, stride);
+ else if (eobs[0] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstv, stride, dstv, stride);
+ memset(q, 0, 2 * sizeof(q[0]));
+ }
+
+ if (eobs[1] > 1)
+ vp8_dequant_idct_add_mmx (q+16, dq, dstv+4, stride);
+ else if (eobs[1] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstv+4, stride,
+ dstv+4, stride);
+ memset(q + 16, 0, 2 * sizeof(q[0]));
+ }
+
+ q += 32;
+ dstv += 4*stride;
+ eobs += 2;
+ }
+}
diff --git a/thirdparty/libvpx/vp8/common/x86/idct_blk_sse2.c b/thirdparty/libvpx/vp8/common/x86/idct_blk_sse2.c
new file mode 100644
index 0000000000..ae96ec858c
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/x86/idct_blk_sse2.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+
+void vp8_idct_dequant_0_2x_sse2
+ (short *q, short *dq ,
+ unsigned char *dst, int dst_stride);
+void vp8_idct_dequant_full_2x_sse2
+ (short *q, short *dq ,
+ unsigned char *dst, int dst_stride);
+
+void vp8_dequant_idct_add_y_block_sse2
+ (short *q, short *dq,
+ unsigned char *dst, int stride, char *eobs)
+{
+ int i;
+
+ for (i = 0; i < 4; i++)
+ {
+ if (((short *)(eobs))[0])
+ {
+ if (((short *)(eobs))[0] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dst, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dst, stride);
+ }
+ if (((short *)(eobs))[1])
+ {
+ if (((short *)(eobs))[1] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q+32, dq, dst+8, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q+32, dq, dst+8, stride);
+ }
+ q += 64;
+ dst += stride*4;
+ eobs += 4;
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_sse2
+ (short *q, short *dq,
+ unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+ if (((short *)(eobs))[0])
+ {
+ if (((short *)(eobs))[0] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride);
+ }
+ q += 32;
+ dstu += stride*4;
+
+ if (((short *)(eobs))[1])
+ {
+ if (((short *)(eobs))[1] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride);
+ }
+ q += 32;
+
+ if (((short *)(eobs))[2])
+ {
+ if (((short *)(eobs))[2] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride);
+ }
+ q += 32;
+ dstv += stride*4;
+
+ if (((short *)(eobs))[3])
+ {
+ if (((short *)(eobs))[3] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride);
+ }
+}
diff --git a/thirdparty/libvpx/vp8/common/x86/idctllm_mmx.asm b/thirdparty/libvpx/vp8/common/x86/idctllm_mmx.asm
new file mode 100644
index 0000000000..96fa2c60d0
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/x86/idctllm_mmx.asm
@@ -0,0 +1,295 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; /****************************************************************************
+; * Notes:
+; *
+; * This implementation makes use of 16 bit fixed point version of two multiply
+; * constants:
+; * 1. sqrt(2) * cos (pi/8)
+; * 2. sqrt(2) * sin (pi/8)
+; * Because the first constant is bigger than 1, to maintain the same 16 bit
+; * fixed point precision as the second one, we use a trick of
+; * x * a = x + x*(a-1)
+; * so
+; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
+; *
+; * For the second constant, because of the 16bit version is 35468, which
+; * is bigger than 32768, in signed 16 bit multiply, it becomes a negative
+; * number.
+; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x
+; *
+; **************************************************************************/
+
+
+;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
+;int pitch, unsigned char *dest,int stride)
+global sym(vp8_short_idct4x4llm_mmx) PRIVATE
+sym(vp8_short_idct4x4llm_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(0) ;input
+ mov rsi, arg(1) ;pred
+
+ movq mm0, [rax ]
+ movq mm1, [rax+ 8]
+ movq mm2, [rax+16]
+ movq mm3, [rax+24]
+
+%if 0
+ pxor mm7, mm7
+ movq [rax], mm7
+ movq [rax+8], mm7
+ movq [rax+16],mm7
+ movq [rax+24],mm7
+%endif
+ movsxd rax, dword ptr arg(2) ;pitch
+ mov rdx, arg(3) ;dest
+ movsxd rdi, dword ptr arg(4) ;stride
+
+
+ psubw mm0, mm2 ; b1= 0-2
+ paddw mm2, mm2 ;
+
+ movq mm5, mm1
+ paddw mm2, mm0 ; a1 =0+2
+
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
+ paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movq mm7, mm3 ;
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
+
+ paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw mm7, mm5 ; c1
+
+ movq mm5, mm1
+ movq mm4, mm3
+
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
+ paddw mm5, mm1
+
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
+ paddw mm3, mm4
+
+ paddw mm3, mm5 ; d1
+ movq mm6, mm2 ; a1
+
+ movq mm4, mm0 ; b1
+ paddw mm2, mm3 ;0
+
+ paddw mm4, mm7 ;1
+ psubw mm0, mm7 ;2
+
+ psubw mm6, mm3 ;3
+
+ movq mm1, mm2 ; 03 02 01 00
+ movq mm3, mm4 ; 23 22 21 20
+
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm2, mm0 ; 13 03 12 02
+
+ punpcklwd mm3, mm6 ; 31 21 30 20
+ punpckhwd mm4, mm6 ; 33 23 32 22
+
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm5, mm2 ; 13 03 12 02
+
+ punpckldq mm0, mm3 ; 30 20 10 00
+ punpckhdq mm1, mm3 ; 31 21 11 01
+
+ punpckldq mm2, mm4 ; 32 22 12 02
+ punpckhdq mm5, mm4 ; 33 23 13 03
+
+ movq mm3, mm5 ; 33 23 13 03
+
+ psubw mm0, mm2 ; b1= 0-2
+ paddw mm2, mm2 ;
+
+ movq mm5, mm1
+ paddw mm2, mm0 ; a1 =0+2
+
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
+ paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movq mm7, mm3 ;
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
+
+ paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw mm7, mm5 ; c1
+
+ movq mm5, mm1
+ movq mm4, mm3
+
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
+ paddw mm5, mm1
+
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
+ paddw mm3, mm4
+
+ paddw mm3, mm5 ; d1
+ paddw mm0, [GLOBAL(fours)]
+
+ paddw mm2, [GLOBAL(fours)]
+ movq mm6, mm2 ; a1
+
+ movq mm4, mm0 ; b1
+ paddw mm2, mm3 ;0
+
+ paddw mm4, mm7 ;1
+ psubw mm0, mm7 ;2
+
+ psubw mm6, mm3 ;3
+ psraw mm2, 3
+
+ psraw mm0, 3
+ psraw mm4, 3
+
+ psraw mm6, 3
+
+ movq mm1, mm2 ; 03 02 01 00
+ movq mm3, mm4 ; 23 22 21 20
+
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm2, mm0 ; 13 03 12 02
+
+ punpcklwd mm3, mm6 ; 31 21 30 20
+ punpckhwd mm4, mm6 ; 33 23 32 22
+
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm5, mm2 ; 13 03 12 02
+
+ punpckldq mm0, mm3 ; 30 20 10 00
+ punpckhdq mm1, mm3 ; 31 21 11 01
+
+ punpckldq mm2, mm4 ; 32 22 12 02
+ punpckhdq mm5, mm4 ; 33 23 13 03
+
+ pxor mm7, mm7
+
+ movd mm4, [rsi]
+ punpcklbw mm4, mm7
+ paddsw mm0, mm4
+ packuswb mm0, mm7
+ movd [rdx], mm0
+
+ movd mm4, [rsi+rax]
+ punpcklbw mm4, mm7
+ paddsw mm1, mm4
+ packuswb mm1, mm7
+ movd [rdx+rdi], mm1
+
+ movd mm4, [rsi+2*rax]
+ punpcklbw mm4, mm7
+ paddsw mm2, mm4
+ packuswb mm2, mm7
+ movd [rdx+rdi*2], mm2
+
+ add rdx, rdi
+ add rsi, rax
+
+ movd mm4, [rsi+2*rax]
+ punpcklbw mm4, mm7
+ paddsw mm5, mm4
+ packuswb mm5, mm7
+ movd [rdx+rdi*2], mm5
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_dc_only_idct_add_mmx(
+;short input_dc,
+;unsigned char *pred_ptr,
+;int pred_stride,
+;unsigned char *dst_ptr,
+;int stride)
+global sym(vp8_dc_only_idct_add_mmx) PRIVATE
+sym(vp8_dc_only_idct_add_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ ; end prolog
+
+ movd mm5, arg(0) ;input_dc
+ mov rax, arg(1) ;pred_ptr
+ movsxd rdx, dword ptr arg(2) ;pred_stride
+
+ pxor mm0, mm0
+
+ paddw mm5, [GLOBAL(fours)]
+ lea rcx, [rdx + rdx*2]
+
+ psraw mm5, 3
+
+ punpcklwd mm5, mm5
+
+ punpckldq mm5, mm5
+
+ movd mm1, [rax]
+ movd mm2, [rax+rdx]
+ movd mm3, [rax+2*rdx]
+ movd mm4, [rax+rcx]
+
+ mov rax, arg(3) ;d -- destination
+ movsxd rdx, dword ptr arg(4) ;dst_stride
+
+ punpcklbw mm1, mm0
+ paddsw mm1, mm5
+ packuswb mm1, mm0 ; pack and unpack to saturate
+ lea rcx, [rdx + rdx*2]
+
+ punpcklbw mm2, mm0
+ paddsw mm2, mm5
+ packuswb mm2, mm0 ; pack and unpack to saturate
+
+ punpcklbw mm3, mm0
+ paddsw mm3, mm5
+ packuswb mm3, mm0 ; pack and unpack to saturate
+
+ punpcklbw mm4, mm0
+ paddsw mm4, mm5
+ packuswb mm4, mm0 ; pack and unpack to saturate
+
+ movd [rax], mm1
+ movd [rax+rdx], mm2
+ movd [rax+2*rdx], mm3
+ movd [rax+rcx], mm4
+
+ ; begin epilog
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+x_s1sqr2:
+ times 4 dw 0x8A8C
+align 16
+x_c1sqr2less1:
+ times 4 dw 0x4E7B
+align 16
+fours:
+ times 4 dw 0x0004
diff --git a/thirdparty/libvpx/vp8/common/x86/idctllm_sse2.asm b/thirdparty/libvpx/vp8/common/x86/idctllm_sse2.asm
new file mode 100644
index 0000000000..bf8e2c4021
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/x86/idctllm_sse2.asm
@@ -0,0 +1,708 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_idct_dequant_0_2x_sse2
+; (
+; short *qcoeff - 0
+; short *dequant - 1
+; unsigned char *dst - 2
+; int dst_stride - 3
+; )
+
+global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
+sym(vp8_idct_dequant_0_2x_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ GET_GOT rbx
+ ; end prolog
+
+ mov rdx, arg(1) ; dequant
+ mov rax, arg(0) ; qcoeff
+
+ movd xmm4, [rax]
+ movd xmm5, [rdx]
+
+ pinsrw xmm4, [rax+32], 4
+ pinsrw xmm5, [rdx], 4
+
+ pmullw xmm4, xmm5
+
+ ; Zero out xmm5, for use unpacking
+ pxor xmm5, xmm5
+
+ ; clear coeffs
+ movd [rax], xmm5
+ movd [rax+32], xmm5
+;pshufb
+ mov rax, arg(2) ; dst
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+
+ pshuflw xmm4, xmm4, 00000000b
+ pshufhw xmm4, xmm4, 00000000b
+
+ lea rcx, [rdx + rdx*2]
+ paddw xmm4, [GLOBAL(fours)]
+
+ psraw xmm4, 3
+
+ movq xmm0, [rax]
+ movq xmm1, [rax+rdx]
+ movq xmm2, [rax+2*rdx]
+ movq xmm3, [rax+rcx]
+
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+
+
+ ; Add to predict buffer
+ paddw xmm0, xmm4
+ paddw xmm1, xmm4
+ paddw xmm2, xmm4
+ paddw xmm3, xmm4
+
+ ; pack up before storing
+ packuswb xmm0, xmm5
+ packuswb xmm1, xmm5
+ packuswb xmm2, xmm5
+ packuswb xmm3, xmm5
+
+ ; store blocks back out
+ movq [rax], xmm0
+ movq [rax + rdx], xmm1
+
+ lea rax, [rax + 2*rdx]
+
+ movq [rax], xmm2
+ movq [rax + rdx], xmm3
+
+ ; begin epilog
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_idct_dequant_full_2x_sse2
+; (
+; short *qcoeff - 0
+; short *dequant - 1
+; unsigned char *dst - 2
+; int dst_stride - 3
+; )
+global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
+sym(vp8_idct_dequant_full_2x_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; special case when 2 blocks have 0 or 1 coeffs
+ ; dc is set as first coeff, so no need to load qcoeff
+ mov rax, arg(0) ; qcoeff
+ mov rdx, arg(1) ; dequant
+ mov rdi, arg(2) ; dst
+
+
+ ; Zero out xmm7, for use unpacking
+ pxor xmm7, xmm7
+
+
+ ; note the transpose of xmm1 and xmm2, necessary for shuffle
+ ; to spit out sensicle data
+ movdqa xmm0, [rax]
+ movdqa xmm2, [rax+16]
+ movdqa xmm1, [rax+32]
+ movdqa xmm3, [rax+48]
+
+ ; Clear out coeffs
+ movdqa [rax], xmm7
+ movdqa [rax+16], xmm7
+ movdqa [rax+32], xmm7
+ movdqa [rax+48], xmm7
+
+ ; dequantize qcoeff buffer
+ pmullw xmm0, [rdx]
+ pmullw xmm2, [rdx+16]
+ pmullw xmm1, [rdx]
+ pmullw xmm3, [rdx+16]
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+
+ ; repack so block 0 row x and block 1 row x are together
+ movdqa xmm4, xmm0
+ punpckldq xmm0, xmm1
+ punpckhdq xmm4, xmm1
+
+ pshufd xmm0, xmm0, 11011000b
+ pshufd xmm1, xmm4, 11011000b
+
+ movdqa xmm4, xmm2
+ punpckldq xmm2, xmm3
+ punpckhdq xmm4, xmm3
+
+ pshufd xmm2, xmm2, 11011000b
+ pshufd xmm3, xmm4, 11011000b
+
+ ; first pass
+ psubw xmm0, xmm2 ; b1 = 0-2
+ paddw xmm2, xmm2 ;
+
+ movdqa xmm5, xmm1
+ paddw xmm2, xmm0 ; a1 = 0+2
+
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ lea rcx, [rdx + rdx*2] ;dst_stride * 3
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movdqa xmm7, xmm3
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
+
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw xmm7, xmm5 ; c1
+
+ movdqa xmm5, xmm1
+ movdqa xmm4, xmm3
+
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
+ paddw xmm5, xmm1
+
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
+ paddw xmm3, xmm4
+
+ paddw xmm3, xmm5 ; d1
+ movdqa xmm6, xmm2 ; a1
+
+ movdqa xmm4, xmm0 ; b1
+ paddw xmm2, xmm3 ;0
+
+ paddw xmm4, xmm7 ;1
+ psubw xmm0, xmm7 ;2
+
+ psubw xmm6, xmm3 ;3
+
+ ; transpose for the second pass
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
+
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
+
+
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
+
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
+
+
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
+
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
+
+ pshufd xmm0, xmm2, 11011000b
+ pshufd xmm2, xmm1, 11011000b
+
+ pshufd xmm1, xmm5, 11011000b
+ pshufd xmm3, xmm7, 11011000b
+
+ ; second pass
+ psubw xmm0, xmm2 ; b1 = 0-2
+ paddw xmm2, xmm2
+
+ movdqa xmm5, xmm1
+ paddw xmm2, xmm0 ; a1 = 0+2
+
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movdqa xmm7, xmm3
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
+
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw xmm7, xmm5 ; c1
+
+ movdqa xmm5, xmm1
+ movdqa xmm4, xmm3
+
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
+ paddw xmm5, xmm1
+
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
+ paddw xmm3, xmm4
+
+ paddw xmm3, xmm5 ; d1
+ paddw xmm0, [GLOBAL(fours)]
+
+ paddw xmm2, [GLOBAL(fours)]
+ movdqa xmm6, xmm2 ; a1
+
+ movdqa xmm4, xmm0 ; b1
+ paddw xmm2, xmm3 ;0
+
+ paddw xmm4, xmm7 ;1
+ psubw xmm0, xmm7 ;2
+
+ psubw xmm6, xmm3 ;3
+ psraw xmm2, 3
+
+ psraw xmm0, 3
+ psraw xmm4, 3
+
+ psraw xmm6, 3
+
+ ; transpose to save
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
+
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
+
+
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
+
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
+
+
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
+
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
+
+ pshufd xmm0, xmm2, 11011000b
+ pshufd xmm2, xmm1, 11011000b
+
+ pshufd xmm1, xmm5, 11011000b
+ pshufd xmm3, xmm7, 11011000b
+
+ pxor xmm7, xmm7
+
+ ; Load up predict blocks
+ movq xmm4, [rdi]
+ movq xmm5, [rdi+rdx]
+
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+
+ movq xmm4, [rdi+2*rdx]
+ movq xmm5, [rdi+rcx]
+
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+.finish:
+
+ ; pack up before storing
+ packuswb xmm0, xmm7
+ packuswb xmm1, xmm7
+ packuswb xmm2, xmm7
+ packuswb xmm3, xmm7
+
+ ; store blocks back out
+ movq [rdi], xmm0
+ movq [rdi + rdx], xmm1
+ movq [rdi + rdx*2], xmm2
+ movq [rdi + rcx], xmm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_idct_dequant_dc_0_2x_sse2
+; (
+; short *qcoeff - 0
+; short *dequant - 1
+; unsigned char *dst - 2
+; int dst_stride - 3
+; short *dc - 4
+; )
+global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
+sym(vp8_idct_dequant_dc_0_2x_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rdi
+ ; end prolog
+
+ ; special case when 2 blocks have 0 or 1 coeffs
+ ; dc is set as first coeff, so no need to load qcoeff
+ mov rax, arg(0) ; qcoeff
+
+ mov rdi, arg(2) ; dst
+ mov rdx, arg(4) ; dc
+
+ ; Zero out xmm5, for use unpacking
+ pxor xmm5, xmm5
+
+ ; load up 2 dc words here == 2*16 = doubleword
+ movd xmm4, [rdx]
+
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+ lea rcx, [rdx + rdx*2]
+ ; Load up predict blocks
+ movq xmm0, [rdi]
+ movq xmm1, [rdi+rdx*1]
+ movq xmm2, [rdi+rdx*2]
+ movq xmm3, [rdi+rcx]
+
+ ; Duplicate and expand dc across
+ punpcklwd xmm4, xmm4
+ punpckldq xmm4, xmm4
+
+ ; Rounding to dequant and downshift
+ paddw xmm4, [GLOBAL(fours)]
+ psraw xmm4, 3
+
+ ; Predict buffer needs to be expanded from bytes to words
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+
+ ; Add to predict buffer
+ paddw xmm0, xmm4
+ paddw xmm1, xmm4
+ paddw xmm2, xmm4
+ paddw xmm3, xmm4
+
+ ; pack up before storing
+ packuswb xmm0, xmm5
+ packuswb xmm1, xmm5
+ packuswb xmm2, xmm5
+ packuswb xmm3, xmm5
+
+ ; store blocks back out
+ movq [rdi], xmm0
+ movq [rdi + rdx], xmm1
+ movq [rdi + rdx*2], xmm2
+ movq [rdi + rcx], xmm3
+
+ ; begin epilog
+ pop rdi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+;void vp8_idct_dequant_dc_full_2x_sse2
+; (
+; short *qcoeff - 0
+; short *dequant - 1
+; unsigned char *dst - 2
+; int dst_stride - 3
+; short *dc - 4
+; )
+global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
+sym(vp8_idct_dequant_dc_full_2x_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rdi
+ ; end prolog
+
+ ; special case when 2 blocks have 0 or 1 coeffs
+ ; dc is set as first coeff, so no need to load qcoeff
+ mov rax, arg(0) ; qcoeff
+ mov rdx, arg(1) ; dequant
+
+ mov rdi, arg(2) ; dst
+
+ ; Zero out xmm7, for use unpacking
+ pxor xmm7, xmm7
+
+
+ ; note the transpose of xmm1 and xmm2, necessary for shuffle
+ ; to spit out sensicle data
+ movdqa xmm0, [rax]
+ movdqa xmm2, [rax+16]
+ movdqa xmm1, [rax+32]
+ movdqa xmm3, [rax+48]
+
+ ; Clear out coeffs
+ movdqa [rax], xmm7
+ movdqa [rax+16], xmm7
+ movdqa [rax+32], xmm7
+ movdqa [rax+48], xmm7
+
+ ; dequantize qcoeff buffer
+ pmullw xmm0, [rdx]
+ pmullw xmm2, [rdx+16]
+ pmullw xmm1, [rdx]
+ pmullw xmm3, [rdx+16]
+
+ ; DC component
+ mov rdx, arg(4)
+
+ ; repack so block 0 row x and block 1 row x are together
+ movdqa xmm4, xmm0
+ punpckldq xmm0, xmm1
+ punpckhdq xmm4, xmm1
+
+ pshufd xmm0, xmm0, 11011000b
+ pshufd xmm1, xmm4, 11011000b
+
+ movdqa xmm4, xmm2
+ punpckldq xmm2, xmm3
+ punpckhdq xmm4, xmm3
+
+ pshufd xmm2, xmm2, 11011000b
+ pshufd xmm3, xmm4, 11011000b
+
+ ; insert DC component
+ pinsrw xmm0, [rdx], 0
+ pinsrw xmm0, [rdx+2], 4
+
+ ; first pass
+ psubw xmm0, xmm2 ; b1 = 0-2
+ paddw xmm2, xmm2 ;
+
+ movdqa xmm5, xmm1
+ paddw xmm2, xmm0 ; a1 = 0+2
+
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movdqa xmm7, xmm3
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
+
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw xmm7, xmm5 ; c1
+
+ movdqa xmm5, xmm1
+ movdqa xmm4, xmm3
+
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
+ paddw xmm5, xmm1
+
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
+ paddw xmm3, xmm4
+
+ paddw xmm3, xmm5 ; d1
+ movdqa xmm6, xmm2 ; a1
+
+ movdqa xmm4, xmm0 ; b1
+ paddw xmm2, xmm3 ;0
+
+ paddw xmm4, xmm7 ;1
+ psubw xmm0, xmm7 ;2
+
+ psubw xmm6, xmm3 ;3
+
+ ; transpose for the second pass
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
+
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
+
+
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
+
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
+
+
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
+
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
+
+ pshufd xmm0, xmm2, 11011000b
+ pshufd xmm2, xmm1, 11011000b
+
+ pshufd xmm1, xmm5, 11011000b
+ pshufd xmm3, xmm7, 11011000b
+
+ ; second pass
+ psubw xmm0, xmm2 ; b1 = 0-2
+ paddw xmm2, xmm2
+
+ movdqa xmm5, xmm1
+ paddw xmm2, xmm0 ; a1 = 0+2
+
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movdqa xmm7, xmm3
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
+
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw xmm7, xmm5 ; c1
+
+ movdqa xmm5, xmm1
+ movdqa xmm4, xmm3
+
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
+ paddw xmm5, xmm1
+
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
+ paddw xmm3, xmm4
+
+ paddw xmm3, xmm5 ; d1
+ paddw xmm0, [GLOBAL(fours)]
+
+ paddw xmm2, [GLOBAL(fours)]
+ movdqa xmm6, xmm2 ; a1
+
+ movdqa xmm4, xmm0 ; b1
+ paddw xmm2, xmm3 ;0
+
+ paddw xmm4, xmm7 ;1
+ psubw xmm0, xmm7 ;2
+
+ psubw xmm6, xmm3 ;3
+ psraw xmm2, 3
+
+ psraw xmm0, 3
+ psraw xmm4, 3
+
+ psraw xmm6, 3
+
+ ; transpose to save
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
+
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
+
+
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
+
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
+
+
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
+
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
+
+ pshufd xmm0, xmm2, 11011000b
+ pshufd xmm2, xmm1, 11011000b
+
+ pshufd xmm1, xmm5, 11011000b
+ pshufd xmm3, xmm7, 11011000b
+
+ pxor xmm7, xmm7
+
+ ; Load up predict blocks
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+ movq xmm4, [rdi]
+ movq xmm5, [rdi+rdx]
+ lea rcx, [rdx + rdx*2]
+
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+
+ movq xmm4, [rdi+rdx*2]
+ movq xmm5, [rdi+rcx]
+
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+.finish:
+
+ ; pack up before storing
+ packuswb xmm0, xmm7
+ packuswb xmm1, xmm7
+ packuswb xmm2, xmm7
+ packuswb xmm3, xmm7
+
+ ; Load destination stride before writing out,
+ ; doesn't need to persist
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+
+ ; store blocks back out
+ movq [rdi], xmm0
+ movq [rdi + rdx], xmm1
+
+ lea rdi, [rdi + 2*rdx]
+
+ movq [rdi], xmm2
+ movq [rdi + rdx], xmm3
+
+
+ ; begin epilog
+ pop rdi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+fours:
+ times 8 dw 0x0004
+align 16
+x_s1sqr2:
+ times 8 dw 0x8A8C
+align 16
+x_c1sqr2less1:
+ times 8 dw 0x4E7B
diff --git a/thirdparty/libvpx/vp8/common/x86/iwalsh_mmx.asm b/thirdparty/libvpx/vp8/common/x86/iwalsh_mmx.asm
new file mode 100644
index 0000000000..158c3b7458
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/x86/iwalsh_mmx.asm
@@ -0,0 +1,140 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_short_inv_walsh4x4_mmx(short *input, short *output)
+global sym(vp8_short_inv_walsh4x4_mmx) PRIVATE
+sym(vp8_short_inv_walsh4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ ; end prolog
+
+ mov rdx, arg(0)
+ mov rax, 30003h
+
+ movq mm0, [rdx + 0] ;ip[0]
+ movq mm1, [rdx + 8] ;ip[4]
+ movq mm7, rax
+
+ movq mm2, [rdx + 16] ;ip[8]
+ movq mm3, [rdx + 24] ;ip[12]
+ punpcklwd mm7, mm7 ;0003000300030003h
+ mov rdx, arg(1)
+
+ movq mm4, mm0
+ movq mm5, mm1
+
+ paddw mm4, mm3 ;ip[0] + ip[12] aka al
+ paddw mm5, mm2 ;ip[4] + ip[8] aka bl
+
+ movq mm6, mm4 ;temp al
+ paddw mm4, mm5 ;al + bl
+ psubw mm6, mm5 ;al - bl
+
+ psubw mm0, mm3 ;ip[0] - ip[12] aka d1
+ psubw mm1, mm2 ;ip[4] - ip[8] aka c1
+
+ movq mm5, mm0 ;temp dl
+ paddw mm0, mm1 ;dl + cl
+ psubw mm5, mm1 ;dl - cl
+
+ ; 03 02 01 00
+ ; 13 12 11 10
+ ; 23 22 21 20
+ ; 33 32 31 30
+
+ movq mm3, mm4 ; 03 02 01 00
+ punpcklwd mm4, mm0 ; 11 01 10 00
+ punpckhwd mm3, mm0 ; 13 03 12 02
+
+ movq mm1, mm6 ; 23 22 21 20
+ punpcklwd mm6, mm5 ; 31 21 30 20
+ punpckhwd mm1, mm5 ; 33 23 32 22
+
+ movq mm0, mm4 ; 11 01 10 00
+ movq mm2, mm3 ; 13 03 12 02
+
+ punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
+ punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4]
+
+ punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8]
+ punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12]
+;~~~~~~~~~~~~~~~~~~~~~
+ movq mm1, mm0
+ movq mm5, mm4
+ paddw mm1, mm3 ;ip[0] + ip[12] aka al
+ paddw mm5, mm2 ;ip[4] + ip[8] aka bl
+
+ movq mm6, mm1 ;temp al
+ paddw mm1, mm5 ;al + bl
+ psubw mm6, mm5 ;al - bl
+ paddw mm1, mm7
+ paddw mm6, mm7
+ psraw mm1, 3
+ psraw mm6, 3
+
+ psubw mm0, mm3 ;ip[0] - ip[12] aka d1
+ psubw mm4, mm2 ;ip[4] - ip[8] aka c1
+
+ movq mm5, mm0 ;temp dl
+ paddw mm0, mm4 ;dl + cl
+ psubw mm5, mm4 ;dl - cl
+ paddw mm0, mm7
+ paddw mm5, mm7
+ psraw mm0, 3
+ psraw mm5, 3
+;~~~~~~~~~~~~~~~~~~~~~
+
+ movd eax, mm1
+ movd ecx, mm0
+ psrlq mm0, 32
+ psrlq mm1, 32
+ mov word ptr[rdx+32*0], ax
+ mov word ptr[rdx+32*1], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*4], ax
+ mov word ptr[rdx+32*5], cx
+ movd eax, mm1
+ movd ecx, mm0
+ mov word ptr[rdx+32*8], ax
+ mov word ptr[rdx+32*9], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*12], ax
+ mov word ptr[rdx+32*13], cx
+
+ movd eax, mm6
+ movd ecx, mm5
+ psrlq mm5, 32
+ psrlq mm6, 32
+ mov word ptr[rdx+32*2], ax
+ mov word ptr[rdx+32*3], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*6], ax
+ mov word ptr[rdx+32*7], cx
+ movd eax, mm6
+ movd ecx, mm5
+ mov word ptr[rdx+32*10], ax
+ mov word ptr[rdx+32*11], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*14], ax
+ mov word ptr[rdx+32*15], cx
+
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
diff --git a/thirdparty/libvpx/vp8/common/x86/iwalsh_sse2.asm b/thirdparty/libvpx/vp8/common/x86/iwalsh_sse2.asm
new file mode 100644
index 0000000000..06e86a80b6
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/x86/iwalsh_sse2.asm
@@ -0,0 +1,121 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_short_inv_walsh4x4_sse2(short *input, short *output)
+global sym(vp8_short_inv_walsh4x4_sse2) PRIVATE
+sym(vp8_short_inv_walsh4x4_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ ; end prolog
+
+ mov rcx, arg(0)
+ mov rdx, arg(1)
+ mov rax, 30003h
+
+ movdqa xmm0, [rcx + 0] ;ip[4] ip[0]
+ movdqa xmm1, [rcx + 16] ;ip[12] ip[8]
+
+
+ pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
+ movdqa xmm3, xmm0 ;ip[4] ip[0]
+
+ paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+ psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+ movdqa xmm4, xmm0
+ punpcklqdq xmm0, xmm3 ;d1 a1
+ punpckhqdq xmm4, xmm3 ;c1 b1
+
+ movdqa xmm1, xmm4 ;c1 b1
+ paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0]
+ psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
+
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ; 13 12 11 10 03 02 01 00
+ ;
+ ; 33 32 31 30 23 22 21 20
+ ;
+ movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
+ punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
+ punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
+ movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
+ punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
+ punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ movd xmm0, eax
+ pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
+ movdqa xmm3, xmm4 ;ip[4] ip[0]
+
+ pshufd xmm0, xmm0, 0 ;03 03 03 03 03 03 03 03
+
+ paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+ psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+ movdqa xmm5, xmm4
+ punpcklqdq xmm4, xmm3 ;d1 a1
+ punpckhqdq xmm5, xmm3 ;c1 b1
+
+ movdqa xmm1, xmm5 ;c1 b1
+ paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0]
+ psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
+
+ paddw xmm5, xmm0
+ paddw xmm4, xmm0
+ psraw xmm5, 3
+ psraw xmm4, 3
+
+ movd eax, xmm5
+ movd ecx, xmm4
+ psrldq xmm5, 4
+ psrldq xmm4, 4
+ mov word ptr[rdx+32*0], ax
+ mov word ptr[rdx+32*2], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*4], ax
+ mov word ptr[rdx+32*6], cx
+ movd eax, xmm5
+ movd ecx, xmm4
+ psrldq xmm5, 4
+ psrldq xmm4, 4
+ mov word ptr[rdx+32*8], ax
+ mov word ptr[rdx+32*10], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*12], ax
+ mov word ptr[rdx+32*14], cx
+
+ movd eax, xmm5
+ movd ecx, xmm4
+ psrldq xmm5, 4
+ psrldq xmm4, 4
+ mov word ptr[rdx+32*1], ax
+ mov word ptr[rdx+32*3], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*5], ax
+ mov word ptr[rdx+32*7], cx
+ movd eax, xmm5
+ movd ecx, xmm4
+ mov word ptr[rdx+32*9], ax
+ mov word ptr[rdx+32*11], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*13], ax
+ mov word ptr[rdx+32*15], cx
+
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/thirdparty/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm b/thirdparty/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm
new file mode 100644
index 0000000000..6d5aaa19db
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm
@@ -0,0 +1,815 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro LF_ABS 2
+ ; %1 value not preserved
+ ; %2 value preserved
+ ; output in %1
+ movdqa scratch1, %2 ; v2
+
+ psubusb scratch1, %1 ; v2 - v1
+ psubusb %1, %2 ; v1 - v2
+ por %1, scratch1 ; abs(v2 - v1)
+%endmacro
+
+%macro LF_FILTER_HEV_MASK 8-9
+
+ LF_ABS %1, %2 ; abs(p3 - p2)
+ LF_ABS %2, %3 ; abs(p2 - p1)
+ pmaxub %1, %2 ; accumulate mask
+%if %0 == 8
+ movdqa scratch2, %3 ; save p1
+ LF_ABS scratch2, %4 ; abs(p1 - p0)
+%endif
+ LF_ABS %4, %5 ; abs(p0 - q0)
+ LF_ABS %5, %6 ; abs(q0 - q1)
+%if %0 == 8
+ pmaxub %5, scratch2 ; accumulate hev
+%else
+ pmaxub %5, %9
+%endif
+ pmaxub %1, %5 ; accumulate mask
+
+ LF_ABS %3, %6 ; abs(p1 - q1)
+ LF_ABS %6, %7 ; abs(q1 - q2)
+ pmaxub %1, %6 ; accumulate mask
+ LF_ABS %7, %8 ; abs(q2 - q3)
+ pmaxub %1, %7 ; accumulate mask
+
+ paddusb %4, %4 ; 2 * abs(p0 - q0)
+ pand %3, [GLOBAL(tfe)]
+ psrlw %3, 1 ; abs(p1 - q1) / 2
+ paddusb %4, %3 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+
+ psubusb %1, [limit]
+ psubusb %4, [blimit]
+ por %1, %4
+ pcmpeqb %1, zero ; mask
+
+ psubusb %5, [thresh]
+ pcmpeqb %5, zero ; ~hev
+%endmacro
+
+%macro LF_FILTER 6
+ ; %1-%4: p1-q1
+ ; %5: mask
+ ; %6: hev
+
+ movdqa scratch2, %6 ; save hev
+
+ pxor %1, [GLOBAL(t80)] ; ps1
+ pxor %4, [GLOBAL(t80)] ; qs1
+ movdqa scratch1, %1
+ psubsb scratch1, %4 ; signed_char_clamp(ps1 - qs1)
+ pandn scratch2, scratch1 ; vp8_filter &= hev
+
+ pxor %2, [GLOBAL(t80)] ; ps0
+ pxor %3, [GLOBAL(t80)] ; qs0
+ movdqa scratch1, %3
+ psubsb scratch1, %2 ; qs0 - ps0
+ paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
+ paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
+ paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
+ pand %5, scratch2 ; &= mask
+
+ movdqa scratch2, %5
+ paddsb %5, [GLOBAL(t4)] ; Filter1
+ paddsb scratch2, [GLOBAL(t3)] ; Filter2
+
+ ; Filter1 >> 3
+ movdqa scratch1, zero
+ pcmpgtb scratch1, %5
+ psrlw %5, 3
+ pand scratch1, [GLOBAL(te0)]
+ pand %5, [GLOBAL(t1f)]
+ por %5, scratch1
+
+ psubsb %3, %5 ; qs0 - Filter1
+ pxor %3, [GLOBAL(t80)]
+
+ ; Filter2 >> 3
+ movdqa scratch1, zero
+ pcmpgtb scratch1, scratch2
+ psrlw scratch2, 3
+ pand scratch1, [GLOBAL(te0)]
+ pand scratch2, [GLOBAL(t1f)]
+ por scratch2, scratch1
+
+ paddsb %2, scratch2 ; ps0 + Filter2
+ pxor %2, [GLOBAL(t80)]
+
+ ; outer tap adjustments
+ paddsb %5, [GLOBAL(t1)]
+ movdqa scratch1, zero
+ pcmpgtb scratch1, %5
+ psrlw %5, 1
+ pand scratch1, [GLOBAL(t80)]
+ pand %5, [GLOBAL(t7f)]
+ por %5, scratch1
+ pand %5, %6 ; vp8_filter &= ~hev
+
+ psubsb %4, %5 ; qs1 - vp8_filter
+ pxor %4, [GLOBAL(t80)]
+
+ paddsb %1, %5 ; ps1 + vp8_filter
+ pxor %1, [GLOBAL(t80)]
+%endmacro
+
+;void vp8_loop_filter_bh_y_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh
+;)
+global sym(vp8_loop_filter_bh_y_sse2) PRIVATE
+sym(vp8_loop_filter_bh_y_sse2):
+
+%if LIBVPX_YASM_WIN64
+ %define src rcx ; src_ptr
+ %define stride rdx ; src_pixel_step
+ %define blimit r8
+ %define limit r9
+ %define thresh r10
+
+ %define spp rax
+ %define stride3 r11
+ %define stride5 r12
+ %define stride7 r13
+
+ push rbp
+ mov rbp, rsp
+ SAVE_XMM 11
+ push r12
+ push r13
+ mov thresh, arg(4)
+%else
+ %define src rdi ; src_ptr
+ %define stride rsi ; src_pixel_step
+ %define blimit rdx
+ %define limit rcx
+ %define thresh r8
+
+ %define spp rax
+ %define stride3 r9
+ %define stride5 r10
+ %define stride7 r11
+%endif
+
+ %define scratch1 xmm5
+ %define scratch2 xmm6
+ %define zero xmm7
+
+ %define i0 [src]
+ %define i1 [spp]
+ %define i2 [src + 2 * stride]
+ %define i3 [spp + 2 * stride]
+ %define i4 [src + 4 * stride]
+ %define i5 [spp + 4 * stride]
+ %define i6 [src + 2 * stride3]
+ %define i7 [spp + 2 * stride3]
+ %define i8 [src + 8 * stride]
+ %define i9 [spp + 8 * stride]
+ %define i10 [src + 2 * stride5]
+ %define i11 [spp + 2 * stride5]
+ %define i12 [src + 4 * stride3]
+ %define i13 [spp + 4 * stride3]
+ %define i14 [src + 2 * stride7]
+ %define i15 [spp + 2 * stride7]
+
+ ; prep work
+ lea spp, [src + stride]
+ lea stride3, [stride + 2 * stride]
+ lea stride5, [stride3 + 2 * stride]
+ lea stride7, [stride3 + 4 * stride]
+ pxor zero, zero
+
+ ; load the first set into registers
+ movdqa xmm0, i0
+ movdqa xmm1, i1
+ movdqa xmm2, i2
+ movdqa xmm3, i3
+ movdqa xmm4, i4
+ movdqa xmm8, i5
+ movdqa xmm9, i6 ; q2, will contain abs(p1-p0)
+ movdqa xmm10, i7
+LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10
+
+ movdqa xmm1, i2
+ movdqa xmm2, i3
+ movdqa xmm3, i4
+ movdqa xmm8, i5
+LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4
+ movdqa i2, xmm1
+ movdqa i3, xmm2
+
+; second set
+ movdqa i4, xmm3
+ movdqa i5, xmm8
+
+ movdqa xmm0, i6
+ movdqa xmm1, i7
+ movdqa xmm2, i8
+ movdqa xmm4, i9
+ movdqa xmm10, i10 ; q2, will contain abs(p1-p0)
+ movdqa xmm11, i11
+LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9
+
+ movdqa xmm0, i6
+ movdqa xmm1, i7
+ movdqa xmm4, i8
+ movdqa xmm8, i9
+LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
+ movdqa i6, xmm0
+ movdqa i7, xmm1
+
+; last set
+ movdqa i8, xmm4
+ movdqa i9, xmm8
+
+ movdqa xmm0, i10
+ movdqa xmm1, i11
+ movdqa xmm2, i12
+ movdqa xmm3, i13
+ movdqa xmm9, i14 ; q2, will contain abs(p1-p0)
+ movdqa xmm11, i15
+LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10
+
+ movdqa xmm0, i10
+ movdqa xmm1, i11
+ movdqa xmm3, i12
+ movdqa xmm8, i13
+LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
+ movdqa i10, xmm0
+ movdqa i11, xmm1
+ movdqa i12, xmm3
+ movdqa i13, xmm8
+
+%if LIBVPX_YASM_WIN64
+ pop r13
+ pop r12
+ RESTORE_XMM
+ pop rbp
+%endif
+
+ ret
+
+
+;void vp8_loop_filter_bv_y_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh
+;)
+
+global sym(vp8_loop_filter_bv_y_sse2) PRIVATE
+sym(vp8_loop_filter_bv_y_sse2):
+
+%if LIBVPX_YASM_WIN64
+ %define src rcx ; src_ptr
+ %define stride rdx ; src_pixel_step
+ %define blimit r8
+ %define limit r9
+ %define thresh r10
+
+ %define spp rax
+ %define stride3 r11
+ %define stride5 r12
+ %define stride7 r13
+
+ push rbp
+ mov rbp, rsp
+ SAVE_XMM 15
+ push r12
+ push r13
+ mov thresh, arg(4)
+%else
+ %define src rdi
+ %define stride rsi
+ %define blimit rdx
+ %define limit rcx
+ %define thresh r8
+
+ %define spp rax
+ %define stride3 r9
+ %define stride5 r10
+ %define stride7 r11
+%endif
+
+ %define scratch1 xmm5
+ %define scratch2 xmm6
+ %define zero xmm7
+
+ %define s0 [src]
+ %define s1 [spp]
+ %define s2 [src + 2 * stride]
+ %define s3 [spp + 2 * stride]
+ %define s4 [src + 4 * stride]
+ %define s5 [spp + 4 * stride]
+ %define s6 [src + 2 * stride3]
+ %define s7 [spp + 2 * stride3]
+ %define s8 [src + 8 * stride]
+ %define s9 [spp + 8 * stride]
+ %define s10 [src + 2 * stride5]
+ %define s11 [spp + 2 * stride5]
+ %define s12 [src + 4 * stride3]
+ %define s13 [spp + 4 * stride3]
+ %define s14 [src + 2 * stride7]
+ %define s15 [spp + 2 * stride7]
+
+ %define i0 [rsp]
+ %define i1 [rsp + 16]
+ %define i2 [rsp + 32]
+ %define i3 [rsp + 48]
+ %define i4 [rsp + 64]
+ %define i5 [rsp + 80]
+ %define i6 [rsp + 96]
+ %define i7 [rsp + 112]
+ %define i8 [rsp + 128]
+ %define i9 [rsp + 144]
+ %define i10 [rsp + 160]
+ %define i11 [rsp + 176]
+ %define i12 [rsp + 192]
+ %define i13 [rsp + 208]
+ %define i14 [rsp + 224]
+ %define i15 [rsp + 240]
+
+ ALIGN_STACK 16, rax
+
+ ; reserve stack space
+ %define temp_storage 0 ; size is 256 (16*16)
+ %define stack_size 256
+ sub rsp, stack_size
+
+ ; prep work
+ lea spp, [src + stride]
+ lea stride3, [stride + 2 * stride]
+ lea stride5, [stride3 + 2 * stride]
+ lea stride7, [stride3 + 4 * stride]
+
+ ; 8-f
+ movdqa xmm0, s8
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, s9 ; 80 90
+ punpckhbw xmm1, s9 ; 88 98
+
+ movdqa xmm2, s10
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, s11 ; a0 b0
+ punpckhbw xmm3, s11 ; a8 b8
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm2 ; 80 90 a0 b0
+ punpckhwd xmm4, xmm2 ; 84 94 a4 b4
+
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm3 ; 88 98 a8 b8
+ punpckhwd xmm2, xmm3 ; 8c 9c ac bc
+
+ ; using xmm[0124]
+ ; work on next 4 rows
+
+ movdqa xmm3, s12
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, s13 ; c0 d0
+ punpckhbw xmm5, s13 ; c8 d8
+
+ movdqa xmm6, s14
+ movdqa xmm7, xmm6
+ punpcklbw xmm6, s15 ; e0 f0
+ punpckhbw xmm7, s15 ; e8 f8
+
+ movdqa xmm8, xmm3
+ punpcklwd xmm3, xmm6 ; c0 d0 e0 f0
+ punpckhwd xmm8, xmm6 ; c4 d4 e4 f4
+
+ movdqa xmm6, xmm5
+ punpcklwd xmm5, xmm7 ; c8 d8 e8 f8
+ punpckhwd xmm6, xmm7 ; cc dc ec fc
+
+ ; pull the third and fourth sets together
+
+ movdqa xmm7, xmm0
+ punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0
+ punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2
+
+ movdqa xmm3, xmm4
+ punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4
+ punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6
+
+ movdqa xmm8, xmm1
+ punpckldq xmm1, xmm5 ; 88 88 a8 b8 c8 d8 e8 f8
+ punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa
+
+ movdqa xmm5, xmm2
+ punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc
+ punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe
+
+ ; save the calculations. we only have 15 registers ...
+ movdqa i0, xmm0
+ movdqa i1, xmm7
+ movdqa i2, xmm4
+ movdqa i3, xmm3
+ movdqa i4, xmm1
+ movdqa i5, xmm8
+ movdqa i6, xmm2
+ movdqa i7, xmm5
+
+ ; 0-7
+ movdqa xmm0, s0
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, s1 ; 00 10
+ punpckhbw xmm1, s1 ; 08 18
+
+ movdqa xmm2, s2
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, s3 ; 20 30
+ punpckhbw xmm3, s3 ; 28 38
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm2 ; 00 10 20 30
+ punpckhwd xmm4, xmm2 ; 04 14 24 34
+
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm3 ; 08 18 28 38
+ punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c
+
+ ; using xmm[0124]
+ ; work on next 4 rows
+
+ movdqa xmm3, s4
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, s5 ; 40 50
+ punpckhbw xmm5, s5 ; 48 58
+
+ movdqa xmm6, s6
+ movdqa xmm7, xmm6
+ punpcklbw xmm6, s7 ; 60 70
+ punpckhbw xmm7, s7 ; 68 78
+
+ movdqa xmm8, xmm3
+ punpcklwd xmm3, xmm6 ; 40 50 60 70
+ punpckhwd xmm8, xmm6 ; 44 54 64 74
+
+ movdqa xmm6, xmm5
+ punpcklwd xmm5, xmm7 ; 48 58 68 78
+ punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c
+
+ ; pull the first two sets together
+
+ movdqa xmm7, xmm0
+ punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70
+ punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72
+
+ movdqa xmm3, xmm4
+ punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74
+ punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76
+
+ movdqa xmm8, xmm1
+ punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78
+ punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a
+
+ movdqa xmm5, xmm2
+ punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c
+ punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e
+ ; final combination
+
+ movdqa xmm6, xmm0
+ punpcklqdq xmm0, i0
+ punpckhqdq xmm6, i0
+
+ movdqa xmm9, xmm7
+ punpcklqdq xmm7, i1
+ punpckhqdq xmm9, i1
+
+ movdqa xmm10, xmm4
+ punpcklqdq xmm4, i2
+ punpckhqdq xmm10, i2
+
+ movdqa xmm11, xmm3
+ punpcklqdq xmm3, i3
+ punpckhqdq xmm11, i3
+
+ movdqa xmm12, xmm1
+ punpcklqdq xmm1, i4
+ punpckhqdq xmm12, i4
+
+ movdqa xmm13, xmm8
+ punpcklqdq xmm8, i5
+ punpckhqdq xmm13, i5
+
+ movdqa xmm14, xmm2
+ punpcklqdq xmm2, i6
+ punpckhqdq xmm14, i6
+
+ movdqa xmm15, xmm5
+ punpcklqdq xmm5, i7
+ punpckhqdq xmm15, i7
+
+ movdqa i0, xmm0
+ movdqa i1, xmm6
+ movdqa i2, xmm7
+ movdqa i3, xmm9
+ movdqa i4, xmm4
+ movdqa i5, xmm10
+ movdqa i6, xmm3
+ movdqa i7, xmm11
+ movdqa i8, xmm1
+ movdqa i9, xmm12
+ movdqa i10, xmm8
+ movdqa i11, xmm13
+ movdqa i12, xmm2
+ movdqa i13, xmm14
+ movdqa i14, xmm5
+ movdqa i15, xmm15
+
+; TRANSPOSED DATA AVAILABLE ON THE STACK
+
+ movdqa xmm12, xmm6
+ movdqa xmm13, xmm7
+
+ pxor zero, zero
+
+LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11
+
+ movdqa xmm1, i2
+ movdqa xmm2, i3
+ movdqa xmm8, i4
+ movdqa xmm9, i5
+LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4
+ movdqa i2, xmm1
+ movdqa i3, xmm2
+
+; second set
+ movdqa i4, xmm8
+ movdqa i5, xmm9
+
+ movdqa xmm0, i6
+ movdqa xmm1, i7
+ movdqa xmm2, i8
+ movdqa xmm4, i9
+ movdqa xmm10, i10 ; q2, will contain abs(p1-p0)
+ movdqa xmm11, i11
+LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3
+
+ movdqa xmm0, i6
+ movdqa xmm1, i7
+ movdqa xmm3, i8
+ movdqa xmm4, i9
+LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2
+ movdqa i6, xmm0
+ movdqa i7, xmm1
+
+; last set
+ movdqa i8, xmm3
+ movdqa i9, xmm4
+
+ movdqa xmm0, i10
+ movdqa xmm1, i11
+ movdqa xmm2, i12
+ movdqa xmm8, i13
+ movdqa xmm9, i14 ; q2, will contain abs(p1-p0)
+ movdqa xmm11, i15
+LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10
+
+ movdqa xmm0, i10
+ movdqa xmm1, i11
+ movdqa xmm4, i12
+ movdqa xmm8, i13
+LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
+ movdqa i10, xmm0
+ movdqa i11, xmm1
+ movdqa i12, xmm4
+ movdqa i13, xmm8
+
+
+; RESHUFFLE AND WRITE OUT
+ ; 8-f
+ movdqa xmm0, i8
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, i9 ; 80 90
+ punpckhbw xmm1, i9 ; 88 98
+
+ movdqa xmm2, i10
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, i11 ; a0 b0
+ punpckhbw xmm3, i11 ; a8 b8
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm2 ; 80 90 a0 b0
+ punpckhwd xmm4, xmm2 ; 84 94 a4 b4
+
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm3 ; 88 98 a8 b8
+ punpckhwd xmm2, xmm3 ; 8c 9c ac bc
+
+ ; using xmm[0124]
+ ; work on next 4 rows
+
+ movdqa xmm3, i12
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, i13 ; c0 d0
+ punpckhbw xmm5, i13 ; c8 d8
+
+ movdqa xmm6, i14
+ movdqa xmm7, xmm6
+ punpcklbw xmm6, i15 ; e0 f0
+ punpckhbw xmm7, i15 ; e8 f8
+
+ movdqa xmm8, xmm3
+ punpcklwd xmm3, xmm6 ; c0 d0 e0 f0
+ punpckhwd xmm8, xmm6 ; c4 d4 e4 f4
+
+ movdqa xmm6, xmm5
+ punpcklwd xmm5, xmm7 ; c8 d8 e8 f8
+ punpckhwd xmm6, xmm7 ; cc dc ec fc
+
+ ; pull the third and fourth sets together
+
+ movdqa xmm7, xmm0
+ punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0
+ punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2
+
+ movdqa xmm3, xmm4
+ punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4
+ punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6
+
+ movdqa xmm8, xmm1
+ punpckldq xmm1, xmm5 ; 88 88 a8 b8 c8 d8 e8 f8
+ punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa
+
+ movdqa xmm5, xmm2
+ punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc
+ punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe
+
+ ; save the calculations. we only have 15 registers ...
+ movdqa i8, xmm0
+ movdqa i9, xmm7
+ movdqa i10, xmm4
+ movdqa i11, xmm3
+ movdqa i12, xmm1
+ movdqa i13, xmm8
+ movdqa i14, xmm2
+ movdqa i15, xmm5
+
+ ; 0-7
+ movdqa xmm0, i0
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, i1 ; 00 10
+ punpckhbw xmm1, i1 ; 08 18
+
+ movdqa xmm2, i2
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, i3 ; 20 30
+ punpckhbw xmm3, i3 ; 28 38
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm2 ; 00 10 20 30
+ punpckhwd xmm4, xmm2 ; 04 14 24 34
+
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm3 ; 08 18 28 38
+ punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c
+
+ ; using xmm[0124]
+ ; work on next 4 rows
+
+ movdqa xmm3, i4
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, i5 ; 40 50
+ punpckhbw xmm5, i5 ; 48 58
+
+ movdqa xmm6, i6
+ movdqa xmm7, xmm6
+ punpcklbw xmm6, i7 ; 60 70
+ punpckhbw xmm7, i7 ; 68 78
+
+ movdqa xmm8, xmm3
+ punpcklwd xmm3, xmm6 ; 40 50 60 70
+ punpckhwd xmm8, xmm6 ; 44 54 64 74
+
+ movdqa xmm6, xmm5
+ punpcklwd xmm5, xmm7 ; 48 58 68 78
+ punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c
+
+ ; pull the first two sets together
+
+ movdqa xmm7, xmm0
+ punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70
+ punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72
+
+ movdqa xmm3, xmm4
+ punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74
+ punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76
+
+ movdqa xmm8, xmm1
+ punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78
+ punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a
+
+ movdqa xmm5, xmm2
+ punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c
+ punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e
+ ; final combination
+
+ movdqa xmm6, xmm0
+ punpcklqdq xmm0, i8
+ punpckhqdq xmm6, i8
+
+ movdqa xmm9, xmm7
+ punpcklqdq xmm7, i9
+ punpckhqdq xmm9, i9
+
+ movdqa xmm10, xmm4
+ punpcklqdq xmm4, i10
+ punpckhqdq xmm10, i10
+
+ movdqa xmm11, xmm3
+ punpcklqdq xmm3, i11
+ punpckhqdq xmm11, i11
+
+ movdqa xmm12, xmm1
+ punpcklqdq xmm1, i12
+ punpckhqdq xmm12, i12
+
+ movdqa xmm13, xmm8
+ punpcklqdq xmm8, i13
+ punpckhqdq xmm13, i13
+
+ movdqa xmm14, xmm2
+ punpcklqdq xmm2, i14
+ punpckhqdq xmm14, i14
+
+ movdqa xmm15, xmm5
+ punpcklqdq xmm5, i15
+ punpckhqdq xmm15, i15
+
+ movdqa s0, xmm0
+ movdqa s1, xmm6
+ movdqa s2, xmm7
+ movdqa s3, xmm9
+ movdqa s4, xmm4
+ movdqa s5, xmm10
+ movdqa s6, xmm3
+ movdqa s7, xmm11
+ movdqa s8, xmm1
+ movdqa s9, xmm12
+ movdqa s10, xmm8
+ movdqa s11, xmm13
+ movdqa s12, xmm2
+ movdqa s13, xmm14
+ movdqa s14, xmm5
+ movdqa s15, xmm15
+
+ ; free stack space
+ add rsp, stack_size
+
+ ; un-ALIGN_STACK
+ pop rsp
+
+%if LIBVPX_YASM_WIN64
+ pop r13
+ pop r12
+ RESTORE_XMM
+ pop rbp
+%endif
+
+ ret
+
+SECTION_RODATA
+align 16
+te0:
+ times 16 db 0xe0
+align 16
+t7f:
+ times 16 db 0x7f
+align 16
+tfe:
+ times 16 db 0xfe
+align 16
+t1f:
+ times 16 db 0x1f
+align 16
+t80:
+ times 16 db 0x80
+align 16
+t1:
+ times 16 db 0x01
+align 16
+t3:
+ times 16 db 0x03
+align 16
+t4:
+ times 16 db 0x04
diff --git a/thirdparty/libvpx/vp8/common/x86/loopfilter_sse2.asm b/thirdparty/libvpx/vp8/common/x86/loopfilter_sse2.asm
new file mode 100644
index 0000000000..1913abc69b
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/x86/loopfilter_sse2.asm
@@ -0,0 +1,1640 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+%define _t0 0
+%define _t1 _t0 + 16
+%define _p3 _t1 + 16
+%define _p2 _p3 + 16
+%define _p1 _p2 + 16
+%define _p0 _p1 + 16
+%define _q0 _p0 + 16
+%define _q1 _q0 + 16
+%define _q2 _q1 + 16
+%define _q3 _q2 + 16
+%define lf_var_size 160
+
+; Use of pmaxub instead of psubusb to compute filter mask was seen
+; in ffvp8
+
+%macro LFH_FILTER_AND_HEV_MASK 1
+%if %1
+ movdqa xmm2, [rdi+2*rax] ; q3
+ movdqa xmm1, [rsi+2*rax] ; q2
+ movdqa xmm4, [rsi+rax] ; q1
+ movdqa xmm5, [rsi] ; q0
+ neg rax ; negate pitch to deal with above border
+%else
+ movlps xmm2, [rsi + rcx*2] ; q3
+ movlps xmm1, [rsi + rcx] ; q2
+ movlps xmm4, [rsi] ; q1
+ movlps xmm5, [rsi + rax] ; q0
+
+ movhps xmm2, [rdi + rcx*2]
+ movhps xmm1, [rdi + rcx]
+ movhps xmm4, [rdi]
+ movhps xmm5, [rdi + rax]
+
+ lea rsi, [rsi + rax*4]
+ lea rdi, [rdi + rax*4]
+
+ movdqa [rsp+_q2], xmm1 ; store q2
+ movdqa [rsp+_q1], xmm4 ; store q1
+%endif
+ movdqa xmm7, [rdx] ;limit
+
+ movdqa xmm6, xmm1 ; q2
+ movdqa xmm3, xmm4 ; q1
+
+ psubusb xmm1, xmm2 ; q2-=q3
+ psubusb xmm2, xmm6 ; q3-=q2
+
+ psubusb xmm4, xmm6 ; q1-=q2
+ psubusb xmm6, xmm3 ; q2-=q1
+
+ por xmm4, xmm6 ; abs(q2-q1)
+ por xmm1, xmm2 ; abs(q3-q2)
+
+ movdqa xmm0, xmm5 ; q0
+ pmaxub xmm1, xmm4
+
+ psubusb xmm5, xmm3 ; q0-=q1
+ psubusb xmm3, xmm0 ; q1-=q0
+
+ por xmm5, xmm3 ; abs(q0-q1)
+ movdqa [rsp+_t0], xmm5 ; save to t0
+
+ pmaxub xmm1, xmm5
+
+%if %1
+ movdqa xmm2, [rsi+4*rax] ; p3
+ movdqa xmm4, [rdi+4*rax] ; p2
+ movdqa xmm6, [rsi+2*rax] ; p1
+%else
+ movlps xmm2, [rsi + rax] ; p3
+ movlps xmm4, [rsi] ; p2
+ movlps xmm6, [rsi + rcx] ; p1
+
+ movhps xmm2, [rdi + rax]
+ movhps xmm4, [rdi]
+ movhps xmm6, [rdi + rcx]
+
+ movdqa [rsp+_p2], xmm4 ; store p2
+ movdqa [rsp+_p1], xmm6 ; store p1
+%endif
+
+ movdqa xmm5, xmm4 ; p2
+ movdqa xmm3, xmm6 ; p1
+
+ psubusb xmm4, xmm2 ; p2-=p3
+ psubusb xmm2, xmm5 ; p3-=p2
+
+ psubusb xmm3, xmm5 ; p1-=p2
+ pmaxub xmm1, xmm4 ; abs(p3 - p2)
+
+ psubusb xmm5, xmm6 ; p2-=p1
+ pmaxub xmm1, xmm2 ; abs(p3 - p2)
+
+ pmaxub xmm1, xmm5 ; abs(p2 - p1)
+ movdqa xmm2, xmm6 ; p1
+
+ pmaxub xmm1, xmm3 ; abs(p2 - p1)
+%if %1
+ movdqa xmm4, [rsi+rax] ; p0
+ movdqa xmm3, [rdi] ; q1
+%else
+ movlps xmm4, [rsi + rcx*2] ; p0
+ movhps xmm4, [rdi + rcx*2]
+ movdqa xmm3, [rsp+_q1] ; q1
+%endif
+
+ movdqa xmm5, xmm4 ; p0
+ psubusb xmm4, xmm6 ; p0-=p1
+
+ psubusb xmm6, xmm5 ; p1-=p0
+
+ por xmm6, xmm4 ; abs(p1 - p0)
+ mov rdx, arg(2) ; get blimit
+
+ movdqa [rsp+_t1], xmm6 ; save to t1
+
+ movdqa xmm4, xmm3 ; q1
+ pmaxub xmm1, xmm6
+
+ psubusb xmm3, xmm2 ; q1-=p1
+ psubusb xmm2, xmm4 ; p1-=q1
+
+ psubusb xmm1, xmm7
+ por xmm2, xmm3 ; abs(p1-q1)
+
+ movdqa xmm7, [rdx] ; blimit
+ mov rdx, arg(4) ; hev get thresh
+
+ movdqa xmm3, xmm0 ; q0
+ pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
+
+ movdqa xmm6, xmm5 ; p0
+ psrlw xmm2, 1 ; abs(p1-q1)/2
+
+ psubusb xmm5, xmm3 ; p0-=q0
+ psubusb xmm3, xmm6 ; q0-=p0
+ por xmm5, xmm3 ; abs(p0 - q0)
+
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2
+
+ movdqa xmm4, [rsp+_t0] ; hev get abs (q1 - q0)
+ movdqa xmm3, [rsp+_t1] ; get abs (p1 - p0)
+
+ paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ movdqa xmm2, [rdx] ; hev
+
+ psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ psubusb xmm4, xmm2 ; hev
+
+ psubusb xmm3, xmm2 ; hev
+ por xmm1, xmm5
+
+ pxor xmm7, xmm7
+ paddb xmm4, xmm3 ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+ pcmpeqb xmm4, xmm5 ; hev
+ pcmpeqb xmm3, xmm3 ; hev
+
+ pcmpeqb xmm1, xmm7 ; mask xmm1
+ pxor xmm4, xmm3 ; hev
+%endmacro
+
+%macro B_FILTER 1
+ movdqa xmm3, [GLOBAL(t80)]
+%if %1 == 0
+ movdqa xmm2, [rsp+_p1] ; p1
+ movdqa xmm7, [rsp+_q1] ; q1
+%elif %1 == 1
+ movdqa xmm2, [rsi+2*rax] ; p1
+ movdqa xmm7, [rdi] ; q1
+%elif %1 == 2
+ movdqa xmm2, [rsp+_p1] ; p1
+ movdqa xmm6, [rsp+_p0] ; p0
+ movdqa xmm0, [rsp+_q0] ; q0
+ movdqa xmm7, [rsp+_q1] ; q1
+%endif
+
+ pxor xmm2, xmm3 ; p1 offset to convert to signed values
+ pxor xmm7, xmm3 ; q1 offset to convert to signed values
+
+ psubsb xmm2, xmm7 ; p1 - q1
+ pxor xmm6, xmm3 ; offset to convert to signed values
+
+ pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1)
+ pxor xmm0, xmm3 ; offset to convert to signed values
+
+ movdqa xmm3, xmm0 ; q0
+ psubsb xmm0, xmm6 ; q0 - p0
+ paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+ paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+ paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+ pand xmm1, xmm2 ; mask filter values we don't care about
+
+ movdqa xmm2, xmm1
+ paddsb xmm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+ paddsb xmm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+ punpckhbw xmm5, xmm2 ; axbxcxdx
+ punpcklbw xmm2, xmm2 ; exfxgxhx
+
+ punpcklbw xmm0, xmm1 ; exfxgxhx
+ psraw xmm5, 11 ; sign extended shift right by 3
+
+ punpckhbw xmm1, xmm1 ; axbxcxdx
+ psraw xmm2, 11 ; sign extended shift right by 3
+
+ packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+ psraw xmm0, 11 ; sign extended shift right by 3
+
+ psraw xmm1, 11 ; sign extended shift right by 3
+ movdqa xmm5, xmm0 ; save results
+
+ packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+
+ paddsb xmm6, xmm2 ; p0+= p0 add
+
+ movdqa xmm2, [GLOBAL(ones)]
+ paddsw xmm5, xmm2
+ paddsw xmm1, xmm2
+ psraw xmm5, 1 ; partial shifted one more time for 2nd tap
+ psraw xmm1, 1 ; partial shifted one more time for 2nd tap
+ packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+ movdqa xmm2, [GLOBAL(t80)]
+
+%if %1 == 0
+ movdqa xmm1, [rsp+_p1] ; p1
+ lea rsi, [rsi + rcx*2]
+ lea rdi, [rdi + rcx*2]
+%elif %1 == 1
+ movdqa xmm1, [rsi+2*rax] ; p1
+%elif %1 == 2
+ movdqa xmm1, [rsp+_p1] ; p1
+%endif
+
+ pandn xmm4, xmm5 ; high edge variance additive
+ pxor xmm6, xmm2 ; unoffset
+
+ pxor xmm1, xmm2 ; reoffset
+ psubsb xmm3, xmm0 ; q0-= q0 add
+
+ paddsb xmm1, xmm4 ; p1+= p1 add
+ pxor xmm3, xmm2 ; unoffset
+
+ pxor xmm1, xmm2 ; unoffset
+ psubsb xmm7, xmm4 ; q1-= q1 add
+
+ pxor xmm7, xmm2 ; unoffset
+%if %1 == 0
+ movq [rsi], xmm6 ; p0
+ movhps [rdi], xmm6
+ movq [rsi + rax], xmm1 ; p1
+ movhps [rdi + rax], xmm1
+ movq [rsi + rcx], xmm3 ; q0
+ movhps [rdi + rcx], xmm3
+ movq [rsi + rcx*2], xmm7 ; q1
+ movhps [rdi + rcx*2], xmm7
+%elif %1 == 1
+ movdqa [rsi+rax], xmm6 ; write back
+ movdqa [rsi+2*rax], xmm1 ; write back
+ movdqa [rsi], xmm3 ; write back
+ movdqa [rdi], xmm7 ; write back
+%endif
+
+%endmacro
+
+%if ABI_IS_32BIT
+
+;void vp8_loop_filter_horizontal_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+;)
+global sym(vp8_loop_filter_horizontal_edge_sse2) PRIVATE
+sym(vp8_loop_filter_horizontal_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step
+
+ mov rdx, arg(3) ;limit
+
+ lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing
+
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 1
+ ; filter and write back the result
+ B_FILTER 1
+
+ add rsp, lf_var_size
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%endif
+
+;void vp8_loop_filter_horizontal_edge_uv_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_loop_filter_horizontal_edge_uv_sse2) PRIVATE
+sym(vp8_loop_filter_horizontal_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size
+
+ mov rsi, arg(0) ; u
+ mov rdi, arg(5) ; v
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+ mov rcx, rax
+ neg rax ; negate pitch to deal with above border
+
+ mov rdx, arg(3) ;limit
+
+ lea rsi, [rsi + rcx]
+ lea rdi, [rdi + rcx]
+
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 0
+ ; filter and write back the result
+ B_FILTER 0
+
+ add rsp, lf_var_size
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+%macro MB_FILTER_AND_WRITEBACK 1
+ movdqa xmm3, [GLOBAL(t80)]
+%if %1 == 0
+ movdqa xmm2, [rsp+_p1] ; p1
+ movdqa xmm7, [rsp+_q1] ; q1
+%elif %1 == 1
+ movdqa xmm2, [rsi+2*rax] ; p1
+ movdqa xmm7, [rdi] ; q1
+
+ mov rcx, rax
+ neg rcx
+%elif %1 == 2
+ movdqa xmm2, [rsp+_p1] ; p1
+ movdqa xmm6, [rsp+_p0] ; p0
+ movdqa xmm0, [rsp+_q0] ; q0
+ movdqa xmm7, [rsp+_q1] ; q1
+%endif
+
+ pxor xmm2, xmm3 ; p1 offset to convert to signed values
+ pxor xmm7, xmm3 ; q1 offset to convert to signed values
+ pxor xmm6, xmm3 ; offset to convert to signed values
+ pxor xmm0, xmm3 ; offset to convert to signed values
+
+ psubsb xmm2, xmm7 ; p1 - q1
+
+ movdqa xmm3, xmm0 ; q0
+ psubsb xmm0, xmm6 ; q0 - p0
+ paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1)
+ paddsb xmm2, xmm0 ; 2 * (q0 - p0)
+ paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1)
+ pand xmm1, xmm2 ; mask filter values we don't care about
+
+ movdqa xmm2, xmm1 ; vp8_filter
+
+ pand xmm2, xmm4 ; Filter2 = vp8_filter & hev
+ pxor xmm0, xmm0
+
+ pandn xmm4, xmm1 ; vp8_filter&=~hev
+ pxor xmm1, xmm1
+
+ punpcklbw xmm0, xmm4 ; Filter 2 (hi)
+ punpckhbw xmm1, xmm4 ; Filter 2 (lo)
+
+ movdqa xmm5, xmm2
+
+ movdqa xmm4, [GLOBAL(s9)]
+ paddsb xmm5, [GLOBAL(t3)] ; vp8_signed_char_clamp(Filter2 + 3)
+ paddsb xmm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
+
+ pmulhw xmm1, xmm4 ; Filter 2 (lo) * 9
+ pmulhw xmm0, xmm4 ; Filter 2 (hi) * 9
+
+ punpckhbw xmm7, xmm5 ; axbxcxdx
+ punpcklbw xmm5, xmm5 ; exfxgxhx
+
+ psraw xmm7, 11 ; sign extended shift right by 3
+
+ psraw xmm5, 11 ; sign extended shift right by 3
+ punpckhbw xmm4, xmm2 ; axbxcxdx
+
+ punpcklbw xmm2, xmm2 ; exfxgxhx
+ psraw xmm4, 11 ; sign extended shift right by 3
+
+ packsswb xmm5, xmm7 ; Filter2 >>=3;
+ psraw xmm2, 11 ; sign extended shift right by 3
+
+ packsswb xmm2, xmm4 ; Filter1 >>=3;
+
+ paddsb xmm6, xmm5 ; ps0 =ps0 + Fitler2
+
+ psubsb xmm3, xmm2 ; qs0 =qs0 - Filter1
+ movdqa xmm7, xmm1
+
+ movdqa xmm4, [GLOBAL(s63)]
+ movdqa xmm5, xmm0
+ movdqa xmm2, xmm5
+ paddw xmm0, xmm4 ; Filter 2 (hi) * 9 + 63
+ paddw xmm1, xmm4 ; Filter 2 (lo) * 9 + 63
+ movdqa xmm4, xmm7
+
+ paddw xmm5, xmm5 ; Filter 2 (hi) * 18
+
+ paddw xmm7, xmm7 ; Filter 2 (lo) * 18
+ paddw xmm5, xmm0 ; Filter 2 (hi) * 27 + 63
+
+ paddw xmm7, xmm1 ; Filter 2 (lo) * 27 + 63
+ paddw xmm2, xmm0 ; Filter 2 (hi) * 18 + 63
+ psraw xmm0, 7 ; (Filter 2 (hi) * 9 + 63) >> 7
+
+ paddw xmm4, xmm1 ; Filter 2 (lo) * 18 + 63
+ psraw xmm1, 7 ; (Filter 2 (lo) * 9 + 63) >> 7
+ psraw xmm2, 7 ; (Filter 2 (hi) * 18 + 63) >> 7
+
+ packsswb xmm0, xmm1 ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+
+ psraw xmm4, 7 ; (Filter 2 (lo) * 18 + 63) >> 7
+ psraw xmm5, 7 ; (Filter 2 (hi) * 27 + 63) >> 7
+ psraw xmm7, 7 ; (Filter 2 (lo) * 27 + 63) >> 7
+
+ packsswb xmm5, xmm7 ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+ packsswb xmm2, xmm4 ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+ movdqa xmm7, [GLOBAL(t80)]
+
+%if %1 == 0
+ movdqa xmm1, [rsp+_q1] ; q1
+ movdqa xmm4, [rsp+_p1] ; p1
+ lea rsi, [rsi+rcx*2]
+ lea rdi, [rdi+rcx*2]
+
+%elif %1 == 1
+ movdqa xmm1, [rdi] ; q1
+ movdqa xmm4, [rsi+rax*2] ; p1
+%elif %1 == 2
+ movdqa xmm4, [rsp+_p1] ; p1
+ movdqa xmm1, [rsp+_q1] ; q1
+%endif
+
+ pxor xmm1, xmm7
+ pxor xmm4, xmm7
+
+ psubsb xmm3, xmm5 ; sq = vp8_signed_char_clamp(qs0 - u3)
+ paddsb xmm6, xmm5 ; sp = vp8_signed_char_clamp(ps0 - u3)
+ psubsb xmm1, xmm2 ; sq = vp8_signed_char_clamp(qs1 - u2)
+ paddsb xmm4, xmm2 ; sp = vp8_signed_char_clamp(ps1 - u2)
+
+%if %1 == 1
+ movdqa xmm2, [rdi+rax*4] ; p2
+ movdqa xmm5, [rdi+rcx] ; q2
+%else
+ movdqa xmm2, [rsp+_p2] ; p2
+ movdqa xmm5, [rsp+_q2] ; q2
+%endif
+
+ pxor xmm1, xmm7 ; *oq1 = sq^0x80;
+ pxor xmm4, xmm7 ; *op1 = sp^0x80;
+ pxor xmm2, xmm7
+ pxor xmm5, xmm7
+ paddsb xmm2, xmm0 ; sp = vp8_signed_char_clamp(ps2 - u)
+ psubsb xmm5, xmm0 ; sq = vp8_signed_char_clamp(qs2 - u)
+ pxor xmm2, xmm7 ; *op2 = sp^0x80;
+ pxor xmm5, xmm7 ; *oq2 = sq^0x80;
+ pxor xmm3, xmm7 ; *oq0 = sq^0x80
+ pxor xmm6, xmm7 ; *oq0 = sp^0x80
+%if %1 == 0
+ movq [rsi], xmm6 ; p0
+ movhps [rdi], xmm6
+ movq [rsi + rcx], xmm3 ; q0
+ movhps [rdi + rcx], xmm3
+ lea rdx, [rcx + rcx*2]
+ movq [rsi+rcx*2], xmm1 ; q1
+ movhps [rdi+rcx*2], xmm1
+
+ movq [rsi + rax], xmm4 ; p1
+ movhps [rdi + rax], xmm4
+
+ movq [rsi+rax*2], xmm2 ; p2
+ movhps [rdi+rax*2], xmm2
+
+ movq [rsi+rdx], xmm5 ; q2
+ movhps [rdi+rdx], xmm5
+%elif %1 == 1
+ movdqa [rdi+rcx], xmm5 ; q2
+ movdqa [rdi], xmm1 ; q1
+ movdqa [rsi], xmm3 ; q0
+ movdqa [rsi+rax ], xmm6 ; p0
+ movdqa [rsi+rax*2], xmm4 ; p1
+ movdqa [rdi+rax*4], xmm2 ; p2
+%elif %1 == 2
+ movdqa [rsp+_p1], xmm4 ; p1
+ movdqa [rsp+_p0], xmm6 ; p0
+ movdqa [rsp+_q0], xmm3 ; q0
+ movdqa [rsp+_q1], xmm1 ; q1
+%endif
+
+%endmacro
+
+
+;void vp8_mbloop_filter_horizontal_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+;)
+global sym(vp8_mbloop_filter_horizontal_edge_sse2) PRIVATE
+sym(vp8_mbloop_filter_horizontal_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step
+ mov rdx, arg(3) ;limit
+
+ lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing
+
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 1
+ ; filter and write back the results
+ MB_FILTER_AND_WRITEBACK 1
+
+ add rsp, lf_var_size
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_mbloop_filter_horizontal_edge_uv_sse2
+;(
+; unsigned char *u,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;)
+global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2) PRIVATE
+sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size
+
+ mov rsi, arg(0) ; u
+ mov rdi, arg(5) ; v
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+ mov rcx, rax
+ neg rax ; negate pitch to deal with above border
+ mov rdx, arg(3) ;limit
+
+ lea rsi, [rsi + rcx]
+ lea rdi, [rdi + rcx]
+
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 0
+ ; filter and write back the results
+ MB_FILTER_AND_WRITEBACK 0
+
+ add rsp, lf_var_size
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+%macro TRANSPOSE_16X8 2
+ movq xmm4, [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
+ movq xmm1, [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
+ movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
+ movq xmm7, [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
+ movq xmm5, [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
+ movq xmm2, [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
+
+ punpcklbw xmm4, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+
+ movq xmm1, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
+
+ movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+ punpcklbw xmm0, xmm7 ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20
+
+ movq xmm7, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
+
+ punpcklbw xmm5, xmm2 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+%if %1
+ lea rsi, [rsi+rax*8]
+ lea rdi, [rdi+rax*8]
+%else
+ mov rsi, arg(5) ; v_ptr
+%endif
+
+ movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+ punpcklbw xmm7, xmm1 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
+ punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+ punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
+ punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+
+%if %1 == 0
+ lea rdi, [rsi + rax - 4] ; rdi points to row +1 for indirect addressing
+ lea rsi, [rsi - 4]
+%endif
+
+ movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+ punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+
+ movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+ punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+ punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+
+ punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+
+ punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+
+ movdqa [rsp+_t0], xmm2 ; save to free XMM2
+
+ movq xmm2, [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
+ movq xmm6, [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
+ movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
+ movq xmm5, [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
+ movq xmm1, [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
+
+ punpcklbw xmm2, xmm6 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+ movq xmm6, [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
+
+ punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
+
+ movq xmm5, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
+
+ punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0
+
+ movq xmm6, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
+
+ punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
+
+ movdqa xmm6, xmm1 ;
+ punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
+
+ punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+ movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+ punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+
+ punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+ movdqa xmm0, xmm5
+ punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+
+ punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+ movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+ punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
+
+ punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
+ movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+
+ punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+
+ punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
+
+%if %2 == 0
+ movdqa [rsp+_q3], xmm7 ; save 7
+ movdqa [rsp+_q2], xmm6 ; save 6
+%endif
+ movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+ punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ movdqa [rsp+_p1], xmm2 ; save 2
+
+ movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+ punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ movdqa [rsp+_p0], xmm3 ; save 3
+
+ punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+
+ movdqa [rsp+_q0], xmm4 ; save 4
+ movdqa [rsp+_q1], xmm5 ; save 5
+ movdqa xmm1, [rsp+_t0]
+
+ movdqa xmm2, xmm1 ;
+ punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+
+%if %2 == 0
+ movdqa [rsp+_p2], xmm1
+ movdqa [rsp+_p3], xmm2
+%endif
+
+%endmacro
+
+%macro LFV_FILTER_MASK_HEV_MASK 0
+ movdqa xmm0, xmm6 ; q2
+ psubusb xmm0, xmm7 ; q2-q3
+
+ psubusb xmm7, xmm6 ; q3-q2
+ movdqa xmm4, xmm5 ; q1
+
+ por xmm7, xmm0 ; abs (q3-q2)
+ psubusb xmm4, xmm6 ; q1-q2
+
+ movdqa xmm0, xmm1
+ psubusb xmm6, xmm5 ; q2-q1
+
+ por xmm6, xmm4 ; abs (q2-q1)
+ psubusb xmm0, xmm2 ; p2 - p3;
+
+ psubusb xmm2, xmm1 ; p3 - p2;
+ por xmm0, xmm2 ; abs(p2-p3)
+
+ movdqa xmm5, [rsp+_p1] ; p1
+ pmaxub xmm0, xmm7
+
+ movdqa xmm2, xmm5 ; p1
+ psubusb xmm5, xmm1 ; p1-p2
+ psubusb xmm1, xmm2 ; p2-p1
+
+ movdqa xmm7, xmm3 ; p0
+ psubusb xmm7, xmm2 ; p0-p1
+
+ por xmm1, xmm5 ; abs(p2-p1)
+ pmaxub xmm0, xmm6
+
+ pmaxub xmm0, xmm1
+ movdqa xmm1, xmm2 ; p1
+
+ psubusb xmm2, xmm3 ; p1-p0
+
+ por xmm2, xmm7 ; abs(p1-p0)
+
+ pmaxub xmm0, xmm2
+
+ movdqa xmm5, [rsp+_q0] ; q0
+ movdqa xmm7, [rsp+_q1] ; q1
+
+ mov rdx, arg(3) ; limit
+
+ movdqa xmm6, xmm5 ; q0
+ movdqa xmm4, xmm7 ; q1
+
+ psubusb xmm5, xmm7 ; q0-q1
+ psubusb xmm7, xmm6 ; q1-q0
+
+ por xmm7, xmm5 ; abs(q1-q0)
+
+ pmaxub xmm0, xmm7
+
+ psubusb xmm0, [rdx] ; limit
+
+ mov rdx, arg(2) ; blimit
+ movdqa xmm5, xmm4 ; q1
+
+ psubusb xmm5, xmm1 ; q1-=p1
+ psubusb xmm1, xmm4 ; p1-=q1
+
+ por xmm5, xmm1 ; abs(p1-q1)
+ movdqa xmm1, xmm3 ; p0
+
+ pand xmm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psubusb xmm1, xmm6 ; p0-q0
+
+ movdqa xmm4, [rdx] ; blimit
+ mov rdx, arg(4) ; get thresh
+
+ psrlw xmm5, 1 ; abs(p1-q1)/2
+ psubusb xmm6, xmm3 ; q0-p0
+
+ por xmm1, xmm6 ; abs(q0-p0)
+ paddusb xmm1, xmm1 ; abs(q0-p0)*2
+ movdqa xmm3, [rdx]
+
+ paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+ psubusb xmm2, xmm3 ; abs(q1 - q0) > thresh
+
+ psubusb xmm7, xmm3 ; abs(p1 - p0)> thresh
+
+ psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ por xmm2, xmm7 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+ por xmm1, xmm0 ; mask
+ pcmpeqb xmm2, xmm0
+
+ pxor xmm0, xmm0
+ pcmpeqb xmm4, xmm4
+
+ pcmpeqb xmm1, xmm0
+ pxor xmm4, xmm2
+%endmacro
+
+%macro BV_TRANSPOSE 0
+ ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+ movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+
+ movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+
+ punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
+
+ movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+
+ punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+ movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ punpcklwd xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+
+ punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+ ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+ ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+ ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+ ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+%endmacro
+
+%macro BV_WRITEBACK 2
+ movd [rsi+2], %1
+ movd [rsi+4*rax+2], %2
+ psrldq %1, 4
+ psrldq %2, 4
+ movd [rdi+2], %1
+ movd [rdi+4*rax+2], %2
+ psrldq %1, 4
+ psrldq %2, 4
+ movd [rsi+2*rax+2], %1
+ movd [rsi+2*rcx+2], %2
+ psrldq %1, 4
+ psrldq %2, 4
+ movd [rdi+2*rax+2], %1
+ movd [rdi+2*rcx+2], %2
+%endmacro
+
+%if ABI_IS_32BIT
+
+;void vp8_loop_filter_vertical_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+;)
+global sym(vp8_loop_filter_vertical_edge_sse2) PRIVATE
+sym(vp8_loop_filter_vertical_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size
+
+ mov rsi, arg(0) ; src_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ lea rcx, [rax*2+rax]
+
+ ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+ TRANSPOSE_16X8 1, 1
+
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK
+
+ ; start work on filters
+ B_FILTER 2
+
+ ; transpose and write back - only work on q1, q0, p0, p1
+ BV_TRANSPOSE
+ ; store 16-line result
+
+ lea rdx, [rax]
+ neg rdx
+
+ BV_WRITEBACK xmm1, xmm5
+
+ lea rsi, [rsi+rdx*8]
+ lea rdi, [rdi+rdx*8]
+ BV_WRITEBACK xmm2, xmm6
+
+ add rsp, lf_var_size
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%endif
+
+;void vp8_loop_filter_vertical_edge_uv_sse2
+;(
+; unsigned char *u,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;)
+global sym(vp8_loop_filter_vertical_edge_uv_sse2) PRIVATE
+sym(vp8_loop_filter_vertical_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size
+
+ mov rsi, arg(0) ; u_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ lea rcx, [rax+2*rax]
+
+ ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+ TRANSPOSE_16X8 0, 1
+
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK
+
+ ; start work on filters
+ B_FILTER 2
+
+ ; transpose and write back - only work on q1, q0, p0, p1
+ BV_TRANSPOSE
+
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+
+ ; store 16-line result
+ BV_WRITEBACK xmm1, xmm5
+
+ mov rsi, arg(0) ; u_ptr
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ BV_WRITEBACK xmm2, xmm6
+
+ add rsp, lf_var_size
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%macro MBV_TRANSPOSE 0
+ movdqa xmm0, [rsp+_p3] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+
+ punpcklbw xmm0, xmm2 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpckhbw xmm1, xmm2 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+ movdqa xmm7, [rsp+_p1] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ movdqa xmm6, xmm7 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+
+ punpcklbw xmm7, [rsp+_p0] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpckhbw xmm6, [rsp+_p0] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ movdqa xmm3, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpcklwd xmm0, xmm7 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+
+ punpckhwd xmm3, xmm7 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+ movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+ punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ punpckhwd xmm4, xmm6 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+
+ movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpcklbw xmm7, [rsp+_q1] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+
+ movdqa xmm6, xmm5 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+ punpcklbw xmm6, [rsp+_q3] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06
+
+ movdqa xmm2, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+ punpcklwd xmm7, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04
+
+ punpckhwd xmm2, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
+ movdqa xmm6, xmm0 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+
+ punpckldq xmm0, xmm7 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
+ punpckhdq xmm6, xmm7 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
+%endmacro
+
+%macro MBV_WRITEBACK_1 0
+ movq [rsi], xmm0
+ movhps [rdi], xmm0
+
+ movq [rsi+2*rax], xmm6
+ movhps [rdi+2*rax], xmm6
+
+ movdqa xmm0, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+ punpckldq xmm0, xmm2 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
+ punpckhdq xmm3, xmm2 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
+
+ movq [rsi+4*rax], xmm0
+ movhps [rdi+4*rax], xmm0
+
+ movq [rsi+2*rcx], xmm3
+ movhps [rdi+2*rcx], xmm3
+
+ movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpckhbw xmm7, [rsp+_q1] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
+ punpckhbw xmm5, [rsp+_q3] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
+
+ movdqa xmm0, xmm7
+ punpcklwd xmm0, xmm5 ; b7 b6 b4 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
+ punpckhwd xmm7, xmm5 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
+
+ movdqa xmm5, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 83 84 82 81 80
+ punpckhdq xmm5, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
+%endmacro
+
+%macro MBV_WRITEBACK_2 0
+ movq [rsi], xmm1
+ movhps [rdi], xmm1
+
+ movq [rsi+2*rax], xmm5
+ movhps [rdi+2*rax], xmm5
+
+ movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+ punpckldq xmm1, xmm7 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
+ punpckhdq xmm4, xmm7 ; f7 f6 f4 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
+
+ movq [rsi+4*rax], xmm1
+ movhps [rdi+4*rax], xmm1
+
+ movq [rsi+2*rcx], xmm4
+ movhps [rdi+2*rcx], xmm4
+%endmacro
+
+
+;void vp8_mbloop_filter_vertical_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+;)
+global sym(vp8_mbloop_filter_vertical_edge_sse2) PRIVATE
+sym(vp8_mbloop_filter_vertical_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size
+
+ mov rsi, arg(0) ; src_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ lea rcx, [rax*2+rax]
+
+ ; Transpose
+ TRANSPOSE_16X8 1, 0
+
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK
+
+ neg rax
+ ; start work on filters
+ MB_FILTER_AND_WRITEBACK 2
+
+ lea rsi, [rsi+rax*8]
+ lea rdi, [rdi+rax*8]
+
+ ; transpose and write back
+ MBV_TRANSPOSE
+
+ neg rax
+
+ MBV_WRITEBACK_1
+
+
+ lea rsi, [rsi+rax*8]
+ lea rdi, [rdi+rax*8]
+ MBV_WRITEBACK_2
+
+ add rsp, lf_var_size
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_mbloop_filter_vertical_edge_uv_sse2
+;(
+; unsigned char *u,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;)
+global sym(vp8_mbloop_filter_vertical_edge_uv_sse2) PRIVATE
+sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size
+
+ mov rsi, arg(0) ; u_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ lea rcx, [rax+2*rax]
+
+ ; Transpose
+ TRANSPOSE_16X8 0, 0
+
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK
+
+ ; start work on filters
+ MB_FILTER_AND_WRITEBACK 2
+
+ ; transpose and write back
+ MBV_TRANSPOSE
+
+ mov rsi, arg(0) ;u_ptr
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax]
+ MBV_WRITEBACK_1
+ mov rsi, arg(5) ;v_ptr
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax]
+ MBV_WRITEBACK_2
+
+ add rsp, lf_var_size
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_simple_horizontal_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+;)
+global sym(vp8_loop_filter_simple_horizontal_edge_sse2) PRIVATE
+sym(vp8_loop_filter_simple_horizontal_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ SAVE_XMM 7
+ GET_GOT rbx
+ ; end prolog
+
+ mov rcx, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+ movdqa xmm6, [GLOBAL(tfe)]
+ lea rdx, [rcx + rax]
+ neg rax
+
+ ; calculate mask
+ movdqa xmm0, [rdx] ; q1
+ mov rdx, arg(2) ;blimit
+ movdqa xmm1, [rcx+2*rax] ; p1
+
+ movdqa xmm2, xmm1
+ movdqa xmm3, xmm0
+
+ psubusb xmm0, xmm1 ; q1-=p1
+ psubusb xmm1, xmm3 ; p1-=q1
+ por xmm1, xmm0 ; abs(p1-q1)
+ pand xmm1, xmm6 ; set lsb of each byte to zero
+ psrlw xmm1, 1 ; abs(p1-q1)/2
+
+ movdqa xmm7, XMMWORD PTR [rdx]
+
+ movdqa xmm5, [rcx+rax] ; p0
+ movdqa xmm4, [rcx] ; q0
+ movdqa xmm0, xmm4 ; q0
+ movdqa xmm6, xmm5 ; p0
+ psubusb xmm5, xmm4 ; p0-=q0
+ psubusb xmm4, xmm6 ; q0-=p0
+ por xmm5, xmm4 ; abs(p0 - q0)
+
+ movdqa xmm4, [GLOBAL(t80)]
+
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2
+ paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+ psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ pxor xmm7, xmm7
+ pcmpeqb xmm5, xmm7
+
+
+ ; start work on filters
+ pxor xmm2, xmm4 ; p1 offset to convert to signed values
+ pxor xmm3, xmm4 ; q1 offset to convert to signed values
+ psubsb xmm2, xmm3 ; p1 - q1
+
+ pxor xmm6, xmm4 ; offset to convert to signed values
+ pxor xmm0, xmm4 ; offset to convert to signed values
+ movdqa xmm3, xmm0 ; q0
+ psubsb xmm0, xmm6 ; q0 - p0
+ paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0)
+ paddsb xmm2, xmm0 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0)
+ pand xmm5, xmm2 ; mask filter values we don't care about
+
+ movdqa xmm0, xmm5
+ paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 4
+ paddsb xmm0, [GLOBAL(t4)] ; +3 instead of +4
+
+ movdqa xmm1, [GLOBAL(te0)]
+ movdqa xmm2, [GLOBAL(t1f)]
+
+; pxor xmm7, xmm7
+ pcmpgtb xmm7, xmm0 ;save sign
+ pand xmm7, xmm1 ;preserve the upper 3 bits
+ psrlw xmm0, 3
+ pand xmm0, xmm2 ;clear out upper 3 bits
+ por xmm0, xmm7 ;add sign
+ psubsb xmm3, xmm0 ; q0-= q0sz add
+
+ pxor xmm7, xmm7
+ pcmpgtb xmm7, xmm5 ;save sign
+ pand xmm7, xmm1 ;preserve the upper 3 bits
+ psrlw xmm5, 3
+ pand xmm5, xmm2 ;clear out upper 3 bits
+ por xmm5, xmm7 ;add sign
+ paddsb xmm6, xmm5 ; p0+= p0 add
+
+ pxor xmm3, xmm4 ; unoffset
+ movdqa [rcx], xmm3 ; write back
+
+ pxor xmm6, xmm4 ; unoffset
+ movdqa [rcx+rax], xmm6 ; write back
+
+ ; begin epilog
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_simple_vertical_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+;)
+global sym(vp8_loop_filter_simple_vertical_edge_sse2) PRIVATE
+sym(vp8_loop_filter_simple_vertical_edge_sse2):
+ push rbp ; save old base pointer value.
+ mov rbp, rsp ; set new base pointer value.
+ SHADOW_ARGS_TO_STACK 3
+ SAVE_XMM 7
+ GET_GOT rbx ; save callee-saved reg
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+ lea rsi, [rsi - 2 ]
+ lea rdi, [rsi + rax]
+ lea rdx, [rsi + rax*4]
+ lea rcx, [rdx + rax]
+
+ movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00
+ movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40
+ movd xmm2, [rdi] ; 13 12 11 10
+ movd xmm3, [rcx] ; 53 52 51 50
+ punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00
+ punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10
+
+ movd xmm4, [rsi + rax*2] ; 23 22 21 20
+ movd xmm5, [rdx + rax*2] ; 63 62 61 60
+ movd xmm6, [rdi + rax*2] ; 33 32 31 30
+ movd xmm7, [rcx + rax*2] ; 73 72 71 70
+ punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20
+ punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30
+
+ punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
+ punpcklbw xmm4, xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
+
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+ punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+
+ movdqa xmm2, xmm0
+ punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+ punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+ lea rsi, [rsi + rax*8]
+ lea rdi, [rsi + rax]
+ lea rdx, [rsi + rax*4]
+ lea rcx, [rdx + rax]
+
+ movd xmm4, [rsi] ; 83 82 81 80
+ movd xmm1, [rdx] ; c3 c2 c1 c0
+ movd xmm6, [rdi] ; 93 92 91 90
+ movd xmm3, [rcx] ; d3 d2 d1 d0
+ punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80
+ punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90
+
+ movd xmm1, [rsi + rax*2] ; a3 a2 a1 a0
+ movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0
+ movd xmm3, [rdi + rax*2] ; b3 b2 b1 b0
+ movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0
+ punpckldq xmm1, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0
+ punpckldq xmm3, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0
+
+ punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
+ punpcklbw xmm1, xmm3 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
+
+ movdqa xmm7, xmm4
+ punpcklwd xmm4, xmm1 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+ punpckhwd xmm7, xmm1 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+
+ movdqa xmm6, xmm4
+ punpckldq xmm4, xmm7 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+ punpckhdq xmm6, xmm7 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+
+ punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+
+ mov rdx, arg(2) ;blimit
+
+ ; calculate mask
+ movdqa xmm6, xmm0 ; p1
+ movdqa xmm7, xmm3 ; q1
+ psubusb xmm7, xmm0 ; q1-=p1
+ psubusb xmm6, xmm3 ; p1-=q1
+ por xmm6, xmm7 ; abs(p1-q1)
+ pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw xmm6, 1 ; abs(p1-q1)/2
+
+ movdqa xmm7, [rdx]
+
+ movdqa xmm5, xmm1 ; p0
+ movdqa xmm4, xmm2 ; q0
+ psubusb xmm5, xmm2 ; p0-=q0
+ psubusb xmm4, xmm1 ; q0-=p0
+ por xmm5, xmm4 ; abs(p0 - q0)
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2
+ paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ movdqa xmm4, [GLOBAL(t80)]
+
+ psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ pxor xmm7, xmm7
+ pcmpeqb xmm5, xmm7 ; mm5 = mask
+
+ ; start work on filters
+ movdqa t0, xmm0
+ movdqa t1, xmm3
+
+ pxor xmm0, xmm4 ; p1 offset to convert to signed values
+ pxor xmm3, xmm4 ; q1 offset to convert to signed values
+ psubsb xmm0, xmm3 ; p1 - q1
+
+ pxor xmm1, xmm4 ; offset to convert to signed values
+ pxor xmm2, xmm4 ; offset to convert to signed values
+
+ movdqa xmm3, xmm2 ; offseted ; q0
+ psubsb xmm2, xmm1 ; q0 - p0
+ paddsb xmm0, xmm2 ; p1 - q1 + 1 * (q0 - p0)
+ paddsb xmm0, xmm2 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb xmm0, xmm2 ; p1 - q1 + 3 * (q0 - p0)
+ pand xmm5, xmm0 ; mask filter values we don't care about
+
+ movdqa xmm0, xmm5
+ paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 4
+ paddsb xmm0, [GLOBAL(t4)] ; +3 instead of +4
+
+ movdqa xmm6, [GLOBAL(te0)]
+ movdqa xmm2, [GLOBAL(t1f)]
+
+; pxor xmm7, xmm7
+ pcmpgtb xmm7, xmm0 ;save sign
+ pand xmm7, xmm6 ;preserve the upper 3 bits
+ psrlw xmm0, 3
+ pand xmm0, xmm2 ;clear out upper 3 bits
+ por xmm0, xmm7 ;add sign
+ psubsb xmm3, xmm0 ; q0-= q0sz add
+
+ pxor xmm7, xmm7
+ pcmpgtb xmm7, xmm5 ;save sign
+ pand xmm7, xmm6 ;preserve the upper 3 bits
+ psrlw xmm5, 3
+ pand xmm5, xmm2 ;clear out upper 3 bits
+ por xmm5, xmm7 ;add sign
+ paddsb xmm1, xmm5 ; p0+= p0 add
+
+ pxor xmm3, xmm4 ; unoffset q0
+ pxor xmm1, xmm4 ; unoffset p0
+
+ movdqa xmm0, t0 ; p1
+ movdqa xmm4, t1 ; q1
+
+ ; write out order: xmm0 xmm2 xmm1 xmm3
+ lea rdx, [rsi + rax*4]
+
+ ; transpose back to write out
+ ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ movdqa xmm6, xmm0
+ punpcklbw xmm0, xmm1 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpckhbw xmm6, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+ punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+
+ movdqa xmm3, xmm6
+ punpcklwd xmm6, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+
+ movd [rsi], xmm6 ; write the second 8-line result
+ movd [rdx], xmm3
+ psrldq xmm6, 4
+ psrldq xmm3, 4
+ movd [rdi], xmm6
+ movd [rcx], xmm3
+ psrldq xmm6, 4
+ psrldq xmm3, 4
+ movd [rsi + rax*2], xmm6
+ movd [rdx + rax*2], xmm3
+ psrldq xmm6, 4
+ psrldq xmm3, 4
+ movd [rdi + rax*2], xmm6
+ movd [rcx + rax*2], xmm3
+
+ neg rax
+ lea rsi, [rsi + rax*8]
+ neg rax
+ lea rdi, [rsi + rax]
+ lea rdx, [rsi + rax*4]
+ lea rcx, [rdx + rax]
+
+ movd [rsi], xmm0 ; write the first 8-line result
+ movd [rdx], xmm2
+ psrldq xmm0, 4
+ psrldq xmm2, 4
+ movd [rdi], xmm0
+ movd [rcx], xmm2
+ psrldq xmm0, 4
+ psrldq xmm2, 4
+ movd [rsi + rax*2], xmm0
+ movd [rdx + rax*2], xmm2
+ psrldq xmm0, 4
+ psrldq xmm2, 4
+ movd [rdi + rax*2], xmm0
+ movd [rcx + rax*2], xmm2
+
+ add rsp, 32
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+tfe:
+ times 16 db 0xfe
+align 16
+t80:
+ times 16 db 0x80
+align 16
+t1s:
+ times 16 db 0x01
+align 16
+t3:
+ times 16 db 0x03
+align 16
+t4:
+ times 16 db 0x04
+align 16
+ones:
+ times 8 dw 0x0001
+align 16
+s9:
+ times 8 dw 0x0900
+align 16
+s63:
+ times 8 dw 0x003f
+align 16
+te0:
+ times 16 db 0xe0
+align 16
+t1f:
+ times 16 db 0x1f
diff --git a/thirdparty/libvpx/vp8/common/x86/loopfilter_x86.c b/thirdparty/libvpx/vp8/common/x86/loopfilter_x86.c
new file mode 100644
index 0000000000..6586004600
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/x86/loopfilter_x86.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8/common/loopfilter.h"
+
+#define prototype_loopfilter(sym) \
+ void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+ const unsigned char *limit, const unsigned char *thresh, int count)
+
+#define prototype_loopfilter_nc(sym) \
+ void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+ const unsigned char *limit, const unsigned char *thresh)
+
+#define prototype_simple_loopfilter(sym) \
+ void sym(unsigned char *y, int ystride, const unsigned char *blimit)
+
+prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx);
+prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx);
+prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx);
+prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx);
+prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx);
+prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx);
+
+#if HAVE_SSE2 && ARCH_X86_64
+prototype_loopfilter(vp8_loop_filter_bv_y_sse2);
+prototype_loopfilter(vp8_loop_filter_bh_y_sse2);
+#else
+prototype_loopfilter_nc(vp8_loop_filter_vertical_edge_sse2);
+prototype_loopfilter_nc(vp8_loop_filter_horizontal_edge_sse2);
+#endif
+prototype_loopfilter_nc(vp8_mbloop_filter_vertical_edge_sse2);
+prototype_loopfilter_nc(vp8_mbloop_filter_horizontal_edge_sse2);
+
+extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2;
+
+#if HAVE_MMX
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+
+void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
+{
+ vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, blimit);
+}
+
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+
+void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
+{
+ vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
+}
+#endif
+
+
+/* Horizontal MB filtering */
+#if HAVE_SSE2
+void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr);
+
+ if (u_ptr)
+ vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
+}
+
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr);
+
+ if (u_ptr)
+ vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
+}
+
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+#if ARCH_X86_64
+ vp8_loop_filter_bh_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+#else
+ vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+ vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+ vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+#endif
+
+ if (u_ptr)
+ vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4 * uv_stride);
+}
+
+
+void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
+{
+ vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, blimit);
+}
+
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+#if ARCH_X86_64
+ vp8_loop_filter_bv_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+#else
+ vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+ vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+ vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+#endif
+
+ if (u_ptr)
+ vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4);
+}
+
+
+void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
+{
+ vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit);
+}
+
+#endif
diff --git a/thirdparty/libvpx/vp8/common/x86/recon_mmx.asm b/thirdparty/libvpx/vp8/common/x86/recon_mmx.asm
new file mode 100644
index 0000000000..15e98713c7
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/x86/recon_mmx.asm
@@ -0,0 +1,274 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void copy_mem8x8_mmx(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride
+; )
+global sym(vp8_copy_mem8x8_mmx) PRIVATE
+sym(vp8_copy_mem8x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src;
+ movq mm0, [rsi]
+
+ movsxd rax, dword ptr arg(1) ;src_stride;
+ mov rdi, arg(2) ;dst;
+
+ movq mm1, [rsi+rax]
+ movq mm2, [rsi+rax*2]
+
+ movsxd rcx, dword ptr arg(3) ;dst_stride
+ lea rsi, [rsi+rax*2]
+
+ movq [rdi], mm0
+ add rsi, rax
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx*2], mm2
+
+
+ lea rdi, [rdi+rcx*2]
+ movq mm3, [rsi]
+
+ add rdi, rcx
+ movq mm4, [rsi+rax]
+
+ movq mm5, [rsi+rax*2]
+ movq [rdi], mm3
+
+ lea rsi, [rsi+rax*2]
+ movq [rdi+rcx], mm4
+
+ movq [rdi+rcx*2], mm5
+ lea rdi, [rdi+rcx*2]
+
+ movq mm0, [rsi+rax]
+ movq mm1, [rsi+rax*2]
+
+ movq [rdi+rcx], mm0
+ movq [rdi+rcx*2],mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void copy_mem8x4_mmx(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride
+; )
+global sym(vp8_copy_mem8x4_mmx) PRIVATE
+sym(vp8_copy_mem8x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src;
+ movq mm0, [rsi]
+
+ movsxd rax, dword ptr arg(1) ;src_stride;
+ mov rdi, arg(2) ;dst;
+
+ movq mm1, [rsi+rax]
+ movq mm2, [rsi+rax*2]
+
+ movsxd rcx, dword ptr arg(3) ;dst_stride
+ lea rsi, [rsi+rax*2]
+
+ movq [rdi], mm0
+ movq [rdi+rcx], mm1
+
+ movq [rdi+rcx*2], mm2
+ lea rdi, [rdi+rcx*2]
+
+ movq mm3, [rsi+rax]
+ movq [rdi+rcx], mm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void copy_mem16x16_mmx(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride
+; )
+global sym(vp8_copy_mem16x16_mmx) PRIVATE
+sym(vp8_copy_mem16x16_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src;
+ movsxd rax, dword ptr arg(1) ;src_stride;
+
+ mov rdi, arg(2) ;dst;
+ movsxd rcx, dword ptr arg(3) ;dst_stride
+
+ movq mm0, [rsi]
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx
+
+ movq mm0, [rsi]
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx
+
+ movq mm0, [rsi]
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx
+
+ movq mm0, [rsi]
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx
+
+ movq mm0, [rsi]
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx
+
+ movq mm0, [rsi]
+ movq mm3, [rsi+8];
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/thirdparty/libvpx/vp8/common/x86/recon_sse2.asm b/thirdparty/libvpx/vp8/common/x86/recon_sse2.asm
new file mode 100644
index 0000000000..cb89537f76
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/x86/recon_sse2.asm
@@ -0,0 +1,116 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void copy_mem16x16_sse2(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride
+; )
+global sym(vp8_copy_mem16x16_sse2) PRIVATE
+sym(vp8_copy_mem16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src;
+ movdqu xmm0, [rsi]
+
+ movsxd rax, dword ptr arg(1) ;src_stride;
+ mov rdi, arg(2) ;dst;
+
+ movdqu xmm1, [rsi+rax]
+ movdqu xmm2, [rsi+rax*2]
+
+ movsxd rcx, dword ptr arg(3) ;dst_stride
+ lea rsi, [rsi+rax*2]
+
+ movdqa [rdi], xmm0
+ add rsi, rax
+
+ movdqa [rdi+rcx], xmm1
+ movdqa [rdi+rcx*2],xmm2
+
+ lea rdi, [rdi+rcx*2]
+ movdqu xmm3, [rsi]
+
+ add rdi, rcx
+ movdqu xmm4, [rsi+rax]
+
+ movdqu xmm5, [rsi+rax*2]
+ lea rsi, [rsi+rax*2]
+
+ movdqa [rdi], xmm3
+ add rsi, rax
+
+ movdqa [rdi+rcx], xmm4
+ movdqa [rdi+rcx*2],xmm5
+
+ lea rdi, [rdi+rcx*2]
+ movdqu xmm0, [rsi]
+
+ add rdi, rcx
+ movdqu xmm1, [rsi+rax]
+
+ movdqu xmm2, [rsi+rax*2]
+ lea rsi, [rsi+rax*2]
+
+ movdqa [rdi], xmm0
+ add rsi, rax
+
+ movdqa [rdi+rcx], xmm1
+
+ movdqa [rdi+rcx*2], xmm2
+ movdqu xmm3, [rsi]
+
+ movdqu xmm4, [rsi+rax]
+ lea rdi, [rdi+rcx*2]
+
+ add rdi, rcx
+ movdqu xmm5, [rsi+rax*2]
+
+ lea rsi, [rsi+rax*2]
+ movdqa [rdi], xmm3
+
+ add rsi, rax
+ movdqa [rdi+rcx], xmm4
+
+ movdqa [rdi+rcx*2],xmm5
+ movdqu xmm0, [rsi]
+
+ lea rdi, [rdi+rcx*2]
+ movdqu xmm1, [rsi+rax]
+
+ add rdi, rcx
+ movdqu xmm2, [rsi+rax*2]
+
+ lea rsi, [rsi+rax*2]
+ movdqa [rdi], xmm0
+
+ movdqa [rdi+rcx], xmm1
+ movdqa [rdi+rcx*2],xmm2
+
+ movdqu xmm3, [rsi+rax]
+ lea rdi, [rdi+rcx*2]
+
+ movdqa [rdi+rcx], xmm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/thirdparty/libvpx/vp8/common/x86/subpixel_mmx.asm b/thirdparty/libvpx/vp8/common/x86/subpixel_mmx.asm
new file mode 100644
index 0000000000..47dd452297
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/x86/subpixel_mmx.asm
@@ -0,0 +1,702 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+extern sym(vp8_bilinear_filters_x86_8)
+
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define vp8_filter_weight 128
+%define VP8_FILTER_SHIFT 7
+
+
+;void vp8_filter_block1d_h6_mmx
+;(
+; unsigned char *src_ptr,
+; unsigned short *output_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short * vp8_filter
+;)
+global sym(vp8_filter_block1d_h6_mmx) PRIVATE
+sym(vp8_filter_block1d_h6_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(6) ;vp8_filter
+
+ movq mm1, [rdx + 16] ; do both the negative taps first!!!
+ movq mm2, [rdx + 32] ;
+ movq mm6, [rdx + 48] ;
+ movq mm7, [rdx + 64] ;
+
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(5) ;output_width ; destination pitch?
+ pxor mm0, mm0 ; mm0 = 00000000
+
+.nextrow:
+ movq mm3, [rsi-2] ; mm3 = p-2..p5
+ movq mm4, mm3 ; mm4 = p-2..p5
+ psrlq mm3, 8 ; mm3 = p-1..p5
+ punpcklbw mm3, mm0 ; mm3 = p-1..p2
+ pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
+
+ movq mm5, mm4 ; mm5 = p-2..p5
+ punpckhbw mm4, mm0 ; mm5 = p2..p5
+ pmullw mm4, mm7 ; mm5 *= kernel 4 modifiers
+ paddsw mm3, mm4 ; mm3 += mm5
+
+ movq mm4, mm5 ; mm4 = p-2..p5;
+ psrlq mm5, 16 ; mm5 = p0..p5;
+ punpcklbw mm5, mm0 ; mm5 = p0..p3
+ pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers
+ paddsw mm3, mm5 ; mm3 += mm5
+
+ movq mm5, mm4 ; mm5 = p-2..p5
+ psrlq mm4, 24 ; mm4 = p1..p5
+ punpcklbw mm4, mm0 ; mm4 = p1..p4
+ pmullw mm4, mm6 ; mm5 *= kernel 3 modifiers
+ paddsw mm3, mm4 ; mm3 += mm5
+
+ ; do outer positive taps
+ movd mm4, [rsi+3]
+ punpcklbw mm4, mm0 ; mm5 = p3..p6
+ pmullw mm4, [rdx+80] ; mm5 *= kernel 0 modifiers
+ paddsw mm3, mm4 ; mm3 += mm5
+
+ punpcklbw mm5, mm0 ; mm5 = p-2..p1
+ pmullw mm5, [rdx] ; mm5 *= kernel 5 modifiers
+ paddsw mm3, mm5 ; mm3 += mm5
+
+ paddsw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+ packuswb mm3, mm0 ; pack and unpack to saturate
+ punpcklbw mm3, mm0 ;
+
+ movq [rdi], mm3 ; store the results in the destination
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line
+ add rdi, rax;
+%else
+ movsxd r8, dword ptr arg(2) ;src_pixels_per_line
+ add rdi, rax;
+
+ add rsi, r8 ; next line
+%endif
+
+ dec rcx ; decrement count
+ jnz .nextrow ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1dc_v6_mmx
+;(
+; short *src_ptr,
+; unsigned char *output_ptr,
+; int output_pitch,
+; unsigned int pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short * vp8_filter
+;)
+global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
+sym(vp8_filter_block1dc_v6_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movq mm5, [GLOBAL(rd)]
+ push rbx
+ mov rbx, arg(7) ;vp8_filter
+ movq mm1, [rbx + 16] ; do both the negative taps first!!!
+ movq mm2, [rbx + 32] ;
+ movq mm6, [rbx + 48] ;
+ movq mm7, [rbx + 64] ;
+
+ movsxd rdx, dword ptr arg(3) ;pixels_per_line
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+ sub rsi, rdx
+ sub rsi, rdx
+ movsxd rcx, DWORD PTR arg(5) ;output_height
+ movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch?
+ pxor mm0, mm0 ; mm0 = 00000000
+
+
+.nextrow_cv:
+ movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1
+ pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
+
+
+ movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2
+ pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0
+ pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ movq mm4, [rsi] ; mm4 = p0..p3 = row -2
+ pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+
+ add rsi, rdx ; move source forward 1 line to avoid 3 * pitch
+ movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1
+ pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3
+ pmullw mm4, [rbx +80] ; mm4 *= kernel 3 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+
+ paddsw mm3, mm5 ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+ packuswb mm3, mm0 ; pack and saturate
+
+ movd [rdi],mm3 ; store the results in the destination
+ ; the subsequent iterations repeat 3 out of 4 of these reads. Since the
+ ; recon block should be in cache this shouldn't cost much. Its obviously
+ ; avoidable!!!.
+ lea rdi, [rdi+rax] ;
+ dec rcx ; decrement count
+ jnz .nextrow_cv ; next row
+
+ pop rbx
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void bilinear_predict8x8_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
+sym(vp8_bilinear_predict8x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
+
+ movsxd rax, dword ptr arg(2) ;xoffset
+ mov rdi, arg(4) ;dst_ptr ;
+
+ shl rax, 5 ; offset * 32
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+
+ add rax, rcx ; HFilter
+ mov rsi, arg(0) ;src_ptr ;
+
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+ movq mm1, [rax] ;
+
+ movq mm2, [rax+16] ;
+ movsxd rax, dword ptr arg(3) ;yoffset
+
+ pxor mm0, mm0 ;
+
+ shl rax, 5 ; offset*32
+ add rax, rcx ; VFilter
+
+ lea rcx, [rdi+rdx*8] ;
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
+
+
+
+ ; get the first horizontal line done ;
+ movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movq mm4, mm3 ; make a copy of current line
+
+ punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, mm1 ;
+ pmullw mm4, mm1 ;
+
+ movq mm5, [rsi+1] ;
+ movq mm6, mm5 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0 ;
+
+ pmullw mm5, mm2 ;
+ pmullw mm6, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP8_FILTER_SHIFT ;
+
+ movq mm7, mm3 ;
+ packuswb mm7, mm4 ;
+
+ add rsi, rdx ; next line
+.next_row_8x8:
+ movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movq mm4, mm3 ; make a copy of current line
+
+ punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, mm1 ;
+ pmullw mm4, mm1 ;
+
+ movq mm5, [rsi+1] ;
+ movq mm6, mm5 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0 ;
+
+ pmullw mm5, mm2 ;
+ pmullw mm6, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+ movq mm5, mm7 ;
+ movq mm6, mm7 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0
+
+ pmullw mm5, [rax] ;
+ pmullw mm6, [rax] ;
+
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP8_FILTER_SHIFT ;
+
+ movq mm7, mm3 ;
+ packuswb mm7, mm4 ;
+
+
+ pmullw mm3, [rax+16] ;
+ pmullw mm4, [rax+16] ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP8_FILTER_SHIFT ;
+
+ packuswb mm3, mm4
+
+ movq [rdi], mm3 ; store the results in the destination
+
+%if ABI_IS_32BIT
+ add rsi, rdx ; next line
+ add rdi, dword ptr arg(5) ;dst_pitch ;
+%else
+ movsxd r8, dword ptr arg(5) ;dst_pitch
+ add rsi, rdx ; next line
+ add rdi, r8 ;dst_pitch
+%endif
+ cmp rdi, rcx ;
+ jne .next_row_8x8
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void bilinear_predict8x4_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
+sym(vp8_bilinear_predict8x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
+
+ movsxd rax, dword ptr arg(2) ;xoffset
+ mov rdi, arg(4) ;dst_ptr ;
+
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+ shl rax, 5
+
+ mov rsi, arg(0) ;src_ptr ;
+ add rax, rcx
+
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+ movq mm1, [rax] ;
+
+ movq mm2, [rax+16] ;
+ movsxd rax, dword ptr arg(3) ;yoffset
+
+ pxor mm0, mm0 ;
+ shl rax, 5
+
+ add rax, rcx
+ lea rcx, [rdi+rdx*4] ;
+
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
+
+ ; get the first horizontal line done ;
+ movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movq mm4, mm3 ; make a copy of current line
+
+ punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, mm1 ;
+ pmullw mm4, mm1 ;
+
+ movq mm5, [rsi+1] ;
+ movq mm6, mm5 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0 ;
+
+ pmullw mm5, mm2 ;
+ pmullw mm6, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP8_FILTER_SHIFT ;
+
+ movq mm7, mm3 ;
+ packuswb mm7, mm4 ;
+
+ add rsi, rdx ; next line
+.next_row_8x4:
+ movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movq mm4, mm3 ; make a copy of current line
+
+ punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, mm1 ;
+ pmullw mm4, mm1 ;
+
+ movq mm5, [rsi+1] ;
+ movq mm6, mm5 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0 ;
+
+ pmullw mm5, mm2 ;
+ pmullw mm6, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+ movq mm5, mm7 ;
+ movq mm6, mm7 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0
+
+ pmullw mm5, [rax] ;
+ pmullw mm6, [rax] ;
+
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP8_FILTER_SHIFT ;
+
+ movq mm7, mm3 ;
+ packuswb mm7, mm4 ;
+
+
+ pmullw mm3, [rax+16] ;
+ pmullw mm4, [rax+16] ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP8_FILTER_SHIFT ;
+
+ packuswb mm3, mm4
+
+ movq [rdi], mm3 ; store the results in the destination
+
+%if ABI_IS_32BIT
+ add rsi, rdx ; next line
+ add rdi, dword ptr arg(5) ;dst_pitch ;
+%else
+ movsxd r8, dword ptr arg(5) ;dst_pitch
+ add rsi, rdx ; next line
+ add rdi, r8
+%endif
+ cmp rdi, rcx ;
+ jne .next_row_8x4
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void bilinear_predict4x4_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
+sym(vp8_bilinear_predict4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
+
+ movsxd rax, dword ptr arg(2) ;xoffset
+ mov rdi, arg(4) ;dst_ptr ;
+
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+ shl rax, 5
+
+ add rax, rcx ; HFilter
+ mov rsi, arg(0) ;src_ptr ;
+
+ movsxd rdx, dword ptr arg(5) ;ldst_pitch
+ movq mm1, [rax] ;
+
+ movq mm2, [rax+16] ;
+ movsxd rax, dword ptr arg(3) ;yoffset
+
+ pxor mm0, mm0 ;
+ shl rax, 5
+
+ add rax, rcx
+ lea rcx, [rdi+rdx*4] ;
+
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
+
+ ; get the first horizontal line done ;
+ movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
+
+ pmullw mm3, mm1 ;
+ movd mm5, [rsi+1] ;
+
+ punpcklbw mm5, mm0 ;
+ pmullw mm5, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
+
+ psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ movq mm7, mm3 ;
+ packuswb mm7, mm0 ;
+
+ add rsi, rdx ; next line
+.next_row_4x4:
+ movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
+
+ pmullw mm3, mm1 ;
+ movd mm5, [rsi+1] ;
+
+ punpcklbw mm5, mm0 ;
+ pmullw mm5, mm2 ;
+
+ paddw mm3, mm5 ;
+
+ movq mm5, mm7 ;
+ punpcklbw mm5, mm0 ;
+
+ pmullw mm5, [rax] ;
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
+
+ psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+ movq mm7, mm3 ;
+
+ packuswb mm7, mm0 ;
+
+ pmullw mm3, [rax+16] ;
+ paddw mm3, mm5 ;
+
+
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ packuswb mm3, mm0
+ movd [rdi], mm3 ; store the results in the destination
+
+%if ABI_IS_32BIT
+ add rsi, rdx ; next line
+ add rdi, dword ptr arg(5) ;dst_pitch ;
+%else
+ movsxd r8, dword ptr arg(5) ;dst_pitch ;
+ add rsi, rdx ; next line
+ add rdi, r8
+%endif
+
+ cmp rdi, rcx ;
+ jne .next_row_4x4
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+SECTION_RODATA
+align 16
+rd:
+ times 4 dw 0x40
+
+align 16
+global HIDDEN_DATA(sym(vp8_six_tap_mmx))
+sym(vp8_six_tap_mmx):
+ times 8 dw 0
+ times 8 dw 0
+ times 8 dw 128
+ times 8 dw 0
+ times 8 dw 0
+ times 8 dw 0
+
+ times 8 dw 0
+ times 8 dw -6
+ times 8 dw 123
+ times 8 dw 12
+ times 8 dw -1
+ times 8 dw 0
+
+ times 8 dw 2
+ times 8 dw -11
+ times 8 dw 108
+ times 8 dw 36
+ times 8 dw -8
+ times 8 dw 1
+
+ times 8 dw 0
+ times 8 dw -9
+ times 8 dw 93
+ times 8 dw 50
+ times 8 dw -6
+ times 8 dw 0
+
+ times 8 dw 3
+ times 8 dw -16
+ times 8 dw 77
+ times 8 dw 77
+ times 8 dw -16
+ times 8 dw 3
+
+ times 8 dw 0
+ times 8 dw -6
+ times 8 dw 50
+ times 8 dw 93
+ times 8 dw -9
+ times 8 dw 0
+
+ times 8 dw 1
+ times 8 dw -8
+ times 8 dw 36
+ times 8 dw 108
+ times 8 dw -11
+ times 8 dw 2
+
+ times 8 dw 0
+ times 8 dw -1
+ times 8 dw 12
+ times 8 dw 123
+ times 8 dw -6
+ times 8 dw 0
+
+
diff --git a/thirdparty/libvpx/vp8/common/x86/subpixel_sse2.asm b/thirdparty/libvpx/vp8/common/x86/subpixel_sse2.asm
new file mode 100644
index 0000000000..69f8d103c1
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/x86/subpixel_sse2.asm
@@ -0,0 +1,1372 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+extern sym(vp8_bilinear_filters_x86_8)
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define VP8_FILTER_WEIGHT 128
+%define VP8_FILTER_SHIFT 7
+
+
+;/************************************************************************************
+; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows. This routine assumes that output_height is an
+; even number. This function handles 8 pixels in horizontal direction, calculating ONE
+; rows each iteration to take advantage of the 128 bits operations.
+;*************************************************************************************/
+;void vp8_filter_block1d8_h6_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned short *output_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short *vp8_filter
+;)
+global sym(vp8_filter_block1d8_h6_sse2) PRIVATE
+sym(vp8_filter_block1d8_h6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(6) ;vp8_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(1) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ;output_width
+%endif
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+.filter_block1d8_h6_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2]
+ movq xmm1, MMWORD PTR [rsi + 6]
+
+ prefetcht2 [rsi+rax-2]
+
+ pslldq xmm1, 8
+ por xmm1, xmm3
+
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm1
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0
+ punpcklbw xmm4, xmm0
+
+ movdqa XMMWORD Ptr [rdi], xmm4
+ lea rsi, [rsi + rax]
+
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(5) ;[output_width]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+
+ jnz .filter_block1d8_h6_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d16_h6_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned short *output_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short *vp8_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows. This routine assumes that output_height is an
+; even number. This function handles 8 pixels in horizontal direction, calculating ONE
+; rows each iteration to take advantage of the 128 bits operations.
+;*************************************************************************************/
+global sym(vp8_filter_block1d16_h6_sse2) PRIVATE
+sym(vp8_filter_block1d16_h6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(6) ;vp8_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(1) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ;output_width
+%endif
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+.filter_block1d16_h6_sse2_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2]
+ movq xmm1, MMWORD PTR [rsi + 6]
+
+ movq xmm2, MMWORD PTR [rsi +14]
+ pslldq xmm2, 8
+
+ por xmm2, xmm1
+ prefetcht2 [rsi+rax-2]
+
+ pslldq xmm1, 8
+ por xmm1, xmm3
+
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm1
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0
+ punpcklbw xmm4, xmm0
+
+ movdqa XMMWORD Ptr [rdi], xmm4
+
+ movdqa xmm3, xmm2
+ movdqa xmm4, xmm2
+
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm2
+
+ movdqa xmm7, xmm2
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm2
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0
+ punpcklbw xmm4, xmm0
+
+ movdqa XMMWORD Ptr [rdi+16], xmm4
+
+ lea rsi, [rsi + rax]
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(5) ;[output_width]
+%else
+ add rdi, r8
+%endif
+
+ dec rcx
+ jnz .filter_block1d16_h6_sse2_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d8_v6_sse2
+;(
+; short *src_ptr,
+; unsigned char *output_ptr,
+; int dst_ptich,
+; unsigned int pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short * vp8_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
+; input pixel array has output_height rows.
+;*************************************************************************************/
+global sym(vp8_filter_block1d8_v6_sse2) PRIVATE
+sym(vp8_filter_block1d8_v6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(7) ;vp8_filter
+ movsxd rdx, dword ptr arg(3) ;pixels_per_line
+
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+
+ sub rsi, rdx
+ sub rsi, rdx
+
+ movsxd rcx, DWORD PTR arg(5) ;[output_height]
+ pxor xmm0, xmm0 ; clear xmm0
+
+ movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(2) ; dst_ptich
+%endif
+
+.vp8_filter_block1d8_v6_sse2_loop:
+ movdqa xmm1, XMMWORD PTR [rsi]
+ pmullw xmm1, [rax]
+
+ movdqa xmm2, XMMWORD PTR [rsi + rdx]
+ pmullw xmm2, [rax + 16]
+
+ movdqa xmm3, XMMWORD PTR [rsi + rdx * 2]
+ pmullw xmm3, [rax + 32]
+
+ movdqa xmm5, XMMWORD PTR [rsi + rdx * 4]
+ pmullw xmm5, [rax + 64]
+
+ add rsi, rdx
+ movdqa xmm4, XMMWORD PTR [rsi + rdx * 2]
+
+ pmullw xmm4, [rax + 48]
+ movdqa xmm6, XMMWORD PTR [rsi + rdx * 4]
+
+ pmullw xmm6, [rax + 80]
+
+ paddsw xmm2, xmm5
+ paddsw xmm2, xmm3
+
+ paddsw xmm2, xmm1
+ paddsw xmm2, xmm4
+
+ paddsw xmm2, xmm6
+ paddsw xmm2, xmm7
+
+ psraw xmm2, 7
+ packuswb xmm2, xmm0 ; pack and saturate
+
+ movq QWORD PTR [rdi], xmm2 ; store the results in the destination
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(2) ;[dst_ptich]
+%else
+ add rdi, r8
+%endif
+ dec rcx ; decrement count
+ jnz .vp8_filter_block1d8_v6_sse2_loop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d16_v6_sse2
+;(
+; unsigned short *src_ptr,
+; unsigned char *output_ptr,
+; int dst_ptich,
+; unsigned int pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; const short *vp8_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
+; input pixel array has output_height rows.
+;*************************************************************************************/
+global sym(vp8_filter_block1d16_v6_sse2) PRIVATE
+sym(vp8_filter_block1d16_v6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(7) ;vp8_filter
+ movsxd rdx, dword ptr arg(3) ;pixels_per_line
+
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+
+ sub rsi, rdx
+ sub rsi, rdx
+
+ movsxd rcx, DWORD PTR arg(5) ;[output_height]
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(2) ; dst_ptich
+%endif
+
+.vp8_filter_block1d16_v6_sse2_loop:
+; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
+ movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2
+ movdqa xmm2, XMMWORD PTR [rsi + rdx + 16]
+ pmullw xmm1, [rax + 16]
+ pmullw xmm2, [rax + 16]
+
+ movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5
+ movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16]
+ pmullw xmm3, [rax + 64]
+ pmullw xmm4, [rax + 64]
+
+ movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3
+ movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16]
+ pmullw xmm5, [rax + 32]
+ pmullw xmm6, [rax + 32]
+
+ movdqa xmm7, XMMWORD PTR [rsi] ; line 1
+ movdqa xmm0, XMMWORD PTR [rsi + 16]
+ pmullw xmm7, [rax]
+ pmullw xmm0, [rax]
+
+ paddsw xmm1, xmm3
+ paddsw xmm2, xmm4
+ paddsw xmm1, xmm5
+ paddsw xmm2, xmm6
+ paddsw xmm1, xmm7
+ paddsw xmm2, xmm0
+
+ add rsi, rdx
+
+ movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4
+ movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16]
+ pmullw xmm3, [rax + 48]
+ pmullw xmm4, [rax + 48]
+
+ movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6
+ movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16]
+ pmullw xmm5, [rax + 80]
+ pmullw xmm6, [rax + 80]
+
+ movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
+ pxor xmm0, xmm0 ; clear xmm0
+
+ paddsw xmm1, xmm3
+ paddsw xmm2, xmm4
+ paddsw xmm1, xmm5
+ paddsw xmm2, xmm6
+
+ paddsw xmm1, xmm7
+ paddsw xmm2, xmm7
+
+ psraw xmm1, 7
+ psraw xmm2, 7
+
+ packuswb xmm1, xmm2 ; pack and saturate
+ movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(2) ;[dst_ptich]
+%else
+ add rdi, r8
+%endif
+ dec rcx ; decrement count
+ jnz .vp8_filter_block1d16_v6_sse2_loop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d8_h6_only_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; int dst_ptich,
+; unsigned int output_height,
+; const short *vp8_filter
+;)
+; First-pass filter only when yoffset==0
+global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE
+sym(vp8_filter_block1d8_h6_only_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(5) ;vp8_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(3) ;dst_ptich
+%endif
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+.filter_block1d8_h6_only_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2]
+ movq xmm1, MMWORD PTR [rsi + 6]
+
+ prefetcht2 [rsi+rax-2]
+
+ pslldq xmm1, 8
+ por xmm1, xmm3
+
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm1
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0
+
+ movq QWORD PTR [rdi], xmm4 ; store the results in the destination
+ lea rsi, [rsi + rax]
+
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(3) ;dst_ptich
+%else
+ add rdi, r8
+%endif
+ dec rcx
+
+ jnz .filter_block1d8_h6_only_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d16_h6_only_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; int dst_ptich,
+; unsigned int output_height,
+; const short *vp8_filter
+;)
+; First-pass filter only when yoffset==0
+global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE
+sym(vp8_filter_block1d16_h6_only_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(5) ;vp8_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(3) ;dst_ptich
+%endif
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+.filter_block1d16_h6_only_sse2_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2]
+ movq xmm1, MMWORD PTR [rsi + 6]
+
+ movq xmm2, MMWORD PTR [rsi +14]
+ pslldq xmm2, 8
+
+ por xmm2, xmm1
+ prefetcht2 [rsi+rax-2]
+
+ pslldq xmm1, 8
+ por xmm1, xmm3
+
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm1
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0 ; lower 8 bytes
+
+ movq QWORD Ptr [rdi], xmm4 ; store the results in the destination
+
+ movdqa xmm3, xmm2
+ movdqa xmm4, xmm2
+
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm2
+
+ movdqa xmm7, xmm2
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm2
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0 ; higher 8 bytes
+
+ movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination
+
+ lea rsi, [rsi + rax]
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(3) ;dst_ptich
+%else
+ add rdi, r8
+%endif
+
+ dec rcx
+ jnz .filter_block1d16_h6_only_sse2_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d8_v6_only_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; int dst_ptich,
+; unsigned int output_height,
+; const short *vp8_filter
+;)
+; Second-pass filter only when xoffset==0
+global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE
+sym(vp8_filter_block1d8_v6_only_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
+
+ mov rax, arg(5) ;vp8_filter
+
+ pxor xmm0, xmm0 ; clear xmm0
+
+ movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(3) ; dst_ptich
+%endif
+
+.vp8_filter_block1d8_v6_only_sse2_loop:
+ movq xmm1, MMWORD PTR [rsi]
+ movq xmm2, MMWORD PTR [rsi + rdx]
+ movq xmm3, MMWORD PTR [rsi + rdx * 2]
+ movq xmm5, MMWORD PTR [rsi + rdx * 4]
+ add rsi, rdx
+ movq xmm4, MMWORD PTR [rsi + rdx * 2]
+ movq xmm6, MMWORD PTR [rsi + rdx * 4]
+
+ punpcklbw xmm1, xmm0
+ pmullw xmm1, [rax]
+
+ punpcklbw xmm2, xmm0
+ pmullw xmm2, [rax + 16]
+
+ punpcklbw xmm3, xmm0
+ pmullw xmm3, [rax + 32]
+
+ punpcklbw xmm5, xmm0
+ pmullw xmm5, [rax + 64]
+
+ punpcklbw xmm4, xmm0
+ pmullw xmm4, [rax + 48]
+
+ punpcklbw xmm6, xmm0
+ pmullw xmm6, [rax + 80]
+
+ paddsw xmm2, xmm5
+ paddsw xmm2, xmm3
+
+ paddsw xmm2, xmm1
+ paddsw xmm2, xmm4
+
+ paddsw xmm2, xmm6
+ paddsw xmm2, xmm7
+
+ psraw xmm2, 7
+ packuswb xmm2, xmm0 ; pack and saturate
+
+ movq QWORD PTR [rdi], xmm2 ; store the results in the destination
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[dst_ptich]
+%else
+ add rdi, r8
+%endif
+ dec rcx ; decrement count
+ jnz .vp8_filter_block1d8_v6_only_sse2_loop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_unpack_block1d16_h6_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned short *output_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned int output_height,
+; unsigned int output_width
+;)
+global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE
+sym(vp8_unpack_block1d16_h6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(1) ;output_ptr
+
+ movsxd rcx, dword ptr arg(3) ;output_height
+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source
+%endif
+
+.unpack_block1d16_h6_sse2_rowloop:
+ movq xmm1, MMWORD PTR [rsi] ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2
+ movq xmm3, MMWORD PTR [rsi+8] ; make copy of xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
+ punpcklbw xmm1, xmm0
+
+ movdqa XMMWORD Ptr [rdi], xmm1
+ movdqa XMMWORD Ptr [rdi + 16], xmm3
+
+ lea rsi, [rsi + rax]
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(4) ;[output_width]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .unpack_block1d16_h6_sse2_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_bilinear_predict16x16_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+extern sym(vp8_bilinear_filters_x86_8)
+global sym(vp8_bilinear_predict16x16_sse2) PRIVATE
+sym(vp8_bilinear_predict16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
+
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+ movsxd rax, dword ptr arg(2) ;xoffset
+
+ cmp rax, 0 ;skip first_pass filter if xoffset=0
+ je .b16x16_sp_only
+
+ shl rax, 5
+ add rax, rcx ;HFilter
+
+ mov rdi, arg(4) ;dst_ptr
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+
+ movdqa xmm1, [rax]
+ movdqa xmm2, [rax+16]
+
+ movsxd rax, dword ptr arg(3) ;yoffset
+
+ cmp rax, 0 ;skip second_pass filter if yoffset=0
+ je .b16x16_fp_only
+
+ shl rax, 5
+ add rax, rcx ;VFilter
+
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8]
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
+
+ pxor xmm0, xmm0
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ;dst_pitch
+%endif
+ ; get the first horizontal line done
+ movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movdqa xmm4, xmm3 ; make a copy of current line
+
+ punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw xmm4, xmm0
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm1
+
+ movdqu xmm5, [rsi+1]
+ movdqa xmm6, xmm5
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+
+ pmullw xmm5, xmm2
+ pmullw xmm6, xmm2
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ movdqa xmm7, xmm3
+ packuswb xmm7, xmm4
+
+ add rsi, rdx ; next line
+.next_row:
+ movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movdqa xmm4, xmm3 ; make a copy of current line
+
+ punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw xmm4, xmm0
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm1
+
+ movdqu xmm5, [rsi+1]
+ movdqa xmm6, xmm5
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+
+ pmullw xmm5, xmm2
+ pmullw xmm6, xmm2
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ movdqa xmm5, xmm7
+ movdqa xmm6, xmm7
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+
+ pmullw xmm5, [rax]
+ pmullw xmm6, [rax]
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ movdqa xmm7, xmm3
+ packuswb xmm7, xmm4
+
+ pmullw xmm3, [rax+16]
+ pmullw xmm4, [rax+16]
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ packuswb xmm3, xmm4
+ movdqa [rdi], xmm3 ; store the results in the destination
+
+ add rsi, rdx ; next line
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(5) ;dst_pitch
+%else
+ add rdi, r8
+%endif
+
+ cmp rdi, rcx
+ jne .next_row
+
+ jmp .done
+
+.b16x16_sp_only:
+ movsxd rax, dword ptr arg(3) ;yoffset
+ shl rax, 5
+ add rax, rcx ;VFilter
+
+ mov rdi, arg(4) ;dst_ptr
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+
+ movdqa xmm1, [rax]
+ movdqa xmm2, [rax+16]
+
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8]
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+
+ pxor xmm0, xmm0
+
+ ; get the first horizontal line done
+ movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+
+ add rsi, rax ; next line
+.next_row_spo:
+ movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+
+ movdqa xmm5, xmm7
+ movdqa xmm6, xmm7
+
+ movdqa xmm4, xmm3 ; make a copy of current line
+ movdqa xmm7, xmm3
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+ punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw xmm4, xmm0
+
+ pmullw xmm5, xmm1
+ pmullw xmm6, xmm1
+ pmullw xmm3, xmm2
+ pmullw xmm4, xmm2
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ packuswb xmm3, xmm4
+ movdqa [rdi], xmm3 ; store the results in the destination
+
+ add rsi, rax ; next line
+ add rdi, rdx ;dst_pitch
+ cmp rdi, rcx
+ jne .next_row_spo
+
+ jmp .done
+
+.b16x16_fp_only:
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8]
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ pxor xmm0, xmm0
+
+.next_row_fpo:
+ movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movdqa xmm4, xmm3 ; make a copy of current line
+
+ punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw xmm4, xmm0
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm1
+
+ movdqu xmm5, [rsi+1]
+ movdqa xmm6, xmm5
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+
+ pmullw xmm5, xmm2
+ pmullw xmm6, xmm2
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ packuswb xmm3, xmm4
+ movdqa [rdi], xmm3 ; store the results in the destination
+
+ add rsi, rax ; next line
+ add rdi, rdx ; dst_pitch
+ cmp rdi, rcx
+ jne .next_row_fpo
+
+.done:
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_bilinear_predict8x8_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+global sym(vp8_bilinear_predict8x8_sse2) PRIVATE
+sym(vp8_bilinear_predict8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 144 ; reserve 144 bytes
+
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
+
+ ;Read 9-line unaligned data in and put them on stack. This gives a big
+ ;performance boost.
+ movdqu xmm0, [rsi]
+ lea rax, [rdx + rdx*2]
+ movdqu xmm1, [rsi+rdx]
+ movdqu xmm2, [rsi+rdx*2]
+ add rsi, rax
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi+rdx]
+ movdqu xmm5, [rsi+rdx*2]
+ add rsi, rax
+ movdqu xmm6, [rsi]
+ movdqu xmm7, [rsi+rdx]
+
+ movdqa XMMWORD PTR [rsp], xmm0
+
+ movdqu xmm0, [rsi+rdx*2]
+
+ movdqa XMMWORD PTR [rsp+16], xmm1
+ movdqa XMMWORD PTR [rsp+32], xmm2
+ movdqa XMMWORD PTR [rsp+48], xmm3
+ movdqa XMMWORD PTR [rsp+64], xmm4
+ movdqa XMMWORD PTR [rsp+80], xmm5
+ movdqa XMMWORD PTR [rsp+96], xmm6
+ movdqa XMMWORD PTR [rsp+112], xmm7
+ movdqa XMMWORD PTR [rsp+128], xmm0
+
+ movsxd rax, dword ptr arg(2) ;xoffset
+ shl rax, 5
+ add rax, rcx ;HFilter
+
+ mov rdi, arg(4) ;dst_ptr
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+
+ movdqa xmm1, [rax]
+ movdqa xmm2, [rax+16]
+
+ movsxd rax, dword ptr arg(3) ;yoffset
+ shl rax, 5
+ add rax, rcx ;VFilter
+
+ lea rcx, [rdi+rdx*8]
+
+ movdqa xmm5, [rax]
+ movdqa xmm6, [rax+16]
+
+ pxor xmm0, xmm0
+
+ ; get the first horizontal line done
+ movdqa xmm3, XMMWORD PTR [rsp]
+ movdqa xmm4, xmm3 ; make a copy of current line
+ psrldq xmm4, 1
+
+ punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
+ punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm2
+
+ paddw xmm3, xmm4
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ movdqa xmm7, xmm3
+ add rsp, 16 ; next line
+.next_row8x8:
+ movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ movdqa xmm4, xmm3 ; make a copy of current line
+ psrldq xmm4, 1
+
+ punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
+ punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm2
+
+ paddw xmm3, xmm4
+ pmullw xmm7, xmm5
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ movdqa xmm4, xmm3
+
+ pmullw xmm3, xmm6
+ paddw xmm3, xmm7
+
+ movdqa xmm7, xmm4
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ packuswb xmm3, xmm0
+ movq [rdi], xmm3 ; store the results in the destination
+
+ add rsp, 16 ; next line
+ add rdi, rdx
+
+ cmp rdi, rcx
+ jne .next_row8x8
+
+ ;add rsp, 144
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+align 16
+rd:
+ times 8 dw 0x40
diff --git a/thirdparty/libvpx/vp8/common/x86/subpixel_ssse3.asm b/thirdparty/libvpx/vp8/common/x86/subpixel_ssse3.asm
new file mode 100644
index 0000000000..c06f24556e
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/x86/subpixel_ssse3.asm
@@ -0,0 +1,1508 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define VP8_FILTER_WEIGHT 128
+%define VP8_FILTER_SHIFT 7
+
+
+;/************************************************************************************
+; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows. This routine assumes that output_height is an
+; even number. This function handles 8 pixels in horizontal direction, calculating ONE
+; rows each iteration to take advantage of the 128 bits operations.
+;
+; This is an implementation of some of the SSE optimizations first seen in ffvp8
+;
+;*************************************************************************************/
+;void vp8_filter_block1d8_h6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; unsigned int vp8_filter_index
+;)
+global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE
+sym(vp8_filter_block1d8_h6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi
+ shl rdx, 4
+
+ movdqa xmm7, [GLOBAL(rd)]
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+ mov rdi, arg(2) ;output_ptr
+
+ cmp esi, DWORD PTR [rax]
+ je vp8_filter_block1d8_h4_ssse3
+
+ movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+ sub rdi, rdx
+;xmm3 free
+.filter_block1d8_h6_rowloop_ssse3:
+ movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
+
+ movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
+
+ punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
+
+ movdqa xmm1, xmm0
+ pmaddubsw xmm0, xmm4
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf2bfrom1)]
+
+ pshufb xmm2, [GLOBAL(shuf3bfrom1)]
+ pmaddubsw xmm1, xmm5
+
+ lea rdi, [rdi + rdx]
+ pmaddubsw xmm2, xmm6
+
+ lea rsi, [rsi + rax]
+ dec rcx
+
+ paddsw xmm0, xmm1
+ paddsw xmm2, xmm7
+
+ paddsw xmm0, xmm2
+
+ psraw xmm0, 7
+
+ packuswb xmm0, xmm0
+
+ movq MMWORD Ptr [rdi], xmm0
+ jnz .filter_block1d8_h6_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+vp8_filter_block1d8_h4_ssse3:
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+
+ movdqa xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
+ movdqa xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
+
+ mov rsi, arg(0) ;src_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+ sub rdi, rdx
+
+.filter_block1d8_h4_rowloop_ssse3:
+ movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
+
+ movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
+
+ punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
+
+ movdqa xmm2, xmm0
+ pshufb xmm0, xmm3
+
+ pshufb xmm2, xmm4
+ pmaddubsw xmm0, xmm5
+
+ lea rdi, [rdi + rdx]
+ pmaddubsw xmm2, xmm6
+
+ lea rsi, [rsi + rax]
+ dec rcx
+
+ paddsw xmm0, xmm7
+
+ paddsw xmm0, xmm2
+
+ psraw xmm0, 7
+
+ packuswb xmm0, xmm0
+
+ movq MMWORD Ptr [rdi], xmm0
+
+ jnz .filter_block1d8_h4_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+;void vp8_filter_block1d16_h6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; unsigned int vp8_filter_index
+;)
+global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE
+sym(vp8_filter_block1d16_h6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi
+ shl rdx, 4 ;
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+
+ mov rdi, arg(2) ;output_ptr
+
+ mov rsi, arg(0) ;src_ptr
+
+ movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+.filter_block1d16_h6_rowloop_ssse3:
+ movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
+
+ movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
+
+ punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
+
+ movdqa xmm1, xmm0
+ pmaddubsw xmm0, xmm4
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf2bfrom1)]
+
+ pshufb xmm2, [GLOBAL(shuf3bfrom1)]
+ movq xmm3, MMWORD PTR [rsi + 6]
+
+ pmaddubsw xmm1, xmm5
+ movq xmm7, MMWORD PTR [rsi + 11]
+
+ pmaddubsw xmm2, xmm6
+ punpcklbw xmm3, xmm7
+
+ paddsw xmm0, xmm1
+ movdqa xmm1, xmm3
+
+ pmaddubsw xmm3, xmm4
+ paddsw xmm0, xmm2
+
+ movdqa xmm2, xmm1
+ paddsw xmm0, [GLOBAL(rd)]
+
+ pshufb xmm1, [GLOBAL(shuf2bfrom1)]
+ pshufb xmm2, [GLOBAL(shuf3bfrom1)]
+
+ psraw xmm0, 7
+ pmaddubsw xmm1, xmm5
+
+ pmaddubsw xmm2, xmm6
+ packuswb xmm0, xmm0
+
+ lea rsi, [rsi + rax]
+ paddsw xmm3, xmm1
+
+ paddsw xmm3, xmm2
+
+ paddsw xmm3, [GLOBAL(rd)]
+
+ psraw xmm3, 7
+
+ packuswb xmm3, xmm3
+
+ punpcklqdq xmm0, xmm3
+
+ movdqa XMMWORD Ptr [rdi], xmm0
+
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .filter_block1d16_h6_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_filter_block1d4_h6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; unsigned int vp8_filter_index
+;)
+global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE
+sym(vp8_filter_block1d4_h6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi
+ shl rdx, 4 ;
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+ movdqa xmm7, [GLOBAL(rd)]
+
+ cmp esi, DWORD PTR [rax]
+ je .vp8_filter_block1d4_h4_ssse3
+
+ movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+;xmm3 free
+.filter_block1d4_h6_rowloop_ssse3:
+ movdqu xmm0, XMMWORD PTR [rsi - 2]
+
+ movdqa xmm1, xmm0
+ pshufb xmm0, [GLOBAL(shuf1b)]
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf2b)]
+ pmaddubsw xmm0, xmm4
+ pshufb xmm2, [GLOBAL(shuf3b)]
+ pmaddubsw xmm1, xmm5
+
+;--
+ pmaddubsw xmm2, xmm6
+
+ lea rsi, [rsi + rax]
+;--
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm7
+ pxor xmm1, xmm1
+ paddsw xmm0, xmm2
+ psraw xmm0, 7
+ packuswb xmm0, xmm0
+
+ movd DWORD PTR [rdi], xmm0
+
+ add rdi, rdx
+ dec rcx
+ jnz .filter_block1d4_h6_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+.vp8_filter_block1d4_h4_ssse3:
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+ movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
+ movdqa xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+.filter_block1d4_h4_rowloop_ssse3:
+ movdqu xmm1, XMMWORD PTR [rsi - 2]
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, xmm0 ;;[GLOBAL(shuf2b)]
+ pshufb xmm2, xmm3 ;;[GLOBAL(shuf3b)]
+ pmaddubsw xmm1, xmm5
+
+;--
+ pmaddubsw xmm2, xmm6
+
+ lea rsi, [rsi + rax]
+;--
+ paddsw xmm1, xmm7
+ paddsw xmm1, xmm2
+ psraw xmm1, 7
+ packuswb xmm1, xmm1
+
+ movd DWORD PTR [rdi], xmm1
+
+ add rdi, rdx
+ dec rcx
+ jnz .filter_block1d4_h4_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;void vp8_filter_block1d16_v6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; unsigned int vp8_filter_index
+;)
+global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE
+sym(vp8_filter_block1d16_v6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi
+ shl rdx, 4 ;
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+
+ cmp esi, DWORD PTR [rax]
+ je .vp8_filter_block1d16_v4_ssse3
+
+ movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+ mov rdi, arg(2) ;output_ptr
+
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ;out_pitch
+%endif
+ mov rax, rsi
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+ add rax, rdx
+
+
+.vp8_filter_block1d16_v6_ssse3_loop:
+ movq xmm1, MMWORD PTR [rsi] ;A
+ movq xmm2, MMWORD PTR [rsi + rdx] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
+
+ pmaddubsw xmm3, xmm6
+ punpcklbw xmm1, xmm0 ;A F
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm5
+
+ paddsw xmm2, xmm3
+ paddsw xmm2, xmm1
+ paddsw xmm2, [GLOBAL(rd)]
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ movq MMWORD PTR [rdi], xmm2 ;store the results
+
+ movq xmm1, MMWORD PTR [rsi + 8] ;A
+ movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F
+ pmaddubsw xmm3, xmm6
+ punpcklbw xmm1, xmm0 ;A F
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm5
+
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw xmm2, xmm3
+ paddsw xmm2, xmm1
+ paddsw xmm2, [GLOBAL(rd)]
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ movq MMWORD PTR [rdi+8], xmm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;out_pitch
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d16_v6_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+.vp8_filter_block1d16_v4_ssse3:
+ movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+ mov rdi, arg(2) ;output_ptr
+
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ;out_pitch
+%endif
+ mov rax, rsi
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+ add rax, rdx
+
+.vp8_filter_block1d16_v4_ssse3_loop:
+ movq xmm2, MMWORD PTR [rsi + rdx] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ pmaddubsw xmm3, xmm6
+ pmaddubsw xmm2, xmm7
+ movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B
+ movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
+
+ paddsw xmm2, [GLOBAL(rd)]
+ paddsw xmm2, xmm3
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ punpcklbw xmm5, xmm4 ;B D
+ punpcklbw xmm1, xmm0 ;C E
+
+ pmaddubsw xmm1, xmm6
+ pmaddubsw xmm5, xmm7
+
+ movdqa xmm4, [GLOBAL(rd)]
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw xmm5, xmm1
+ paddsw xmm5, xmm4
+ psraw xmm5, 7
+ packuswb xmm5, xmm5
+
+ punpcklqdq xmm2, xmm5
+
+ movdqa XMMWORD PTR [rdi], xmm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;out_pitch
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d16_v4_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_filter_block1d8_v6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; unsigned int vp8_filter_index
+;)
+global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE
+sym(vp8_filter_block1d8_v6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi
+ shl rdx, 4 ;
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+ mov rdi, arg(2) ;output_ptr
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ; out_pitch
+%endif
+ movsxd rcx, DWORD PTR arg(4) ;[output_height]
+
+ cmp esi, DWORD PTR [rax]
+ je .vp8_filter_block1d8_v4_ssse3
+
+ movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+
+ mov rax, rsi
+ add rax, rdx
+
+.vp8_filter_block1d8_v6_ssse3_loop:
+ movq xmm1, MMWORD PTR [rsi] ;A
+ movq xmm2, MMWORD PTR [rsi + rdx] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
+ movdqa xmm4, [GLOBAL(rd)]
+
+ pmaddubsw xmm3, xmm6
+ punpcklbw xmm1, xmm0 ;A F
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm5
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw xmm2, xmm3
+ paddsw xmm2, xmm1
+ paddsw xmm2, xmm4
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ movq MMWORD PTR [rdi], xmm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[out_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d8_v6_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+.vp8_filter_block1d8_v4_ssse3:
+ movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
+ movdqa xmm5, [GLOBAL(rd)]
+
+ mov rsi, arg(0) ;src_ptr
+
+ mov rax, rsi
+ add rax, rdx
+
+.vp8_filter_block1d8_v4_ssse3_loop:
+ movq xmm2, MMWORD PTR [rsi + rdx] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ pmaddubsw xmm3, xmm6
+ pmaddubsw xmm2, xmm7
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw xmm2, xmm3
+ paddsw xmm2, xmm5
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ movq MMWORD PTR [rdi], xmm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[out_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d8_v4_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+;void vp8_filter_block1d4_v6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; unsigned int vp8_filter_index
+;)
+global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE
+sym(vp8_filter_block1d4_v6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi
+ shl rdx, 4 ;
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+ mov rdi, arg(2) ;output_ptr
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ; out_pitch
+%endif
+ movsxd rcx, DWORD PTR arg(4) ;[output_height]
+
+ cmp esi, DWORD PTR [rax]
+ je .vp8_filter_block1d4_v4_ssse3
+
+ movq mm5, MMWORD PTR [rax] ;k0_k5
+ movq mm6, MMWORD PTR [rax+256] ;k2_k4
+ movq mm7, MMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+
+ mov rax, rsi
+ add rax, rdx
+
+.vp8_filter_block1d4_v6_ssse3_loop:
+ movd mm1, DWORD PTR [rsi] ;A
+ movd mm2, DWORD PTR [rsi + rdx] ;B
+ movd mm3, DWORD PTR [rsi + rdx * 2] ;C
+ movd mm4, DWORD PTR [rax + rdx * 2] ;D
+ movd mm0, DWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw mm2, mm4 ;B D
+ punpcklbw mm3, mm0 ;C E
+
+ movd mm0, DWORD PTR [rax + rdx * 4] ;F
+
+ movq mm4, [GLOBAL(rd)]
+
+ pmaddubsw mm3, mm6
+ punpcklbw mm1, mm0 ;A F
+ pmaddubsw mm2, mm7
+ pmaddubsw mm1, mm5
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw mm2, mm3
+ paddsw mm2, mm1
+ paddsw mm2, mm4
+ psraw mm2, 7
+ packuswb mm2, mm2
+
+ movd DWORD PTR [rdi], mm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[out_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d4_v6_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+.vp8_filter_block1d4_v4_ssse3:
+ movq mm6, MMWORD PTR [rax+256] ;k2_k4
+ movq mm7, MMWORD PTR [rax+128] ;k1_k3
+ movq mm5, MMWORD PTR [GLOBAL(rd)]
+
+ mov rsi, arg(0) ;src_ptr
+
+ mov rax, rsi
+ add rax, rdx
+
+.vp8_filter_block1d4_v4_ssse3_loop:
+ movd mm2, DWORD PTR [rsi + rdx] ;B
+ movd mm3, DWORD PTR [rsi + rdx * 2] ;C
+ movd mm4, DWORD PTR [rax + rdx * 2] ;D
+ movd mm0, DWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw mm2, mm4 ;B D
+ punpcklbw mm3, mm0 ;C E
+
+ pmaddubsw mm3, mm6
+ pmaddubsw mm2, mm7
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw mm2, mm3
+ paddsw mm2, mm5
+ psraw mm2, 7
+ packuswb mm2, mm2
+
+ movd DWORD PTR [rdi], mm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[out_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d4_v4_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_bilinear_predict16x16_ssse3
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE
+sym(vp8_bilinear_predict16x16_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
+ movsxd rax, dword ptr arg(2) ; xoffset
+
+ cmp rax, 0 ; skip first_pass filter if xoffset=0
+ je .b16x16_sp_only
+
+ shl rax, 4
+ lea rax, [rax + rcx] ; HFilter
+
+ mov rdi, arg(4) ; dst_ptr
+ mov rsi, arg(0) ; src_ptr
+ movsxd rdx, dword ptr arg(5) ; dst_pitch
+
+ movdqa xmm1, [rax]
+
+ movsxd rax, dword ptr arg(3) ; yoffset
+
+ cmp rax, 0 ; skip second_pass filter if yoffset=0
+ je .b16x16_fp_only
+
+ shl rax, 4
+ lea rax, [rax + rcx] ; VFilter
+
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8]
+ movsxd rdx, dword ptr arg(1) ; src_pixels_per_line
+
+ movdqa xmm2, [rax]
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ; dst_pitch
+%endif
+ movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07
+ movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
+
+ punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
+ movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
+
+ movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
+
+ lea rsi, [rsi + rdx] ; next line
+
+ pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14
+
+ punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
+ pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value
+ psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128
+
+ movdqa xmm7, xmm3
+ packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+
+.next_row:
+ movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07
+ movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
+
+ punpcklbw xmm6, xmm5
+ movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
+
+ movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
+ lea rsi, [rsi + rdx] ; next line
+
+ pmaddubsw xmm6, xmm1
+
+ punpcklbw xmm4, xmm5
+ pmaddubsw xmm4, xmm1
+
+ paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
+ psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128
+
+ paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value
+ psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128
+
+ packuswb xmm6, xmm4
+ movdqa xmm5, xmm7
+
+ punpcklbw xmm5, xmm6
+ pmaddubsw xmm5, xmm2
+
+ punpckhbw xmm7, xmm6
+ pmaddubsw xmm7, xmm2
+
+ paddw xmm5, [GLOBAL(rd)] ; xmm5 += round value
+ psraw xmm5, VP8_FILTER_SHIFT ; xmm5 /= 128
+
+ paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
+ psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128
+
+ packuswb xmm5, xmm7
+ movdqa xmm7, xmm6
+
+ movdqa [rdi], xmm5 ; store the results in the destination
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(5) ; dst_pitch
+%else
+ add rdi, r8
+%endif
+
+ cmp rdi, rcx
+ jne .next_row
+
+ jmp .done
+
+.b16x16_sp_only:
+ movsxd rax, dword ptr arg(3) ; yoffset
+ shl rax, 4
+ lea rax, [rax + rcx] ; VFilter
+
+ mov rdi, arg(4) ; dst_ptr
+ mov rsi, arg(0) ; src_ptr
+ movsxd rdx, dword ptr arg(5) ; dst_pitch
+
+ movdqa xmm1, [rax] ; VFilter
+
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8]
+ movsxd rax, dword ptr arg(1) ; src_pixels_per_line
+
+ ; get the first horizontal line done
+ movq xmm4, [rsi] ; load row 0
+ movq xmm2, [rsi + 8] ; load row 0
+
+ lea rsi, [rsi + rax] ; next line
+.next_row_sp:
+ movq xmm3, [rsi] ; load row + 1
+ movq xmm5, [rsi + 8] ; load row + 1
+
+ punpcklbw xmm4, xmm3
+ punpcklbw xmm2, xmm5
+
+ pmaddubsw xmm4, xmm1
+ movq xmm7, [rsi + rax] ; load row + 2
+
+ pmaddubsw xmm2, xmm1
+ movq xmm6, [rsi + rax + 8] ; load row + 2
+
+ punpcklbw xmm3, xmm7
+ punpcklbw xmm5, xmm6
+
+ pmaddubsw xmm3, xmm1
+ paddw xmm4, [GLOBAL(rd)]
+
+ pmaddubsw xmm5, xmm1
+ paddw xmm2, [GLOBAL(rd)]
+
+ psraw xmm4, VP8_FILTER_SHIFT
+ psraw xmm2, VP8_FILTER_SHIFT
+
+ packuswb xmm4, xmm2
+ paddw xmm3, [GLOBAL(rd)]
+
+ movdqa [rdi], xmm4 ; store row 0
+ paddw xmm5, [GLOBAL(rd)]
+
+ psraw xmm3, VP8_FILTER_SHIFT
+ psraw xmm5, VP8_FILTER_SHIFT
+
+ packuswb xmm3, xmm5
+ movdqa xmm4, xmm7
+
+ movdqa [rdi + rdx],xmm3 ; store row 1
+ lea rsi, [rsi + 2*rax]
+
+ movdqa xmm2, xmm6
+ lea rdi, [rdi + 2*rdx]
+
+ cmp rdi, rcx
+ jne .next_row_sp
+
+ jmp .done
+
+.b16x16_fp_only:
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8]
+ movsxd rax, dword ptr arg(1) ; src_pixels_per_line
+
+.next_row_fp:
+ movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07
+ movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08
+
+ punpcklbw xmm2, xmm4
+ movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15
+
+ pmaddubsw xmm2, xmm1
+ movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16
+
+ lea rsi, [rsi + rax] ; next line
+ punpcklbw xmm3, xmm4
+
+ pmaddubsw xmm3, xmm1
+ movq xmm5, [rsi]
+
+ paddw xmm2, [GLOBAL(rd)]
+ movq xmm7, [rsi+1]
+
+ movq xmm6, [rsi+8]
+ psraw xmm2, VP8_FILTER_SHIFT
+
+ punpcklbw xmm5, xmm7
+ movq xmm7, [rsi+9]
+
+ paddw xmm3, [GLOBAL(rd)]
+ pmaddubsw xmm5, xmm1
+
+ psraw xmm3, VP8_FILTER_SHIFT
+ punpcklbw xmm6, xmm7
+
+ packuswb xmm2, xmm3
+ pmaddubsw xmm6, xmm1
+
+ movdqa [rdi], xmm2 ; store the results in the destination
+ paddw xmm5, [GLOBAL(rd)]
+
+ lea rdi, [rdi + rdx] ; dst_pitch
+ psraw xmm5, VP8_FILTER_SHIFT
+
+ paddw xmm6, [GLOBAL(rd)]
+ psraw xmm6, VP8_FILTER_SHIFT
+
+ packuswb xmm5, xmm6
+ lea rsi, [rsi + rax] ; next line
+
+ movdqa [rdi], xmm5 ; store the results in the destination
+ lea rdi, [rdi + rdx] ; dst_pitch
+
+ cmp rdi, rcx
+
+ jne .next_row_fp
+
+.done:
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_bilinear_predict8x8_ssse3
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE
+sym(vp8_bilinear_predict8x8_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 144 ; reserve 144 bytes
+
+ lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
+
+ ;Read 9-line unaligned data in and put them on stack. This gives a big
+ ;performance boost.
+ movdqu xmm0, [rsi]
+ lea rax, [rdx + rdx*2]
+ movdqu xmm1, [rsi+rdx]
+ movdqu xmm2, [rsi+rdx*2]
+ add rsi, rax
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi+rdx]
+ movdqu xmm5, [rsi+rdx*2]
+ add rsi, rax
+ movdqu xmm6, [rsi]
+ movdqu xmm7, [rsi+rdx]
+
+ movdqa XMMWORD PTR [rsp], xmm0
+
+ movdqu xmm0, [rsi+rdx*2]
+
+ movdqa XMMWORD PTR [rsp+16], xmm1
+ movdqa XMMWORD PTR [rsp+32], xmm2
+ movdqa XMMWORD PTR [rsp+48], xmm3
+ movdqa XMMWORD PTR [rsp+64], xmm4
+ movdqa XMMWORD PTR [rsp+80], xmm5
+ movdqa XMMWORD PTR [rsp+96], xmm6
+ movdqa XMMWORD PTR [rsp+112], xmm7
+ movdqa XMMWORD PTR [rsp+128], xmm0
+
+ movsxd rax, dword ptr arg(2) ; xoffset
+ cmp rax, 0 ; skip first_pass filter if xoffset=0
+ je .b8x8_sp_only
+
+ shl rax, 4
+ add rax, rcx ; HFilter
+
+ mov rdi, arg(4) ; dst_ptr
+ movsxd rdx, dword ptr arg(5) ; dst_pitch
+
+ movdqa xmm0, [rax]
+
+ movsxd rax, dword ptr arg(3) ; yoffset
+ cmp rax, 0 ; skip second_pass filter if yoffset=0
+ je .b8x8_fp_only
+
+ shl rax, 4
+ lea rax, [rax + rcx] ; VFilter
+
+ lea rcx, [rdi+rdx*8]
+
+ movdqa xmm1, [rax]
+
+ ; get the first horizontal line done
+ movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
+
+ psrldq xmm5, 1
+ lea rsp, [rsp + 16] ; next line
+
+ punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
+ pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ movdqa xmm7, xmm3
+ packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+
+.next_row:
+ movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ lea rsp, [rsp + 16] ; next line
+
+ movdqa xmm5, xmm6
+
+ psrldq xmm5, 1
+
+ punpcklbw xmm6, xmm5
+ pmaddubsw xmm6, xmm0
+
+ paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
+ psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128
+
+ packuswb xmm6, xmm6
+
+ punpcklbw xmm7, xmm6
+ pmaddubsw xmm7, xmm1
+
+ paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
+ psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128
+
+ packuswb xmm7, xmm7
+
+ movq [rdi], xmm7 ; store the results in the destination
+ lea rdi, [rdi + rdx]
+
+ movdqa xmm7, xmm6
+
+ cmp rdi, rcx
+ jne .next_row
+
+ jmp .done8x8
+
+.b8x8_sp_only:
+ movsxd rax, dword ptr arg(3) ; yoffset
+ shl rax, 4
+ lea rax, [rax + rcx] ; VFilter
+
+ mov rdi, arg(4) ;dst_ptr
+ movsxd rdx, dword ptr arg(5) ; dst_pitch
+
+ movdqa xmm0, [rax] ; VFilter
+
+ movq xmm1, XMMWORD PTR [rsp]
+ movq xmm2, XMMWORD PTR [rsp+16]
+
+ movq xmm3, XMMWORD PTR [rsp+32]
+ punpcklbw xmm1, xmm2
+
+ movq xmm4, XMMWORD PTR [rsp+48]
+ punpcklbw xmm2, xmm3
+
+ movq xmm5, XMMWORD PTR [rsp+64]
+ punpcklbw xmm3, xmm4
+
+ movq xmm6, XMMWORD PTR [rsp+80]
+ punpcklbw xmm4, xmm5
+
+ movq xmm7, XMMWORD PTR [rsp+96]
+ punpcklbw xmm5, xmm6
+
+ pmaddubsw xmm1, xmm0
+ pmaddubsw xmm2, xmm0
+
+ pmaddubsw xmm3, xmm0
+ pmaddubsw xmm4, xmm0
+
+ pmaddubsw xmm5, xmm0
+ punpcklbw xmm6, xmm7
+
+ pmaddubsw xmm6, xmm0
+ paddw xmm1, [GLOBAL(rd)]
+
+ paddw xmm2, [GLOBAL(rd)]
+ psraw xmm1, VP8_FILTER_SHIFT
+
+ paddw xmm3, [GLOBAL(rd)]
+ psraw xmm2, VP8_FILTER_SHIFT
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm3, VP8_FILTER_SHIFT
+
+ paddw xmm5, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ paddw xmm6, [GLOBAL(rd)]
+ psraw xmm5, VP8_FILTER_SHIFT
+
+ psraw xmm6, VP8_FILTER_SHIFT
+ packuswb xmm1, xmm1
+
+ packuswb xmm2, xmm2
+ movq [rdi], xmm1
+
+ packuswb xmm3, xmm3
+ movq [rdi+rdx], xmm2
+
+ packuswb xmm4, xmm4
+ movq xmm1, XMMWORD PTR [rsp+112]
+
+ lea rdi, [rdi + 2*rdx]
+ movq xmm2, XMMWORD PTR [rsp+128]
+
+ packuswb xmm5, xmm5
+ movq [rdi], xmm3
+
+ packuswb xmm6, xmm6
+ movq [rdi+rdx], xmm4
+
+ lea rdi, [rdi + 2*rdx]
+ punpcklbw xmm7, xmm1
+
+ movq [rdi], xmm5
+ pmaddubsw xmm7, xmm0
+
+ movq [rdi+rdx], xmm6
+ punpcklbw xmm1, xmm2
+
+ pmaddubsw xmm1, xmm0
+ paddw xmm7, [GLOBAL(rd)]
+
+ psraw xmm7, VP8_FILTER_SHIFT
+ paddw xmm1, [GLOBAL(rd)]
+
+ psraw xmm1, VP8_FILTER_SHIFT
+ packuswb xmm7, xmm7
+
+ packuswb xmm1, xmm1
+ lea rdi, [rdi + 2*rdx]
+
+ movq [rdi], xmm7
+
+ movq [rdi+rdx], xmm1
+ lea rsp, [rsp + 144]
+
+ jmp .done8x8
+
+.b8x8_fp_only:
+ lea rcx, [rdi+rdx*8]
+
+.next_row_fp:
+ movdqa xmm1, XMMWORD PTR [rsp]
+ movdqa xmm3, XMMWORD PTR [rsp+16]
+
+ movdqa xmm2, xmm1
+ movdqa xmm5, XMMWORD PTR [rsp+32]
+
+ psrldq xmm2, 1
+ movdqa xmm7, XMMWORD PTR [rsp+48]
+
+ movdqa xmm4, xmm3
+ psrldq xmm4, 1
+
+ movdqa xmm6, xmm5
+ psrldq xmm6, 1
+
+ punpcklbw xmm1, xmm2
+ pmaddubsw xmm1, xmm0
+
+ punpcklbw xmm3, xmm4
+ pmaddubsw xmm3, xmm0
+
+ punpcklbw xmm5, xmm6
+ pmaddubsw xmm5, xmm0
+
+ movdqa xmm2, xmm7
+ psrldq xmm2, 1
+
+ punpcklbw xmm7, xmm2
+ pmaddubsw xmm7, xmm0
+
+ paddw xmm1, [GLOBAL(rd)]
+ psraw xmm1, VP8_FILTER_SHIFT
+
+ paddw xmm3, [GLOBAL(rd)]
+ psraw xmm3, VP8_FILTER_SHIFT
+
+ paddw xmm5, [GLOBAL(rd)]
+ psraw xmm5, VP8_FILTER_SHIFT
+
+ paddw xmm7, [GLOBAL(rd)]
+ psraw xmm7, VP8_FILTER_SHIFT
+
+ packuswb xmm1, xmm1
+ packuswb xmm3, xmm3
+
+ packuswb xmm5, xmm5
+ movq [rdi], xmm1
+
+ packuswb xmm7, xmm7
+ movq [rdi+rdx], xmm3
+
+ lea rdi, [rdi + 2*rdx]
+ movq [rdi], xmm5
+
+ lea rsp, [rsp + 4*16]
+ movq [rdi+rdx], xmm7
+
+ lea rdi, [rdi + 2*rdx]
+ cmp rdi, rcx
+
+ jne .next_row_fp
+
+ lea rsp, [rsp + 16]
+
+.done8x8:
+ ;add rsp, 144
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+shuf1b:
+ db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
+shuf2b:
+ db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
+shuf3b:
+ db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
+
+align 16
+shuf2bfrom1:
+ db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
+align 16
+shuf3bfrom1:
+ db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
+
+align 16
+rd:
+ times 8 dw 0x40
+
+align 16
+k0_k5:
+ times 8 db 0, 0 ;placeholder
+ times 8 db 0, 0
+ times 8 db 2, 1
+ times 8 db 0, 0
+ times 8 db 3, 3
+ times 8 db 0, 0
+ times 8 db 1, 2
+ times 8 db 0, 0
+k1_k3:
+ times 8 db 0, 0 ;placeholder
+ times 8 db -6, 12
+ times 8 db -11, 36
+ times 8 db -9, 50
+ times 8 db -16, 77
+ times 8 db -6, 93
+ times 8 db -8, 108
+ times 8 db -1, 123
+k2_k4:
+ times 8 db 128, 0 ;placeholder
+ times 8 db 123, -1
+ times 8 db 108, -8
+ times 8 db 93, -6
+ times 8 db 77, -16
+ times 8 db 50, -9
+ times 8 db 36, -11
+ times 8 db 12, -6
+align 16
+vp8_bilinear_filters_ssse3:
+ times 8 db 128, 0
+ times 8 db 112, 16
+ times 8 db 96, 32
+ times 8 db 80, 48
+ times 8 db 64, 64
+ times 8 db 48, 80
+ times 8 db 32, 96
+ times 8 db 16, 112
+
diff --git a/thirdparty/libvpx/vp8/common/x86/vp8_asm_stubs.c b/thirdparty/libvpx/vp8/common/x86/vp8_asm_stubs.c
new file mode 100644
index 0000000000..fb0b57eb1c
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/x86/vp8_asm_stubs.c
@@ -0,0 +1,625 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "filter_x86.h"
+
+extern const short vp8_six_tap_mmx[8][6*8];
+
+extern void vp8_filter_block1d_h6_mmx
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1dc_v6_mmx
+(
+ unsigned short *src_ptr,
+ unsigned char *output_ptr,
+ int output_pitch,
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d8_h6_sse2
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d16_h6_sse2
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d8_v6_sse2
+(
+ unsigned short *src_ptr,
+ unsigned char *output_ptr,
+ int dst_ptich,
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d16_v6_sse2
+(
+ unsigned short *src_ptr,
+ unsigned char *output_ptr,
+ int dst_ptich,
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_unpack_block1d16_h6_sse2
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_height,
+ unsigned int output_width
+);
+extern void vp8_filter_block1d8_h6_only_sse2
+(
+ unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ int dst_ptich,
+ unsigned int output_height,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d16_h6_only_sse2
+(
+ unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ int dst_ptich,
+ unsigned int output_height,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d8_v6_only_sse2
+(
+ unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ int dst_ptich,
+ unsigned int output_height,
+ const short *vp8_filter
+);
+
+
+#if HAVE_MMX
+void vp8_sixtap_predict4x4_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED(16, unsigned short, FData2[16*16]); /* Temp data bufffer used in filtering */
+ const short *HFilter, *VFilter;
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter);
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4 , 4, 4, VFilter);
+
+}
+
+
+void vp8_sixtap_predict16x16_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+
+ DECLARE_ALIGNED(16, unsigned short, FData2[24*24]); /* Temp data bufffer used in filtering */
+
+ const short *HFilter, *VFilter;
+
+
+ HFilter = vp8_six_tap_mmx[xoffset];
+
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter);
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 21, 32, HFilter);
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, FData2 + 8, src_pixels_per_line, 1, 21, 32, HFilter);
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, FData2 + 12, src_pixels_per_line, 1, 21, 32, HFilter);
+
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, 16, VFilter);
+ vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16 , 16, 16, VFilter);
+ vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16 , 16, 16, VFilter);
+ vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16 , 16, 16, VFilter);
+
+}
+
+
+void vp8_sixtap_predict8x8_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+
+ DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data bufffer used in filtering */
+
+ const short *HFilter, *VFilter;
+
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter);
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 13, 16, HFilter);
+
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, 8, VFilter);
+ vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 8, 8, VFilter);
+
+}
+
+
+void vp8_sixtap_predict8x4_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+
+ DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data bufffer used in filtering */
+
+ const short *HFilter, *VFilter;
+
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter);
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 9, 16, HFilter);
+
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, 8, VFilter);
+ vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 4, 8, VFilter);
+
+}
+
+
+
+void vp8_bilinear_predict16x16_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ vp8_bilinear_predict8x8_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pitch);
+ vp8_bilinear_predict8x8_mmx(src_ptr + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + 8, dst_pitch);
+ vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8, dst_pitch);
+ vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8 + 8, dst_pitch);
+}
+#endif
+
+
+#if HAVE_SSE2
+void vp8_sixtap_predict16x16_sse2
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+
+)
+{
+ DECLARE_ALIGNED(16, unsigned short, FData2[24*24]); /* Temp data bufffer used in filtering */
+
+ const short *HFilter, *VFilter;
+
+ if (xoffset)
+ {
+ if (yoffset)
+ {
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter);
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter);
+ }
+ else
+ {
+ /* First-pass only */
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, HFilter);
+ }
+ }
+ else
+ {
+ /* Second-pass only */
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 21, 32);
+ vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter);
+ }
+}
+
+
+void vp8_sixtap_predict8x8_sse2
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data bufffer used in filtering */
+ const short *HFilter, *VFilter;
+
+ if (xoffset)
+ {
+ if (yoffset)
+ {
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter);
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, dst_pitch, VFilter);
+ }
+ else
+ {
+ /* First-pass only */
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, HFilter);
+ }
+ }
+ else
+ {
+ /* Second-pass only */
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, VFilter);
+ }
+}
+
+
+void vp8_sixtap_predict8x4_sse2
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data bufffer used in filtering */
+ const short *HFilter, *VFilter;
+
+ if (xoffset)
+ {
+ if (yoffset)
+ {
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter);
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, dst_pitch, VFilter);
+ }
+ else
+ {
+ /* First-pass only */
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, HFilter);
+ }
+ }
+ else
+ {
+ /* Second-pass only */
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, VFilter);
+ }
+}
+
+#endif
+
+#if HAVE_SSSE3
+
+extern void vp8_filter_block1d8_h6_ssse3
+(
+ unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index
+);
+
+extern void vp8_filter_block1d16_h6_ssse3
+(
+ unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index
+);
+
+extern void vp8_filter_block1d16_v6_ssse3
+(
+ unsigned char *src_ptr,
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index
+);
+
+extern void vp8_filter_block1d8_v6_ssse3
+(
+ unsigned char *src_ptr,
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index
+);
+
+extern void vp8_filter_block1d4_h6_ssse3
+(
+ unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index
+);
+
+extern void vp8_filter_block1d4_v6_ssse3
+(
+ unsigned char *src_ptr,
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index
+);
+
+void vp8_sixtap_predict16x16_ssse3
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+
+)
+{
+ DECLARE_ALIGNED(16, unsigned char, FData2[24*24]);
+
+ if (xoffset)
+ {
+ if (yoffset)
+ {
+ vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line, FData2,
+ 16, 21, xoffset);
+ vp8_filter_block1d16_v6_ssse3(FData2 , 16, dst_ptr, dst_pitch,
+ 16, yoffset);
+ }
+ else
+ {
+ /* First-pass only */
+ vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 16, xoffset);
+ }
+ }
+ else
+ {
+ if (yoffset)
+ {
+ /* Second-pass only */
+ vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 16, yoffset);
+ }
+ else
+ {
+ /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
+ * yoffset==0) case correctly. Add copy function here to guarantee
+ * six-tap function handles all possible offsets. */
+ vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
+ }
+ }
+}
+
+void vp8_sixtap_predict8x8_ssse3
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED(16, unsigned char, FData2[256]);
+
+ if (xoffset)
+ {
+ if (yoffset)
+ {
+ vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line, FData2,
+ 8, 13, xoffset);
+ vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch,
+ 8, yoffset);
+ }
+ else
+ {
+ vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 8, xoffset);
+ }
+ }
+ else
+ {
+ if (yoffset)
+ {
+ /* Second-pass only */
+ vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 8, yoffset);
+ }
+ else
+ {
+ /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
+ * yoffset==0) case correctly. Add copy function here to guarantee
+ * six-tap function handles all possible offsets. */
+ vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
+ }
+ }
+}
+
+
+void vp8_sixtap_predict8x4_ssse3
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED(16, unsigned char, FData2[256]);
+
+ if (xoffset)
+ {
+ if (yoffset)
+ {
+ vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line, FData2,
+ 8, 9, xoffset);
+ vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch,
+ 4, yoffset);
+ }
+ else
+ {
+ /* First-pass only */
+ vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, xoffset);
+ }
+ }
+ else
+ {
+ if (yoffset)
+ {
+ /* Second-pass only */
+ vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, yoffset);
+ }
+ else
+ {
+ /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
+ * yoffset==0) case correctly. Add copy function here to guarantee
+ * six-tap function handles all possible offsets. */
+ vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
+ }
+ }
+}
+
+void vp8_sixtap_predict4x4_ssse3
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED(16, unsigned char, FData2[4*9]);
+
+ if (xoffset)
+ {
+ if (yoffset)
+ {
+ vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ FData2, 4, 9, xoffset);
+ vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch,
+ 4, yoffset);
+ }
+ else
+ {
+ vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, xoffset);
+ }
+ }
+ else
+ {
+ if (yoffset)
+ {
+ vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, yoffset);
+ }
+ else
+ {
+ /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
+ * yoffset==0) case correctly. Add copy function here to guarantee
+ * six-tap function handles all possible offsets. */
+ int r;
+
+ for (r = 0; r < 4; r++)
+ {
+ dst_ptr[0] = src_ptr[0];
+ dst_ptr[1] = src_ptr[1];
+ dst_ptr[2] = src_ptr[2];
+ dst_ptr[3] = src_ptr[3];
+ dst_ptr += dst_pitch;
+ src_ptr += src_pixels_per_line;
+ }
+ }
+ }
+}
+
+#endif
diff --git a/thirdparty/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm b/thirdparty/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm
new file mode 100644
index 0000000000..88a07b9f3f
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm
@@ -0,0 +1,1753 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp8_loop_filter_horizontal_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_loop_filter_horizontal_edge_mmx) PRIVATE
+sym(vp8_loop_filter_horizontal_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+ movsxd rcx, dword ptr arg(5) ;count
+.next8_h:
+ mov rdx, arg(3) ;limit
+ movq mm7, [rdx]
+ mov rdi, rsi ; rdi points to row +1 for indirect addressing
+ add rdi, rax
+
+ ; calculate breakout conditions
+ movq mm2, [rdi+2*rax] ; q3
+ movq mm1, [rsi+2*rax] ; q2
+ movq mm6, mm1 ; q2
+ psubusb mm1, mm2 ; q2-=q3
+ psubusb mm2, mm6 ; q3-=q2
+ por mm1, mm2 ; abs(q3-q2)
+ psubusb mm1, mm7 ;
+
+
+ movq mm4, [rsi+rax] ; q1
+ movq mm3, mm4 ; q1
+ psubusb mm4, mm6 ; q1-=q2
+ psubusb mm6, mm3 ; q2-=q1
+ por mm4, mm6 ; abs(q2-q1)
+
+ psubusb mm4, mm7
+ por mm1, mm4
+
+ movq mm4, [rsi] ; q0
+ movq mm0, mm4 ; q0
+ psubusb mm4, mm3 ; q0-=q1
+ psubusb mm3, mm0 ; q1-=q0
+ por mm4, mm3 ; abs(q0-q1)
+ movq t0, mm4 ; save to t0
+ psubusb mm4, mm7
+ por mm1, mm4
+
+
+ neg rax ; negate pitch to deal with above border
+
+ movq mm2, [rsi+4*rax] ; p3
+ movq mm4, [rdi+4*rax] ; p2
+ movq mm5, mm4 ; p2
+ psubusb mm4, mm2 ; p2-=p3
+ psubusb mm2, mm5 ; p3-=p2
+ por mm4, mm2 ; abs(p3 - p2)
+ psubusb mm4, mm7
+ por mm1, mm4
+
+
+ movq mm4, [rsi+2*rax] ; p1
+ movq mm3, mm4 ; p1
+ psubusb mm4, mm5 ; p1-=p2
+ psubusb mm5, mm3 ; p2-=p1
+ por mm4, mm5 ; abs(p2 - p1)
+ psubusb mm4, mm7
+ por mm1, mm4
+
+ movq mm2, mm3 ; p1
+
+ movq mm4, [rsi+rax] ; p0
+ movq mm5, mm4 ; p0
+ psubusb mm4, mm3 ; p0-=p1
+ psubusb mm3, mm5 ; p1-=p0
+ por mm4, mm3 ; abs(p1 - p0)
+ movq t1, mm4 ; save to t1
+ psubusb mm4, mm7
+ por mm1, mm4
+
+ movq mm3, [rdi] ; q1
+ movq mm4, mm3 ; q1
+ psubusb mm3, mm2 ; q1-=p1
+ psubusb mm2, mm4 ; p1-=q1
+ por mm2, mm3 ; abs(p1-q1)
+ pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm2, 1 ; abs(p1-q1)/2
+
+ movq mm6, mm5 ; p0
+ movq mm3, [rsi] ; q0
+ psubusb mm5, mm3 ; p0-=q0
+ psubusb mm3, mm6 ; q0-=p0
+ por mm5, mm3 ; abs(p0 - q0)
+ paddusb mm5, mm5 ; abs(p0-q0)*2
+ paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ mov rdx, arg(2) ;blimit ; get blimit
+ movq mm7, [rdx] ; blimit
+
+ psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ por mm1, mm5
+ pxor mm5, mm5
+ pcmpeqb mm1, mm5 ; mask mm1
+
+ ; calculate high edge variance
+ mov rdx, arg(4) ;thresh ; get thresh
+ movq mm7, [rdx] ;
+ movq mm4, t0 ; get abs (q1 - q0)
+ psubusb mm4, mm7
+ movq mm3, t1 ; get abs (p1 - p0)
+ psubusb mm3, mm7
+ paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+ pcmpeqb mm4, mm5
+
+ pcmpeqb mm5, mm5
+ pxor mm4, mm5
+
+
+ ; start work on filters
+ movq mm2, [rsi+2*rax] ; p1
+ movq mm7, [rdi] ; q1
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+ psubsb mm2, mm7 ; p1 - q1
+ pand mm2, mm4 ; high var mask (hvm)(p1 - q1)
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
+ movq mm3, mm0 ; q0
+ psubsb mm0, mm6 ; q0 - p0
+ paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+ paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+ paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+ pand mm1, mm2 ; mask filter values we don't care about
+ movq mm2, mm1
+ paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+ paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+ pxor mm0, mm0 ;
+ pxor mm5, mm5
+ punpcklbw mm0, mm2 ;
+ punpckhbw mm5, mm2 ;
+ psraw mm0, 11 ;
+ psraw mm5, 11
+ packsswb mm0, mm5
+ movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+ pxor mm0, mm0 ; 0
+ movq mm5, mm1 ; abcdefgh
+ punpcklbw mm0, mm1 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+ pxor mm1, mm1 ; 0
+ punpckhbw mm1, mm5 ; a0b0c0d0
+ psraw mm1, 11 ; sign extended shift right by 3
+ movq mm5, mm0 ; save results
+
+ packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+ paddsw mm5, [GLOBAL(ones)]
+ paddsw mm1, [GLOBAL(ones)]
+ psraw mm5, 1 ; partial shifted one more time for 2nd tap
+ psraw mm1, 1 ; partial shifted one more time for 2nd tap
+ packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+ pandn mm4, mm5 ; high edge variance additive
+
+ paddsb mm6, mm2 ; p0+= p0 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+ movq [rsi+rax], mm6 ; write back
+
+ movq mm6, [rsi+2*rax] ; p1
+ pxor mm6, [GLOBAL(t80)] ; reoffset
+ paddsb mm6, mm4 ; p1+= p1 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+ movq [rsi+2*rax], mm6 ; write back
+
+ psubsb mm3, mm0 ; q0-= q0 add
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+ movq [rsi], mm3 ; write back
+
+ psubsb mm7, mm4 ; q1-= q1 add
+ pxor mm7, [GLOBAL(t80)] ; unoffset
+ movq [rdi], mm7 ; write back
+
+ add rsi,8
+ neg rax
+ dec rcx
+ jnz .next8_h
+
+ add rsp, 32
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_vertical_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_loop_filter_vertical_edge_mmx) PRIVATE
+sym(vp8_loop_filter_vertical_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 64 ; reserve 64 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
+ %define srct [rsp + 32] ;__declspec(align(16)) char srct[32];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+ lea rsi, [rsi + rax*4 - 4]
+
+ movsxd rcx, dword ptr arg(5) ;count
+.next8_v:
+ mov rdi, rsi ; rdi points to row +1 for indirect addressing
+ add rdi, rax
+
+
+ ;transpose
+ movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60
+ movq mm7, mm6 ; 77 76 75 74 73 72 71 70
+
+ punpckhbw mm7, [rdi+2*rax] ; 77 67 76 66 75 65 74 64
+ punpcklbw mm6, [rdi+2*rax] ; 73 63 72 62 71 61 70 60
+
+ movq mm4, [rsi] ; 47 46 45 44 43 42 41 40
+ movq mm5, mm4 ; 47 46 45 44 43 42 41 40
+
+ punpckhbw mm5, [rsi+rax] ; 57 47 56 46 55 45 54 44
+ punpcklbw mm4, [rsi+rax] ; 53 43 52 42 51 41 50 40
+
+ movq mm3, mm5 ; 57 47 56 46 55 45 54 44
+ punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46
+
+ punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44
+ movq mm2, mm4 ; 53 43 52 42 51 41 50 40
+
+ punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42
+ punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40
+
+ neg rax
+ movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20
+
+ movq mm1, mm6 ; 27 26 25 24 23 22 21 20
+ punpckhbw mm6, [rsi+rax] ; 37 27 36 36 35 25 34 24
+
+ punpcklbw mm1, [rsi+rax] ; 33 23 32 22 31 21 30 20
+ movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00
+
+ punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04
+ movq mm0, mm7 ; 17 07 16 06 15 05 14 04
+
+ punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06
+ punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04
+
+ movq mm6, mm7 ; 37 27 17 07 36 26 16 06
+ punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3
+
+ punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2
+
+ movq mm5, mm6 ; 76 66 56 46 36 26 16 06
+ psubusb mm5, mm7 ; q2-q3
+
+ psubusb mm7, mm6 ; q3-q2
+ por mm7, mm5; ; mm7=abs (q3-q2)
+
+ movq mm5, mm0 ; 35 25 15 05 34 24 14 04
+ punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1
+
+ punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0
+ movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1
+
+ psubusb mm3, mm6 ; q1-q2
+ psubusb mm6, mm5 ; q2-q1
+
+ por mm6, mm3 ; mm6=abs(q2-q1)
+ lea rdx, srct
+
+ movq [rdx+24], mm5 ; save q1
+ movq [rdx+16], mm0 ; save q0
+
+ movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00
+ punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00
+
+ movq mm0, mm3 ; 13 03 12 02 11 01 10 00
+ punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00
+
+ punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02
+ movq mm1, mm0 ; 31 21 11 01 30 20 10 00
+
+ punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3
+ punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2
+
+ movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2
+ psubusb mm2, mm0 ; p2-p3
+
+ psubusb mm0, mm1 ; p3-p2
+ por mm0, mm2 ; mm0=abs(p3-p2)
+
+ movq mm2, mm3 ; 33 23 13 03 32 22 12 02
+ punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1
+
+ punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0
+ movq [rdx+8], mm3 ; save p0
+
+ movq [rdx], mm2 ; save p1
+ movq mm5, mm2 ; mm5 = p1
+
+ psubusb mm2, mm1 ; p1-p2
+ psubusb mm1, mm5 ; p2-p1
+
+ por mm1, mm2 ; mm1=abs(p2-p1)
+ mov rdx, arg(3) ;limit
+
+ movq mm4, [rdx] ; mm4 = limit
+ psubusb mm7, mm4
+
+ psubusb mm0, mm4
+ psubusb mm1, mm4
+
+ psubusb mm6, mm4
+ por mm7, mm6
+
+ por mm0, mm1
+ por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
+
+ movq mm1, mm5 ; p1
+
+ movq mm7, mm3 ; mm3=mm7=p0
+ psubusb mm7, mm5 ; p0 - p1
+
+ psubusb mm5, mm3 ; p1 - p0
+ por mm5, mm7 ; abs(p1-p0)
+
+ movq t0, mm5 ; save abs(p1-p0)
+ lea rdx, srct
+
+ psubusb mm5, mm4
+ por mm0, mm5 ; mm0=mask
+
+ movq mm5, [rdx+16] ; mm5=q0
+ movq mm7, [rdx+24] ; mm7=q1
+
+ movq mm6, mm5 ; mm6=q0
+ movq mm2, mm7 ; q1
+ psubusb mm5, mm7 ; q0-q1
+
+ psubusb mm7, mm6 ; q1-q0
+ por mm7, mm5 ; abs(q1-q0)
+
+ movq t1, mm7 ; save abs(q1-q0)
+ psubusb mm7, mm4
+
+ por mm0, mm7 ; mask
+
+ movq mm5, mm2 ; q1
+ psubusb mm5, mm1 ; q1-=p1
+ psubusb mm1, mm2 ; p1-=q1
+ por mm5, mm1 ; abs(p1-q1)
+ pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm5, 1 ; abs(p1-q1)/2
+
+ mov rdx, arg(2) ;blimit ;
+
+ movq mm4, [rdx] ;blimit
+ movq mm1, mm3 ; mm1=mm3=p0
+
+ movq mm7, mm6 ; mm7=mm6=q0
+ psubusb mm1, mm7 ; p0-q0
+
+ psubusb mm7, mm3 ; q0-p0
+ por mm1, mm7 ; abs(q0-p0)
+ paddusb mm1, mm1 ; abs(q0-p0)*2
+ paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ por mm1, mm0; ; mask
+
+ pxor mm0, mm0
+ pcmpeqb mm1, mm0
+
+ ; calculate high edge variance
+ mov rdx, arg(4) ;thresh ; get thresh
+ movq mm7, [rdx]
+ ;
+ movq mm4, t0 ; get abs (q1 - q0)
+ psubusb mm4, mm7
+
+ movq mm3, t1 ; get abs (p1 - p0)
+ psubusb mm3, mm7
+
+ por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+ pcmpeqb mm4, mm0
+
+ pcmpeqb mm0, mm0
+ pxor mm4, mm0
+
+
+
+ ; start work on filters
+ lea rdx, srct
+
+ movq mm2, [rdx] ; p1
+ movq mm7, [rdx+24] ; q1
+
+ movq mm6, [rdx+8] ; p0
+ movq mm0, [rdx+16] ; q0
+
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+
+ psubsb mm2, mm7 ; p1 - q1
+ pand mm2, mm4 ; high var mask (hvm)(p1 - q1)
+
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
+
+ movq mm3, mm0 ; q0
+ psubsb mm0, mm6 ; q0 - p0
+
+ paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+ paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+
+ paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+ pand mm1, mm2 ; mask filter values we don't care about
+
+ movq mm2, mm1
+ paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+
+ paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+ pxor mm0, mm0 ;
+
+ pxor mm5, mm5
+ punpcklbw mm0, mm2 ;
+
+ punpckhbw mm5, mm2 ;
+ psraw mm0, 11 ;
+
+ psraw mm5, 11
+ packsswb mm0, mm5
+
+ movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+ pxor mm0, mm0 ; 0
+ movq mm5, mm1 ; abcdefgh
+
+ punpcklbw mm0, mm1 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+
+ pxor mm1, mm1 ; 0
+ punpckhbw mm1, mm5 ; a0b0c0d0
+
+ psraw mm1, 11 ; sign extended shift right by 3
+ movq mm5, mm0 ; save results
+
+ packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+ paddsw mm5, [GLOBAL(ones)]
+
+ paddsw mm1, [GLOBAL(ones)]
+ psraw mm5, 1 ; partial shifted one more time for 2nd tap
+
+ psraw mm1, 1 ; partial shifted one more time for 2nd tap
+ packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+
+ pandn mm4, mm5 ; high edge variance additive
+
+ paddsb mm6, mm2 ; p0+= p0 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+
+ ; mm6=p0 ;
+ movq mm1, [rdx] ; p1
+ pxor mm1, [GLOBAL(t80)] ; reoffset
+
+ paddsb mm1, mm4 ; p1+= p1 add
+ pxor mm1, [GLOBAL(t80)] ; unoffset
+ ; mm6 = p0 mm1 = p1
+
+ psubsb mm3, mm0 ; q0-= q0 add
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+
+ ; mm3 = q0
+ psubsb mm7, mm4 ; q1-= q1 add
+ pxor mm7, [GLOBAL(t80)] ; unoffset
+ ; mm7 = q1
+
+ ; transpose and write back
+ ; mm1 = 72 62 52 42 32 22 12 02
+ ; mm6 = 73 63 53 43 33 23 13 03
+ ; mm3 = 74 64 54 44 34 24 14 04
+ ; mm7 = 75 65 55 45 35 25 15 05
+
+ movq mm2, mm1 ; 72 62 52 42 32 22 12 02
+ punpcklbw mm2, mm6 ; 33 32 23 22 13 12 03 02
+
+ movq mm4, mm3 ; 74 64 54 44 34 24 14 04
+ punpckhbw mm1, mm6 ; 73 72 63 62 53 52 43 42
+
+ punpcklbw mm4, mm7 ; 35 34 25 24 15 14 05 04
+ punpckhbw mm3, mm7 ; 75 74 65 64 55 54 45 44
+
+ movq mm6, mm2 ; 33 32 23 22 13 12 03 02
+ punpcklwd mm2, mm4 ; 15 14 13 12 05 04 03 02
+
+ punpckhwd mm6, mm4 ; 35 34 33 32 25 24 23 22
+ movq mm5, mm1 ; 73 72 63 62 53 52 43 42
+
+ punpcklwd mm1, mm3 ; 55 54 53 52 45 44 43 42
+ punpckhwd mm5, mm3 ; 75 74 73 72 65 64 63 62
+
+
+ ; mm2 = 15 14 13 12 05 04 03 02
+ ; mm6 = 35 34 33 32 25 24 23 22
+ ; mm5 = 55 54 53 52 45 44 43 42
+ ; mm1 = 75 74 73 72 65 64 63 62
+
+
+
+ movd [rsi+rax*4+2], mm2
+ psrlq mm2, 32
+
+ movd [rdi+rax*4+2], mm2
+ movd [rsi+rax*2+2], mm6
+
+ psrlq mm6, 32
+ movd [rsi+rax+2],mm6
+
+ movd [rsi+2], mm1
+ psrlq mm1, 32
+
+ movd [rdi+2], mm1
+ neg rax
+
+ movd [rdi+rax+2],mm5
+ psrlq mm5, 32
+
+ movd [rdi+rax*2+2], mm5
+
+ lea rsi, [rsi+rax*8]
+ dec rcx
+ jnz .next8_v
+
+ add rsp, 64
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_mbloop_filter_horizontal_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_mbloop_filter_horizontal_edge_mmx) PRIVATE
+sym(vp8_mbloop_filter_horizontal_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+ movsxd rcx, dword ptr arg(5) ;count
+.next8_mbh:
+ mov rdx, arg(3) ;limit
+ movq mm7, [rdx]
+ mov rdi, rsi ; rdi points to row +1 for indirect addressing
+ add rdi, rax
+
+ ; calculate breakout conditions
+ movq mm2, [rdi+2*rax] ; q3
+
+ movq mm1, [rsi+2*rax] ; q2
+ movq mm6, mm1 ; q2
+ psubusb mm1, mm2 ; q2-=q3
+ psubusb mm2, mm6 ; q3-=q2
+ por mm1, mm2 ; abs(q3-q2)
+ psubusb mm1, mm7
+
+
+ ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit
+ movq mm4, [rsi+rax] ; q1
+ movq mm3, mm4 ; q1
+ psubusb mm4, mm6 ; q1-=q2
+ psubusb mm6, mm3 ; q2-=q1
+ por mm4, mm6 ; abs(q2-q1)
+ psubusb mm4, mm7
+ por mm1, mm4
+
+
+ ; mm1 = mask, mm3=q1, mm7 = limit
+
+ movq mm4, [rsi] ; q0
+ movq mm0, mm4 ; q0
+ psubusb mm4, mm3 ; q0-=q1
+ psubusb mm3, mm0 ; q1-=q0
+ por mm4, mm3 ; abs(q0-q1)
+ movq t0, mm4 ; save to t0
+ psubusb mm4, mm7
+ por mm1, mm4
+
+
+ ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
+
+ neg rax ; negate pitch to deal with above border
+
+ movq mm2, [rsi+4*rax] ; p3
+ movq mm4, [rdi+4*rax] ; p2
+ movq mm5, mm4 ; p2
+ psubusb mm4, mm2 ; p2-=p3
+ psubusb mm2, mm5 ; p3-=p2
+ por mm4, mm2 ; abs(p3 - p2)
+ psubusb mm4, mm7
+ por mm1, mm4
+ ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
+
+ movq mm4, [rsi+2*rax] ; p1
+ movq mm3, mm4 ; p1
+ psubusb mm4, mm5 ; p1-=p2
+ psubusb mm5, mm3 ; p2-=p1
+ por mm4, mm5 ; abs(p2 - p1)
+ psubusb mm4, mm7
+ por mm1, mm4
+
+ movq mm2, mm3 ; p1
+
+
+ ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
+
+ movq mm4, [rsi+rax] ; p0
+ movq mm5, mm4 ; p0
+ psubusb mm4, mm3 ; p0-=p1
+ psubusb mm3, mm5 ; p1-=p0
+ por mm4, mm3 ; abs(p1 - p0)
+ movq t1, mm4 ; save to t1
+ psubusb mm4, mm7
+ por mm1, mm4
+ ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
+ ; mm5 = p0
+ movq mm3, [rdi] ; q1
+ movq mm4, mm3 ; q1
+ psubusb mm3, mm2 ; q1-=p1
+ psubusb mm2, mm4 ; p1-=q1
+ por mm2, mm3 ; abs(p1-q1)
+ pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm2, 1 ; abs(p1-q1)/2
+
+ movq mm6, mm5 ; p0
+ movq mm3, mm0 ; q0
+ psubusb mm5, mm3 ; p0-=q0
+ psubusb mm3, mm6 ; q0-=p0
+ por mm5, mm3 ; abs(p0 - q0)
+ paddusb mm5, mm5 ; abs(p0-q0)*2
+ paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ mov rdx, arg(2) ;blimit ; get blimit
+ movq mm7, [rdx] ; blimit
+
+ psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ por mm1, mm5
+ pxor mm5, mm5
+ pcmpeqb mm1, mm5 ; mask mm1
+
+ ; mm1 = mask, mm0=q0, mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
+ ; mm6 = p0,
+
+ ; calculate high edge variance
+ mov rdx, arg(4) ;thresh ; get thresh
+ movq mm7, [rdx] ;
+ movq mm4, t0 ; get abs (q1 - q0)
+ psubusb mm4, mm7
+ movq mm3, t1 ; get abs (p1 - p0)
+ psubusb mm3, mm7
+ paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+ pcmpeqb mm4, mm5
+
+ pcmpeqb mm5, mm5
+ pxor mm4, mm5
+
+
+
+ ; mm1 = mask, mm0=q0, mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0)
+ ; mm6 = p0, mm4=hev
+ ; start work on filters
+ movq mm2, [rsi+2*rax] ; p1
+ movq mm7, [rdi] ; q1
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+ psubsb mm2, mm7 ; p1 - q1
+
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
+ movq mm3, mm0 ; q0
+ psubsb mm0, mm6 ; q0 - p0
+ paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1)
+ paddsb mm2, mm0 ; 2 * (q0 - p0)
+ paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1)
+ pand mm1, mm2 ; mask filter values we don't care about
+
+
+ ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
+ movq mm2, mm1 ; vp8_filter
+ pand mm2, mm4; ; Filter2 = vp8_filter & hev
+
+ movq mm5, mm2 ;
+ paddsb mm5, [GLOBAL(t3)];
+
+ pxor mm0, mm0 ; 0
+ pxor mm7, mm7 ; 0
+
+ punpcklbw mm0, mm5 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+ punpckhbw mm7, mm5 ; a0b0c0d0
+ psraw mm7, 11 ; sign extended shift right by 3
+ packsswb mm0, mm7 ; Filter2 >>=3;
+
+ movq mm5, mm0 ; Filter2
+
+ paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
+ pxor mm0, mm0 ; 0
+ pxor mm7, mm7 ; 0
+
+ punpcklbw mm0, mm2 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+ punpckhbw mm7, mm2 ; a0b0c0d0
+ psraw mm7, 11 ; sign extended shift right by 3
+ packsswb mm0, mm7 ; Filter2 >>=3;
+
+ ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0
+ psubsb mm3, mm0 ; qs0 =qs0 - filter1
+ paddsb mm6, mm5 ; ps0 =ps0 + Fitler2
+
+ ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
+ ; vp8_filter &= ~hev;
+ ; Filter2 = vp8_filter;
+ pandn mm4, mm1 ; vp8_filter&=~hev
+
+
+ ; mm3=qs0, mm4=filter2, mm6=ps0
+
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
+ ; s = vp8_signed_char_clamp(qs0 - u);
+ ; *oq0 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps0 + u);
+ ; *op0 = s^0x80;
+ pxor mm0, mm0
+
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklbw mm1, mm4
+ punpckhbw mm2, mm4
+ pmulhw mm1, [GLOBAL(s27)]
+ pmulhw mm2, [GLOBAL(s27)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
+ psraw mm1, 7
+ psraw mm2, 7
+ packsswb mm1, mm2
+
+ psubsb mm3, mm1
+ paddsb mm6, mm1
+
+ pxor mm3, [GLOBAL(t80)]
+ pxor mm6, [GLOBAL(t80)]
+ movq [rsi+rax], mm6
+ movq [rsi], mm3
+
+ ; roughly 2/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
+ ; s = vp8_signed_char_clamp(qs1 - u);
+ ; *oq1 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps1 + u);
+ ; *op1 = s^0x80;
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklbw mm1, mm4
+ punpckhbw mm2, mm4
+ pmulhw mm1, [GLOBAL(s18)]
+ pmulhw mm2, [GLOBAL(s18)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
+ psraw mm1, 7
+ psraw mm2, 7
+ packsswb mm1, mm2
+
+ movq mm3, [rdi]
+ movq mm6, [rsi+rax*2] ; p1
+
+ pxor mm3, [GLOBAL(t80)]
+ pxor mm6, [GLOBAL(t80)]
+
+ paddsb mm6, mm1
+ psubsb mm3, mm1
+
+ pxor mm6, [GLOBAL(t80)]
+ pxor mm3, [GLOBAL(t80)]
+ movq [rdi], mm3
+ movq [rsi+rax*2], mm6
+
+ ; roughly 1/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
+ ; s = vp8_signed_char_clamp(qs2 - u);
+ ; *oq2 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps2 + u);
+ ; *op2 = s^0x80;
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklbw mm1, mm4
+ punpckhbw mm2, mm4
+ pmulhw mm1, [GLOBAL(s9)]
+ pmulhw mm2, [GLOBAL(s9)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
+ psraw mm1, 7
+ psraw mm2, 7
+ packsswb mm1, mm2
+
+
+ movq mm6, [rdi+rax*4]
+ neg rax
+ movq mm3, [rdi+rax ]
+
+ pxor mm6, [GLOBAL(t80)]
+ pxor mm3, [GLOBAL(t80)]
+
+ paddsb mm6, mm1
+ psubsb mm3, mm1
+
+ pxor mm6, [GLOBAL(t80)]
+ pxor mm3, [GLOBAL(t80)]
+ movq [rdi+rax ], mm3
+ neg rax
+ movq [rdi+rax*4], mm6
+
+;EARLY_BREAK_OUT:
+ neg rax
+ add rsi,8
+ dec rcx
+ jnz .next8_mbh
+
+ add rsp, 32
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_mbloop_filter_vertical_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_mbloop_filter_vertical_edge_mmx) PRIVATE
+sym(vp8_mbloop_filter_vertical_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 96 ; reserve 96 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
+ %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+ lea rsi, [rsi + rax*4 - 4]
+
+ movsxd rcx, dword ptr arg(5) ;count
+.next8_mbv:
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+
+ ;transpose
+ movq mm0, [rdi+2*rax] ; 77 76 75 74 73 72 71 70
+ movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60
+
+ movq mm7, mm6 ; 77 76 75 74 73 72 71 70
+ punpckhbw mm7, mm0 ; 77 67 76 66 75 65 74 64
+
+ punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60
+ movq mm0, [rsi+rax] ; 57 56 55 54 53 52 51 50
+
+ movq mm4, [rsi] ; 47 46 45 44 43 42 41 40
+ movq mm5, mm4 ; 47 46 45 44 43 42 41 40
+
+ punpckhbw mm5, mm0 ; 57 47 56 46 55 45 54 44
+ punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40
+
+ movq mm3, mm5 ; 57 47 56 46 55 45 54 44
+ punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46
+
+ punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44
+ movq mm2, mm4 ; 53 43 52 42 51 41 50 40
+
+ punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42
+ punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40
+
+ neg rax
+
+ movq mm7, [rsi+rax] ; 37 36 35 34 33 32 31 30
+ movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20
+
+ movq mm1, mm6 ; 27 26 25 24 23 22 21 20
+ punpckhbw mm6, mm7 ; 37 27 36 36 35 25 34 24
+
+ punpcklbw mm1, mm7 ; 33 23 32 22 31 21 30 20
+
+ movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00
+ punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04
+
+ movq mm0, mm7 ; 17 07 16 06 15 05 14 04
+ punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06
+
+ punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04
+ movq mm6, mm7 ; 37 27 17 07 36 26 16 06
+
+ punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3
+ punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2
+
+ lea rdx, srct
+ movq mm5, mm6 ; 76 66 56 46 36 26 16 06
+
+ movq [rdx+56], mm7
+ psubusb mm5, mm7 ; q2-q3
+
+
+ movq [rdx+48], mm6
+ psubusb mm7, mm6 ; q3-q2
+
+ por mm7, mm5; ; mm7=abs (q3-q2)
+ movq mm5, mm0 ; 35 25 15 05 34 24 14 04
+
+ punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1
+ punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0
+
+ movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1
+ psubusb mm3, mm6 ; q1-q2
+
+ psubusb mm6, mm5 ; q2-q1
+ por mm6, mm3 ; mm6=abs(q2-q1)
+
+ movq [rdx+40], mm5 ; save q1
+ movq [rdx+32], mm0 ; save q0
+
+ movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00
+ punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00
+
+ movq mm0, mm3 ; 13 03 12 02 11 01 10 00
+ punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00
+
+ punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02
+ movq mm1, mm0 ; 31 21 11 01 30 20 10 00
+
+ punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3
+ punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2
+
+ movq [rdx], mm0 ; save p3
+ movq [rdx+8], mm1 ; save p2
+
+ movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2
+ psubusb mm2, mm0 ; p2-p3
+
+ psubusb mm0, mm1 ; p3-p2
+ por mm0, mm2 ; mm0=abs(p3-p2)
+
+ movq mm2, mm3 ; 33 23 13 03 32 22 12 02
+ punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1
+
+ punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0
+ movq [rdx+24], mm3 ; save p0
+
+ movq [rdx+16], mm2 ; save p1
+ movq mm5, mm2 ; mm5 = p1
+
+ psubusb mm2, mm1 ; p1-p2
+ psubusb mm1, mm5 ; p2-p1
+
+ por mm1, mm2 ; mm1=abs(p2-p1)
+ mov rdx, arg(3) ;limit
+
+ movq mm4, [rdx] ; mm4 = limit
+ psubusb mm7, mm4 ; abs(q3-q2) > limit
+
+ psubusb mm0, mm4 ; abs(p3-p2) > limit
+ psubusb mm1, mm4 ; abs(p2-p1) > limit
+
+ psubusb mm6, mm4 ; abs(q2-q1) > limit
+ por mm7, mm6 ; or
+
+ por mm0, mm1 ;
+ por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
+
+ movq mm1, mm5 ; p1
+
+ movq mm7, mm3 ; mm3=mm7=p0
+ psubusb mm7, mm5 ; p0 - p1
+
+ psubusb mm5, mm3 ; p1 - p0
+ por mm5, mm7 ; abs(p1-p0)
+
+ movq t0, mm5 ; save abs(p1-p0)
+ lea rdx, srct
+
+ psubusb mm5, mm4 ; mm5 = abs(p1-p0) > limit
+ por mm0, mm5 ; mm0=mask
+
+ movq mm5, [rdx+32] ; mm5=q0
+ movq mm7, [rdx+40] ; mm7=q1
+
+ movq mm6, mm5 ; mm6=q0
+ movq mm2, mm7 ; q1
+ psubusb mm5, mm7 ; q0-q1
+
+ psubusb mm7, mm6 ; q1-q0
+ por mm7, mm5 ; abs(q1-q0)
+
+ movq t1, mm7 ; save abs(q1-q0)
+ psubusb mm7, mm4 ; mm7=abs(q1-q0)> limit
+
+ por mm0, mm7 ; mask
+
+ movq mm5, mm2 ; q1
+ psubusb mm5, mm1 ; q1-=p1
+ psubusb mm1, mm2 ; p1-=q1
+ por mm5, mm1 ; abs(p1-q1)
+ pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm5, 1 ; abs(p1-q1)/2
+
+ mov rdx, arg(2) ;blimit ;
+
+ movq mm4, [rdx] ;blimit
+ movq mm1, mm3 ; mm1=mm3=p0
+
+ movq mm7, mm6 ; mm7=mm6=q0
+ psubusb mm1, mm7 ; p0-q0
+
+ psubusb mm7, mm3 ; q0-p0
+ por mm1, mm7 ; abs(q0-p0)
+ paddusb mm1, mm1 ; abs(q0-p0)*2
+ paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ por mm1, mm0; ; mask
+
+ pxor mm0, mm0
+ pcmpeqb mm1, mm0
+
+ ; calculate high edge variance
+ mov rdx, arg(4) ;thresh ; get thresh
+ movq mm7, [rdx]
+ ;
+ movq mm4, t0 ; get abs (q1 - q0)
+ psubusb mm4, mm7 ; abs(q1 - q0) > thresh
+
+ movq mm3, t1 ; get abs (p1 - p0)
+ psubusb mm3, mm7 ; abs(p1 - p0)> thresh
+
+ por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+ pcmpeqb mm4, mm0
+
+ pcmpeqb mm0, mm0
+ pxor mm4, mm0
+
+
+
+
+ ; start work on filters
+ lea rdx, srct
+
+ ; start work on filters
+ movq mm2, [rdx+16] ; p1
+ movq mm7, [rdx+40] ; q1
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+ psubsb mm2, mm7 ; p1 - q1
+
+ movq mm6, [rdx+24] ; p0
+ movq mm0, [rdx+32] ; q0
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
+
+ movq mm3, mm0 ; q0
+ psubsb mm0, mm6 ; q0 - p0
+ paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1)
+ paddsb mm2, mm0 ; 2 * (q0 - p0)
+ paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1)
+ pand mm1, mm2 ; mask filter values we don't care about
+
+ ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
+ movq mm2, mm1 ; vp8_filter
+ pand mm2, mm4; ; Filter2 = vp8_filter & hev
+
+ movq mm5, mm2 ;
+ paddsb mm5, [GLOBAL(t3)];
+
+ pxor mm0, mm0 ; 0
+ pxor mm7, mm7 ; 0
+
+ punpcklbw mm0, mm5 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+ punpckhbw mm7, mm5 ; a0b0c0d0
+ psraw mm7, 11 ; sign extended shift right by 3
+ packsswb mm0, mm7 ; Filter2 >>=3;
+
+ movq mm5, mm0 ; Filter2
+
+ paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
+ pxor mm0, mm0 ; 0
+ pxor mm7, mm7 ; 0
+
+ punpcklbw mm0, mm2 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+ punpckhbw mm7, mm2 ; a0b0c0d0
+ psraw mm7, 11 ; sign extended shift right by 3
+ packsswb mm0, mm7 ; Filter2 >>=3;
+
+ ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0
+ psubsb mm3, mm0 ; qs0 =qs0 - filter1
+ paddsb mm6, mm5 ; ps0 =ps0 + Fitler2
+
+ ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
+ ; vp8_filter &= ~hev;
+ ; Filter2 = vp8_filter;
+ pandn mm4, mm1 ; vp8_filter&=~hev
+
+
+ ; mm3=qs0, mm4=filter2, mm6=ps0
+
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
+ ; s = vp8_signed_char_clamp(qs0 - u);
+ ; *oq0 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps0 + u);
+ ; *op0 = s^0x80;
+ pxor mm0, mm0
+
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklbw mm1, mm4
+ punpckhbw mm2, mm4
+ pmulhw mm1, [GLOBAL(s27)]
+ pmulhw mm2, [GLOBAL(s27)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
+ psraw mm1, 7
+ psraw mm2, 7
+ packsswb mm1, mm2
+
+ psubsb mm3, mm1
+ paddsb mm6, mm1
+
+ pxor mm3, [GLOBAL(t80)]
+ pxor mm6, [GLOBAL(t80)]
+ movq [rdx+24], mm6
+ movq [rdx+32], mm3
+
+ ; roughly 2/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
+ ; s = vp8_signed_char_clamp(qs1 - u);
+ ; *oq1 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps1 + u);
+ ; *op1 = s^0x80;
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklbw mm1, mm4
+ punpckhbw mm2, mm4
+ pmulhw mm1, [GLOBAL(s18)]
+ pmulhw mm2, [GLOBAL(s18)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
+ psraw mm1, 7
+ psraw mm2, 7
+ packsswb mm1, mm2
+
+ movq mm3, [rdx + 40]
+ movq mm6, [rdx + 16] ; p1
+ pxor mm3, [GLOBAL(t80)]
+ pxor mm6, [GLOBAL(t80)]
+
+ paddsb mm6, mm1
+ psubsb mm3, mm1
+
+ pxor mm6, [GLOBAL(t80)]
+ pxor mm3, [GLOBAL(t80)]
+ movq [rdx + 40], mm3
+ movq [rdx + 16], mm6
+
+ ; roughly 1/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
+ ; s = vp8_signed_char_clamp(qs2 - u);
+ ; *oq2 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps2 + u);
+ ; *op2 = s^0x80;
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklbw mm1, mm4
+ punpckhbw mm2, mm4
+ pmulhw mm1, [GLOBAL(s9)]
+ pmulhw mm2, [GLOBAL(s9)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
+ psraw mm1, 7
+ psraw mm2, 7
+ packsswb mm1, mm2
+
+ movq mm6, [rdx+ 8]
+ movq mm3, [rdx+48]
+
+ pxor mm6, [GLOBAL(t80)]
+ pxor mm3, [GLOBAL(t80)]
+
+ paddsb mm6, mm1
+ psubsb mm3, mm1
+
+ pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01
+ pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 15 06
+
+ ; transpose and write back
+ movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00
+ movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00
+
+ punpcklbw mm0, mm6 ; mm0 = 31 30 21 20 11 10 01 00
+ punpckhbw mm1, mm6 ; mm3 = 71 70 61 60 51 50 41 40
+
+ movq mm2, [rdx+16] ; mm2 = 72 62 52 42 32 22 12 02
+ movq mm6, mm2 ; mm3 = 72 62 52 42 32 22 12 02
+
+ punpcklbw mm2, [rdx+24] ; mm2 = 33 32 23 22 13 12 03 02
+ punpckhbw mm6, [rdx+24] ; mm3 = 73 72 63 62 53 52 43 42
+
+ movq mm5, mm0 ; mm5 = 31 30 21 20 11 10 01 00
+ punpcklwd mm0, mm2 ; mm0 = 13 12 11 10 03 02 01 00
+
+ punpckhwd mm5, mm2 ; mm5 = 33 32 31 30 23 22 21 20
+ movq mm4, mm1 ; mm4 = 71 70 61 60 51 50 41 40
+
+ punpcklwd mm1, mm6 ; mm1 = 53 52 51 50 43 42 41 40
+ punpckhwd mm4, mm6 ; mm4 = 73 72 71 70 63 62 61 60
+
+ movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04
+ punpcklbw mm2, [rdx+40] ; mm2 = 35 34 25 24 15 14 05 04
+
+ movq mm6, mm3 ; mm6 = 76 66 56 46 36 26 15 06
+ punpcklbw mm6, [rdx+56] ; mm6 = 37 36 27 26 17 16 07 06
+
+ movq mm7, mm2 ; mm7 = 35 34 25 24 15 14 05 04
+ punpcklwd mm2, mm6 ; mm2 = 17 16 15 14 07 06 05 04
+
+ punpckhwd mm7, mm6 ; mm7 = 37 36 35 34 27 26 25 24
+ movq mm6, mm0 ; mm6 = 13 12 11 10 03 02 01 00
+
+ punpckldq mm0, mm2 ; mm0 = 07 06 05 04 03 02 01 00
+ punpckhdq mm6, mm2 ; mm6 = 17 16 15 14 13 12 11 10
+
+ movq [rsi+rax*4], mm0 ; write out
+ movq [rdi+rax*4], mm6 ; write out
+
+ movq mm0, mm5 ; mm0 = 33 32 31 30 23 22 21 20
+ punpckldq mm0, mm7 ; mm0 = 27 26 25 24 23 22 20 20
+
+ punpckhdq mm5, mm7 ; mm5 = 37 36 35 34 33 32 31 30
+ movq [rsi+rax*2], mm0 ; write out
+
+ movq [rdi+rax*2], mm5 ; write out
+ movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04
+
+ punpckhbw mm2, [rdx+40] ; mm2 = 75 74 65 64 54 54 45 44
+ punpckhbw mm3, [rdx+56] ; mm3 = 77 76 67 66 57 56 47 46
+
+ movq mm5, mm2 ; mm5 = 75 74 65 64 54 54 45 44
+ punpcklwd mm2, mm3 ; mm2 = 57 56 55 54 47 46 45 44
+
+ punpckhwd mm5, mm3 ; mm5 = 77 76 75 74 67 66 65 64
+ movq mm0, mm1 ; mm0= 53 52 51 50 43 42 41 40
+
+ movq mm3, mm4 ; mm4 = 73 72 71 70 63 62 61 60
+ punpckldq mm0, mm2 ; mm0 = 47 46 45 44 43 42 41 40
+
+ punpckhdq mm1, mm2 ; mm1 = 57 56 55 54 53 52 51 50
+ movq [rsi], mm0 ; write out
+
+ movq [rdi], mm1 ; write out
+ neg rax
+
+ punpckldq mm3, mm5 ; mm3 = 67 66 65 64 63 62 61 60
+ punpckhdq mm4, mm5 ; mm4 = 77 76 75 74 73 72 71 60
+
+ movq [rsi+rax*2], mm3
+ movq [rdi+rax*2], mm4
+
+ lea rsi, [rsi+rax*8]
+ dec rcx
+
+ jnz .next8_mbv
+
+ add rsp, 96
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_simple_horizontal_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit
+;)
+global sym(vp8_loop_filter_simple_horizontal_edge_mmx) PRIVATE
+sym(vp8_loop_filter_simple_horizontal_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+ mov rcx, 2 ; count
+.nexts8_h:
+ mov rdx, arg(2) ;blimit ; get blimit
+ movq mm3, [rdx] ;
+
+ mov rdi, rsi ; rdi points to row +1 for indirect addressing
+ add rdi, rax
+ neg rax
+
+ ; calculate mask
+ movq mm1, [rsi+2*rax] ; p1
+ movq mm0, [rdi] ; q1
+ movq mm2, mm1
+ movq mm7, mm0
+ movq mm4, mm0
+ psubusb mm0, mm1 ; q1-=p1
+ psubusb mm1, mm4 ; p1-=q1
+ por mm1, mm0 ; abs(p1-q1)
+ pand mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm1, 1 ; abs(p1-q1)/2
+
+ movq mm5, [rsi+rax] ; p0
+ movq mm4, [rsi] ; q0
+ movq mm0, mm4 ; q0
+ movq mm6, mm5 ; p0
+ psubusb mm5, mm4 ; p0-=q0
+ psubusb mm4, mm6 ; q0-=p0
+ por mm5, mm4 ; abs(p0 - q0)
+ paddusb mm5, mm5 ; abs(p0-q0)*2
+ paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ pxor mm3, mm3
+ pcmpeqb mm5, mm3
+
+ ; start work on filters
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+ psubsb mm2, mm7 ; p1 - q1
+
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
+ movq mm3, mm0 ; q0
+ psubsb mm0, mm6 ; q0 - p0
+ paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0)
+ paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0)
+ pand mm5, mm2 ; mask filter values we don't care about
+
+ ; do + 4 side
+ paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
+
+ movq mm0, mm5 ; get a copy of filters
+ psllw mm0, 8 ; shift left 8
+ psraw mm0, 3 ; arithmetic shift right 11
+ psrlw mm0, 8
+ movq mm1, mm5 ; get a copy of filters
+ psraw mm1, 11 ; arithmetic shift right 11
+ psllw mm1, 8 ; shift left 8 to put it back
+
+ por mm0, mm1 ; put the two together to get result
+
+ psubsb mm3, mm0 ; q0-= q0 add
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+ movq [rsi], mm3 ; write back
+
+
+ ; now do +3 side
+ psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4
+
+ movq mm0, mm5 ; get a copy of filters
+ psllw mm0, 8 ; shift left 8
+ psraw mm0, 3 ; arithmetic shift right 11
+ psrlw mm0, 8
+ psraw mm5, 11 ; arithmetic shift right 11
+ psllw mm5, 8 ; shift left 8 to put it back
+ por mm0, mm5 ; put the two together to get result
+
+
+ paddsb mm6, mm0 ; p0+= p0 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+ movq [rsi+rax], mm6 ; write back
+
+ add rsi,8
+ neg rax
+ dec rcx
+ jnz .nexts8_h
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_simple_vertical_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit
+;)
+global sym(vp8_loop_filter_simple_vertical_edge_mmx) PRIVATE
+sym(vp8_loop_filter_simple_vertical_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+ lea rsi, [rsi + rax*4- 2]; ;
+ mov rcx, 2 ; count
+.nexts8_v:
+
+ lea rdi, [rsi + rax];
+ movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70
+
+ movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60
+ punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60
+
+ movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50
+ movd mm4, [rsi] ; xx xx xx xx 43 42 41 40
+
+ punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40
+ movq mm5, mm4 ; 53 43 52 42 51 41 50 40
+
+ punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40
+ punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42
+
+ neg rax
+
+ movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30
+ movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20
+
+ punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20
+ movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10
+
+ movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00
+ punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00
+
+ movq mm2, mm0 ; 13 03 12 02 11 01 10 00
+ punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00
+
+ punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02
+ movq mm1, mm0 ; 13 03 12 02 11 01 10 00
+
+ punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1
+ movq mm3, mm2 ; 33 23 13 03 32 22 12 02
+
+ punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0
+ punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0
+
+ punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1
+
+
+ ; calculate mask
+ movq mm6, mm0 ; p1
+ movq mm7, mm3 ; q1
+ psubusb mm7, mm6 ; q1-=p1
+ psubusb mm6, mm3 ; p1-=q1
+ por mm6, mm7 ; abs(p1-q1)
+ pand mm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm6, 1 ; abs(p1-q1)/2
+
+ movq mm5, mm1 ; p0
+ movq mm4, mm2 ; q0
+
+ psubusb mm5, mm2 ; p0-=q0
+ psubusb mm4, mm1 ; q0-=p0
+
+ por mm5, mm4 ; abs(p0 - q0)
+ paddusb mm5, mm5 ; abs(p0-q0)*2
+ paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ mov rdx, arg(2) ;blimit ; get blimit
+ movq mm7, [rdx]
+
+ psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ pxor mm7, mm7
+ pcmpeqb mm5, mm7 ; mm5 = mask
+
+ ; start work on filters
+ movq t0, mm0
+ movq t1, mm3
+
+ pxor mm0, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm3, [GLOBAL(t80)] ; q1 offset to convert to signed values
+
+ psubsb mm0, mm3 ; p1 - q1
+ movq mm6, mm1 ; p0
+
+ movq mm7, mm2 ; q0
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+
+ pxor mm7, [GLOBAL(t80)] ; offset to convert to signed values
+ movq mm3, mm7 ; offseted ; q0
+
+ psubsb mm7, mm6 ; q0 - p0
+ paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0)
+
+ paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0)
+
+ pand mm5, mm0 ; mask filter values we don't care about
+
+ paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
+
+ movq mm0, mm5 ; get a copy of filters
+ psllw mm0, 8 ; shift left 8
+ psraw mm0, 3 ; arithmetic shift right 11
+ psrlw mm0, 8
+
+ movq mm7, mm5 ; get a copy of filters
+ psraw mm7, 11 ; arithmetic shift right 11
+ psllw mm7, 8 ; shift left 8 to put it back
+
+ por mm0, mm7 ; put the two together to get result
+
+ psubsb mm3, mm0 ; q0-= q0sz add
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+
+ ; now do +3 side
+ psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4
+
+ movq mm0, mm5 ; get a copy of filters
+ psllw mm0, 8 ; shift left 8
+ psraw mm0, 3 ; arithmetic shift right 11
+ psrlw mm0, 8
+
+ psraw mm5, 11 ; arithmetic shift right 11
+ psllw mm5, 8 ; shift left 8 to put it back
+ por mm0, mm5 ; put the two together to get result
+
+ paddsb mm6, mm0 ; p0+= p0 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+
+
+ movq mm0, t0
+ movq mm4, t1
+
+ ; mm0 = 70 60 50 40 30 20 10 00
+ ; mm6 = 71 61 51 41 31 21 11 01
+ ; mm3 = 72 62 52 42 32 22 12 02
+ ; mm4 = 73 63 53 43 33 23 13 03
+ ; transpose back to write out
+
+ movq mm1, mm0 ;
+ punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00
+
+ punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40
+ movq mm2, mm3 ;
+
+ punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02
+ movq mm5, mm1 ; 71 70 61 60 51 50 41 40
+
+ punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42
+ movq mm6, mm0 ; 31 30 21 20 11 10 01 00
+
+ punpcklwd mm0, mm2 ; 13 12 11 10 03 02 01 00
+ punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20
+
+ movd [rsi+rax*4], mm0 ; write 03 02 01 00
+ punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40
+
+ psrlq mm0, 32 ; xx xx xx xx 13 12 11 10
+ punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60
+
+ movd [rdi+rax*4], mm0 ; write 13 12 11 10
+ movd [rsi+rax*2], mm6 ; write 23 22 21 20
+
+ psrlq mm6, 32 ; 33 32 31 30
+ movd [rsi], mm1 ; write 43 42 41 40
+
+ movd [rsi + rax], mm6 ; write 33 32 31 30
+ neg rax
+
+ movd [rsi + rax*2], mm5 ; write 63 62 61 60
+ psrlq mm1, 32 ; 53 52 51 50
+
+ movd [rdi], mm1 ; write out 53 52 51 50
+ psrlq mm5, 32 ; 73 72 71 70
+
+ movd [rdi + rax*2], mm5 ; write 73 72 71 70
+
+ lea rsi, [rsi+rax*8] ; next 8
+
+ dec rcx
+ jnz .nexts8_v
+
+ add rsp, 32
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
+; int y_stride,
+; loop_filter_info *lfi)
+;{
+;
+;
+; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+;}
+
+SECTION_RODATA
+align 16
+tfe:
+ times 8 db 0xfe
+align 16
+t80:
+ times 8 db 0x80
+align 16
+t1s:
+ times 8 db 0x01
+align 16
+t3:
+ times 8 db 0x03
+align 16
+t4:
+ times 8 db 0x04
+align 16
+ones:
+ times 4 dw 0x0001
+align 16
+s27:
+ times 4 dw 0x1b00
+align 16
+s18:
+ times 4 dw 0x1200
+align 16
+s9:
+ times 4 dw 0x0900
+align 16
+s63:
+ times 4 dw 0x003f
diff --git a/thirdparty/libvpx/vp8/decoder/dboolhuff.c b/thirdparty/libvpx/vp8/decoder/dboolhuff.c
new file mode 100644
index 0000000000..5cdd2a2491
--- /dev/null
+++ b/thirdparty/libvpx/vp8/decoder/dboolhuff.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "dboolhuff.h"
+#include "vp8/common/common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+int vp8dx_start_decode(BOOL_DECODER *br,
+ const unsigned char *source,
+ unsigned int source_sz,
+ vpx_decrypt_cb decrypt_cb,
+ void *decrypt_state)
+{
+ br->user_buffer_end = source+source_sz;
+ br->user_buffer = source;
+ br->value = 0;
+ br->count = -8;
+ br->range = 255;
+ br->decrypt_cb = decrypt_cb;
+ br->decrypt_state = decrypt_state;
+
+ if (source_sz && !source)
+ return 1;
+
+ /* Populate the buffer */
+ vp8dx_bool_decoder_fill(br);
+
+ return 0;
+}
+
+void vp8dx_bool_decoder_fill(BOOL_DECODER *br)
+{
+ const unsigned char *bufptr = br->user_buffer;
+ VP8_BD_VALUE value = br->value;
+ int count = br->count;
+ int shift = VP8_BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
+ size_t bytes_left = br->user_buffer_end - bufptr;
+ size_t bits_left = bytes_left * CHAR_BIT;
+ int x = shift + CHAR_BIT - (int)bits_left;
+ int loop_end = 0;
+ unsigned char decrypted[sizeof(VP8_BD_VALUE) + 1];
+
+ if (br->decrypt_cb) {
+ size_t n = VPXMIN(sizeof(decrypted), bytes_left);
+ br->decrypt_cb(br->decrypt_state, bufptr, decrypted, (int)n);
+ bufptr = decrypted;
+ }
+
+ if(x >= 0)
+ {
+ count += VP8_LOTS_OF_BITS;
+ loop_end = x;
+ }
+
+ if (x < 0 || bits_left)
+ {
+ while(shift >= loop_end)
+ {
+ count += CHAR_BIT;
+ value |= (VP8_BD_VALUE)*bufptr << shift;
+ ++bufptr;
+ ++br->user_buffer;
+ shift -= CHAR_BIT;
+ }
+ }
+
+ br->value = value;
+ br->count = count;
+}
diff --git a/thirdparty/libvpx/vp8/decoder/dboolhuff.h b/thirdparty/libvpx/vp8/decoder/dboolhuff.h
new file mode 100644
index 0000000000..1b1bbf868e
--- /dev/null
+++ b/thirdparty/libvpx/vp8/decoder/dboolhuff.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_DECODER_DBOOLHUFF_H_
+#define VP8_DECODER_DBOOLHUFF_H_
+
+#include <stddef.h>
+#include <limits.h>
+
+#include "./vpx_config.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vp8dx.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef size_t VP8_BD_VALUE;
+
+#define VP8_BD_VALUE_SIZE ((int)sizeof(VP8_BD_VALUE)*CHAR_BIT)
+
+/*This is meant to be a large, positive constant that can still be efficiently
+ loaded as an immediate (on platforms like ARM, for example).
+ Even relatively modest values like 100 would work fine.*/
+#define VP8_LOTS_OF_BITS (0x40000000)
+
+typedef struct
+{
+ const unsigned char *user_buffer_end;
+ const unsigned char *user_buffer;
+ VP8_BD_VALUE value;
+ int count;
+ unsigned int range;
+ vpx_decrypt_cb decrypt_cb;
+ void *decrypt_state;
+} BOOL_DECODER;
+
+DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]);
+
+int vp8dx_start_decode(BOOL_DECODER *br,
+ const unsigned char *source,
+ unsigned int source_sz,
+ vpx_decrypt_cb decrypt_cb,
+ void *decrypt_state);
+
+void vp8dx_bool_decoder_fill(BOOL_DECODER *br);
+
+
+static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
+ unsigned int bit = 0;
+ VP8_BD_VALUE value;
+ unsigned int split;
+ VP8_BD_VALUE bigsplit;
+ int count;
+ unsigned int range;
+
+ split = 1 + (((br->range - 1) * probability) >> 8);
+
+ if(br->count < 0)
+ vp8dx_bool_decoder_fill(br);
+
+ value = br->value;
+ count = br->count;
+
+ bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8);
+
+ range = split;
+
+ if (value >= bigsplit)
+ {
+ range = br->range - split;
+ value = value - bigsplit;
+ bit = 1;
+ }
+
+ {
+ register int shift = vp8_norm[range];
+ range <<= shift;
+ value <<= shift;
+ count -= shift;
+ }
+ br->value = value;
+ br->count = count;
+ br->range = range;
+
+ return bit;
+}
+
+static INLINE int vp8_decode_value(BOOL_DECODER *br, int bits)
+{
+ int z = 0;
+ int bit;
+
+ for (bit = bits - 1; bit >= 0; bit--)
+ {
+ z |= (vp8dx_decode_bool(br, 0x80) << bit);
+ }
+
+ return z;
+}
+
+static INLINE int vp8dx_bool_error(BOOL_DECODER *br)
+{
+ /* Check if we have reached the end of the buffer.
+ *
+ * Variable 'count' stores the number of bits in the 'value' buffer, minus
+ * 8. The top byte is part of the algorithm, and the remainder is buffered
+ * to be shifted into it. So if count == 8, the top 16 bits of 'value' are
+ * occupied, 8 for the algorithm and 8 in the buffer.
+ *
+ * When reading a byte from the user's buffer, count is filled with 8 and
+ * one byte is filled into the value buffer. When we reach the end of the
+ * data, count is additionally filled with VP8_LOTS_OF_BITS. So when
+ * count == VP8_LOTS_OF_BITS - 1, the user's data has been exhausted.
+ */
+ if ((br->count > VP8_BD_VALUE_SIZE) && (br->count < VP8_LOTS_OF_BITS))
+ {
+ /* We have tried to decode bits after the end of
+ * stream was encountered.
+ */
+ return 1;
+ }
+
+ /* No error. */
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_DECODER_DBOOLHUFF_H_
diff --git a/thirdparty/libvpx/vp8/decoder/decodeframe.c b/thirdparty/libvpx/vp8/decoder/decodeframe.c
new file mode 100644
index 0000000000..51acdbb9c8
--- /dev/null
+++ b/thirdparty/libvpx/vp8/decoder/decodeframe.c
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+#include "onyxd_int.h"
+#include "vp8/common/header.h"
+#include "vp8/common/reconintra4x4.h"
+#include "vp8/common/reconinter.h"
+#include "detokenize.h"
+#include "vp8/common/common.h"
+#include "vp8/common/invtrans.h"
+#include "vp8/common/alloccommon.h"
+#include "vp8/common/entropymode.h"
+#include "vp8/common/quant_common.h"
+#include "vpx_scale/vpx_scale.h"
+#include "vp8/common/reconintra.h"
+#include "vp8/common/setupintrarecon.h"
+
+#include "decodemv.h"
+#include "vp8/common/extend.h"
+#if CONFIG_ERROR_CONCEALMENT
+#include "error_concealment.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/threading.h"
+#include "decoderthreading.h"
+#include "dboolhuff.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+void vp8cx_init_de_quantizer(VP8D_COMP *pbi)
+{
+ int Q;
+ VP8_COMMON *const pc = & pbi->common;
+
+ for (Q = 0; Q < QINDEX_RANGE; Q++)
+ {
+ pc->Y1dequant[Q][0] = (short)vp8_dc_quant(Q, pc->y1dc_delta_q);
+ pc->Y2dequant[Q][0] = (short)vp8_dc2quant(Q, pc->y2dc_delta_q);
+ pc->UVdequant[Q][0] = (short)vp8_dc_uv_quant(Q, pc->uvdc_delta_q);
+
+ pc->Y1dequant[Q][1] = (short)vp8_ac_yquant(Q);
+ pc->Y2dequant[Q][1] = (short)vp8_ac2quant(Q, pc->y2ac_delta_q);
+ pc->UVdequant[Q][1] = (short)vp8_ac_uv_quant(Q, pc->uvac_delta_q);
+ }
+}
+
+void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)
+{
+ int i;
+ int QIndex;
+ MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+ VP8_COMMON *const pc = & pbi->common;
+
+ /* Decide whether to use the default or alternate baseline Q value. */
+ if (xd->segmentation_enabled)
+ {
+ /* Abs Value */
+ if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+ QIndex = xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
+
+ /* Delta Value */
+ else
+ QIndex = pc->base_qindex + xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
+
+ QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0; /* Clamp to valid range */
+ }
+ else
+ QIndex = pc->base_qindex;
+
+ /* Set up the macroblock dequant constants */
+ xd->dequant_y1_dc[0] = 1;
+ xd->dequant_y1[0] = pc->Y1dequant[QIndex][0];
+ xd->dequant_y2[0] = pc->Y2dequant[QIndex][0];
+ xd->dequant_uv[0] = pc->UVdequant[QIndex][0];
+
+ for (i = 1; i < 16; i++)
+ {
+ xd->dequant_y1_dc[i] =
+ xd->dequant_y1[i] = pc->Y1dequant[QIndex][1];
+ xd->dequant_y2[i] = pc->Y2dequant[QIndex][1];
+ xd->dequant_uv[i] = pc->UVdequant[QIndex][1];
+ }
+}
+
+static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
+ unsigned int mb_idx)
+{
+ MB_PREDICTION_MODE mode;
+ int i;
+#if CONFIG_ERROR_CONCEALMENT
+ int corruption_detected = 0;
+#else
+ (void)mb_idx;
+#endif
+
+ if (xd->mode_info_context->mbmi.mb_skip_coeff)
+ {
+ vp8_reset_mb_tokens_context(xd);
+ }
+ else if (!vp8dx_bool_error(xd->current_bc))
+ {
+ int eobtotal;
+ eobtotal = vp8_decode_mb_tokens(pbi, xd);
+
+ /* Special case: Force the loopfilter to skip when eobtotal is zero */
+ xd->mode_info_context->mbmi.mb_skip_coeff = (eobtotal==0);
+ }
+
+ mode = xd->mode_info_context->mbmi.mode;
+
+ if (xd->segmentation_enabled)
+ vp8_mb_init_dequantizer(pbi, xd);
+
+
+#if CONFIG_ERROR_CONCEALMENT
+
+ if(pbi->ec_active)
+ {
+ int throw_residual;
+ /* When we have independent partitions we can apply residual even
+ * though other partitions within the frame are corrupt.
+ */
+ throw_residual = (!pbi->independent_partitions &&
+ pbi->frame_corrupt_residual);
+ throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc));
+
+ if ((mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual))
+ {
+ /* MB with corrupt residuals or corrupt mode/motion vectors.
+ * Better to use the predictor as reconstruction.
+ */
+ pbi->frame_corrupt_residual = 1;
+ memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+
+ corruption_detected = 1;
+
+ /* force idct to be skipped for B_PRED and use the
+ * prediction only for reconstruction
+ * */
+ memset(xd->eobs, 0, 25);
+ }
+ }
+#endif
+
+ /* do prediction */
+ if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+ {
+ vp8_build_intra_predictors_mbuv_s(xd,
+ xd->recon_above[1],
+ xd->recon_above[2],
+ xd->recon_left[1],
+ xd->recon_left[2],
+ xd->recon_left_stride[1],
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride);
+
+ if (mode != B_PRED)
+ {
+ vp8_build_intra_predictors_mby_s(xd,
+ xd->recon_above[0],
+ xd->recon_left[0],
+ xd->recon_left_stride[0],
+ xd->dst.y_buffer,
+ xd->dst.y_stride);
+ }
+ else
+ {
+ short *DQC = xd->dequant_y1;
+ int dst_stride = xd->dst.y_stride;
+
+ /* clear out residual eob info */
+ if(xd->mode_info_context->mbmi.mb_skip_coeff)
+ memset(xd->eobs, 0, 25);
+
+ intra_prediction_down_copy(xd, xd->recon_above[0] + 16);
+
+ for (i = 0; i < 16; i++)
+ {
+ BLOCKD *b = &xd->block[i];
+ unsigned char *dst = xd->dst.y_buffer + b->offset;
+ B_PREDICTION_MODE b_mode =
+ xd->mode_info_context->bmi[i].as_mode;
+ unsigned char *Above = dst - dst_stride;
+ unsigned char *yleft = dst - 1;
+ int left_stride = dst_stride;
+ unsigned char top_left = Above[-1];
+
+ vp8_intra4x4_predict(Above, yleft, left_stride, b_mode,
+ dst, dst_stride, top_left);
+
+ if (xd->eobs[i])
+ {
+ if (xd->eobs[i] > 1)
+ {
+ vp8_dequant_idct_add(b->qcoeff, DQC, dst, dst_stride);
+ }
+ else
+ {
+ vp8_dc_only_idct_add
+ (b->qcoeff[0] * DQC[0],
+ dst, dst_stride,
+ dst, dst_stride);
+ memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ vp8_build_inter_predictors_mb(xd);
+ }
+
+
+#if CONFIG_ERROR_CONCEALMENT
+ if (corruption_detected)
+ {
+ return;
+ }
+#endif
+
+ if(!xd->mode_info_context->mbmi.mb_skip_coeff)
+ {
+ /* dequantization and idct */
+ if (mode != B_PRED)
+ {
+ short *DQC = xd->dequant_y1;
+
+ if (mode != SPLITMV)
+ {
+ BLOCKD *b = &xd->block[24];
+
+ /* do 2nd order transform on the dc block */
+ if (xd->eobs[24] > 1)
+ {
+ vp8_dequantize_b(b, xd->dequant_y2);
+
+ vp8_short_inv_walsh4x4(&b->dqcoeff[0],
+ xd->qcoeff);
+ memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0]));
+ }
+ else
+ {
+ b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
+ vp8_short_inv_walsh4x4_1(&b->dqcoeff[0],
+ xd->qcoeff);
+ memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
+ }
+
+ /* override the dc dequant constant in order to preserve the
+ * dc components
+ */
+ DQC = xd->dequant_y1_dc;
+ }
+
+ vp8_dequant_idct_add_y_block
+ (xd->qcoeff, DQC,
+ xd->dst.y_buffer,
+ xd->dst.y_stride, xd->eobs);
+ }
+
+ vp8_dequant_idct_add_uv_block
+ (xd->qcoeff+16*16, xd->dequant_uv,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride, xd->eobs+16);
+ }
+}
+
+static int get_delta_q(vp8_reader *bc, int prev, int *q_update)
+{
+ int ret_val = 0;
+
+ if (vp8_read_bit(bc))
+ {
+ ret_val = vp8_read_literal(bc, 4);
+
+ if (vp8_read_bit(bc))
+ ret_val = -ret_val;
+ }
+
+ /* Trigger a quantizer update if the delta-q value has changed */
+ if (ret_val != prev)
+ *q_update = 1;
+
+ return ret_val;
+}
+
+#ifdef PACKET_TESTING
+#include <stdio.h>
+FILE *vpxlog = 0;
+#endif
+
+static void yv12_extend_frame_top_c(YV12_BUFFER_CONFIG *ybf)
+{
+ int i;
+ unsigned char *src_ptr1;
+ unsigned char *dest_ptr1;
+
+ unsigned int Border;
+ int plane_stride;
+
+ /***********/
+ /* Y Plane */
+ /***********/
+ Border = ybf->border;
+ plane_stride = ybf->y_stride;
+ src_ptr1 = ybf->y_buffer - Border;
+ dest_ptr1 = src_ptr1 - (Border * plane_stride);
+
+ for (i = 0; i < (int)Border; i++)
+ {
+ memcpy(dest_ptr1, src_ptr1, plane_stride);
+ dest_ptr1 += plane_stride;
+ }
+
+
+ /***********/
+ /* U Plane */
+ /***********/
+ plane_stride = ybf->uv_stride;
+ Border /= 2;
+ src_ptr1 = ybf->u_buffer - Border;
+ dest_ptr1 = src_ptr1 - (Border * plane_stride);
+
+ for (i = 0; i < (int)(Border); i++)
+ {
+ memcpy(dest_ptr1, src_ptr1, plane_stride);
+ dest_ptr1 += plane_stride;
+ }
+
+ /***********/
+ /* V Plane */
+ /***********/
+
+ src_ptr1 = ybf->v_buffer - Border;
+ dest_ptr1 = src_ptr1 - (Border * plane_stride);
+
+ for (i = 0; i < (int)(Border); i++)
+ {
+ memcpy(dest_ptr1, src_ptr1, plane_stride);
+ dest_ptr1 += plane_stride;
+ }
+}
+
+static void yv12_extend_frame_bottom_c(YV12_BUFFER_CONFIG *ybf)
+{
+ int i;
+ unsigned char *src_ptr1, *src_ptr2;
+ unsigned char *dest_ptr2;
+
+ unsigned int Border;
+ int plane_stride;
+ int plane_height;
+
+ /***********/
+ /* Y Plane */
+ /***********/
+ Border = ybf->border;
+ plane_stride = ybf->y_stride;
+ plane_height = ybf->y_height;
+
+ src_ptr1 = ybf->y_buffer - Border;
+ src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
+ dest_ptr2 = src_ptr2 + plane_stride;
+
+ for (i = 0; i < (int)Border; i++)
+ {
+ memcpy(dest_ptr2, src_ptr2, plane_stride);
+ dest_ptr2 += plane_stride;
+ }
+
+
+ /***********/
+ /* U Plane */
+ /***********/
+ plane_stride = ybf->uv_stride;
+ plane_height = ybf->uv_height;
+ Border /= 2;
+
+ src_ptr1 = ybf->u_buffer - Border;
+ src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
+ dest_ptr2 = src_ptr2 + plane_stride;
+
+ for (i = 0; i < (int)(Border); i++)
+ {
+ memcpy(dest_ptr2, src_ptr2, plane_stride);
+ dest_ptr2 += plane_stride;
+ }
+
+ /***********/
+ /* V Plane */
+ /***********/
+
+ src_ptr1 = ybf->v_buffer - Border;
+ src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
+ dest_ptr2 = src_ptr2 + plane_stride;
+
+ for (i = 0; i < (int)(Border); i++)
+ {
+ memcpy(dest_ptr2, src_ptr2, plane_stride);
+ dest_ptr2 += plane_stride;
+ }
+}
+
+static void yv12_extend_frame_left_right_c(YV12_BUFFER_CONFIG *ybf,
+ unsigned char *y_src,
+ unsigned char *u_src,
+ unsigned char *v_src)
+{
+ int i;
+ unsigned char *src_ptr1, *src_ptr2;
+ unsigned char *dest_ptr1, *dest_ptr2;
+
+ unsigned int Border;
+ int plane_stride;
+ int plane_height;
+ int plane_width;
+
+ /***********/
+ /* Y Plane */
+ /***********/
+ Border = ybf->border;
+ plane_stride = ybf->y_stride;
+ plane_height = 16;
+ plane_width = ybf->y_width;
+
+ /* copy the left and right most columns out */
+ src_ptr1 = y_src;
+ src_ptr2 = src_ptr1 + plane_width - 1;
+ dest_ptr1 = src_ptr1 - Border;
+ dest_ptr2 = src_ptr2 + 1;
+
+ for (i = 0; i < plane_height; i++)
+ {
+ memset(dest_ptr1, src_ptr1[0], Border);
+ memset(dest_ptr2, src_ptr2[0], Border);
+ src_ptr1 += plane_stride;
+ src_ptr2 += plane_stride;
+ dest_ptr1 += plane_stride;
+ dest_ptr2 += plane_stride;
+ }
+
+ /***********/
+ /* U Plane */
+ /***********/
+ plane_stride = ybf->uv_stride;
+ plane_height = 8;
+ plane_width = ybf->uv_width;
+ Border /= 2;
+
+ /* copy the left and right most columns out */
+ src_ptr1 = u_src;
+ src_ptr2 = src_ptr1 + plane_width - 1;
+ dest_ptr1 = src_ptr1 - Border;
+ dest_ptr2 = src_ptr2 + 1;
+
+ for (i = 0; i < plane_height; i++)
+ {
+ memset(dest_ptr1, src_ptr1[0], Border);
+ memset(dest_ptr2, src_ptr2[0], Border);
+ src_ptr1 += plane_stride;
+ src_ptr2 += plane_stride;
+ dest_ptr1 += plane_stride;
+ dest_ptr2 += plane_stride;
+ }
+
+ /***********/
+ /* V Plane */
+ /***********/
+
+ /* copy the left and right most columns out */
+ src_ptr1 = v_src;
+ src_ptr2 = src_ptr1 + plane_width - 1;
+ dest_ptr1 = src_ptr1 - Border;
+ dest_ptr2 = src_ptr2 + 1;
+
+ for (i = 0; i < plane_height; i++)
+ {
+ memset(dest_ptr1, src_ptr1[0], Border);
+ memset(dest_ptr2, src_ptr2[0], Border);
+ src_ptr1 += plane_stride;
+ src_ptr2 += plane_stride;
+ dest_ptr1 += plane_stride;
+ dest_ptr2 += plane_stride;
+ }
+}
+
+static void decode_mb_rows(VP8D_COMP *pbi)
+{
+ VP8_COMMON *const pc = & pbi->common;
+ MACROBLOCKD *const xd = & pbi->mb;
+
+ MODE_INFO *lf_mic = xd->mode_info_context;
+
+ int ibc = 0;
+ int num_part = 1 << pc->multi_token_partition;
+
+ int recon_yoffset, recon_uvoffset;
+ int mb_row, mb_col;
+ int mb_idx = 0;
+
+ YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME];
+
+ int recon_y_stride = yv12_fb_new->y_stride;
+ int recon_uv_stride = yv12_fb_new->uv_stride;
+
+ unsigned char *ref_buffer[MAX_REF_FRAMES][3];
+ unsigned char *dst_buffer[3];
+ unsigned char *lf_dst[3];
+ unsigned char *eb_dst[3];
+ int i;
+ int ref_fb_corrupted[MAX_REF_FRAMES];
+
+ ref_fb_corrupted[INTRA_FRAME] = 0;
+
+ for(i = 1; i < MAX_REF_FRAMES; i++)
+ {
+ YV12_BUFFER_CONFIG *this_fb = pbi->dec_fb_ref[i];
+
+ ref_buffer[i][0] = this_fb->y_buffer;
+ ref_buffer[i][1] = this_fb->u_buffer;
+ ref_buffer[i][2] = this_fb->v_buffer;
+
+ ref_fb_corrupted[i] = this_fb->corrupted;
+ }
+
+ /* Set up the buffer pointers */
+ eb_dst[0] = lf_dst[0] = dst_buffer[0] = yv12_fb_new->y_buffer;
+ eb_dst[1] = lf_dst[1] = dst_buffer[1] = yv12_fb_new->u_buffer;
+ eb_dst[2] = lf_dst[2] = dst_buffer[2] = yv12_fb_new->v_buffer;
+
+ xd->up_available = 0;
+
+ /* Initialize the loop filter for this frame. */
+ if(pc->filter_level)
+ vp8_loop_filter_frame_init(pc, xd, pc->filter_level);
+
+ vp8_setup_intra_recon_top_line(yv12_fb_new);
+
+ /* Decode the individual macro block */
+ for (mb_row = 0; mb_row < pc->mb_rows; mb_row++)
+ {
+ if (num_part > 1)
+ {
+ xd->current_bc = & pbi->mbc[ibc];
+ ibc++;
+
+ if (ibc == num_part)
+ ibc = 0;
+ }
+
+ recon_yoffset = mb_row * recon_y_stride * 16;
+ recon_uvoffset = mb_row * recon_uv_stride * 8;
+
+ /* reset contexts */
+ xd->above_context = pc->above_context;
+ memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ xd->left_available = 0;
+
+ xd->mb_to_top_edge = -((mb_row * 16) << 3);
+ xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+ xd->recon_above[0] = dst_buffer[0] + recon_yoffset;
+ xd->recon_above[1] = dst_buffer[1] + recon_uvoffset;
+ xd->recon_above[2] = dst_buffer[2] + recon_uvoffset;
+
+ xd->recon_left[0] = xd->recon_above[0] - 1;
+ xd->recon_left[1] = xd->recon_above[1] - 1;
+ xd->recon_left[2] = xd->recon_above[2] - 1;
+
+ xd->recon_above[0] -= xd->dst.y_stride;
+ xd->recon_above[1] -= xd->dst.uv_stride;
+ xd->recon_above[2] -= xd->dst.uv_stride;
+
+ /* TODO: move to outside row loop */
+ xd->recon_left_stride[0] = xd->dst.y_stride;
+ xd->recon_left_stride[1] = xd->dst.uv_stride;
+
+ setup_intra_recon_left(xd->recon_left[0], xd->recon_left[1],
+ xd->recon_left[2], xd->dst.y_stride,
+ xd->dst.uv_stride);
+
+ for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
+ {
+ /* Distance of Mb to the various image edges.
+ * These are specified to 8th pel as they are always compared to values
+ * that are in 1/8th pel units
+ */
+ xd->mb_to_left_edge = -((mb_col * 16) << 3);
+ xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+
+#if CONFIG_ERROR_CONCEALMENT
+ {
+ int corrupt_residual = (!pbi->independent_partitions &&
+ pbi->frame_corrupt_residual) ||
+ vp8dx_bool_error(xd->current_bc);
+ if (pbi->ec_active &&
+ xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME &&
+ corrupt_residual)
+ {
+ /* We have an intra block with corrupt coefficients, better to
+ * conceal with an inter block. Interpolate MVs from neighboring
+ * MBs.
+ *
+ * Note that for the first mb with corrupt residual in a frame,
+ * we might not discover that before decoding the residual. That
+ * happens after this check, and therefore no inter concealment
+ * will be done.
+ */
+ vp8_interpolate_motion(xd,
+ mb_row, mb_col,
+ pc->mb_rows, pc->mb_cols);
+ }
+ }
+#endif
+
+ xd->dst.y_buffer = dst_buffer[0] + recon_yoffset;
+ xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset;
+ xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset;
+
+ if (xd->mode_info_context->mbmi.ref_frame >= LAST_FRAME) {
+ const MV_REFERENCE_FRAME ref = xd->mode_info_context->mbmi.ref_frame;
+ xd->pre.y_buffer = ref_buffer[ref][0] + recon_yoffset;
+ xd->pre.u_buffer = ref_buffer[ref][1] + recon_uvoffset;
+ xd->pre.v_buffer = ref_buffer[ref][2] + recon_uvoffset;
+ } else {
+ // ref_frame is INTRA_FRAME, pre buffer should not be used.
+ xd->pre.y_buffer = 0;
+ xd->pre.u_buffer = 0;
+ xd->pre.v_buffer = 0;
+ }
+
+ /* propagate errors from reference frames */
+ xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame];
+
+ decode_macroblock(pbi, xd, mb_idx);
+
+ mb_idx++;
+ xd->left_available = 1;
+
+ /* check if the boolean decoder has suffered an error */
+ xd->corrupted |= vp8dx_bool_error(xd->current_bc);
+
+ xd->recon_above[0] += 16;
+ xd->recon_above[1] += 8;
+ xd->recon_above[2] += 8;
+ xd->recon_left[0] += 16;
+ xd->recon_left[1] += 8;
+ xd->recon_left[2] += 8;
+
+ recon_yoffset += 16;
+ recon_uvoffset += 8;
+
+ ++xd->mode_info_context; /* next mb */
+
+ xd->above_context++;
+ }
+
+ /* adjust to the next row of mbs */
+ vp8_extend_mb_row(yv12_fb_new, xd->dst.y_buffer + 16,
+ xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+
+ ++xd->mode_info_context; /* skip prediction column */
+ xd->up_available = 1;
+
+ if(pc->filter_level)
+ {
+ if(mb_row > 0)
+ {
+ if (pc->filter_type == NORMAL_LOOPFILTER)
+ vp8_loop_filter_row_normal(pc, lf_mic, mb_row-1,
+ recon_y_stride, recon_uv_stride,
+ lf_dst[0], lf_dst[1], lf_dst[2]);
+ else
+ vp8_loop_filter_row_simple(pc, lf_mic, mb_row-1,
+ recon_y_stride, recon_uv_stride,
+ lf_dst[0], lf_dst[1], lf_dst[2]);
+ if(mb_row > 1)
+ {
+ yv12_extend_frame_left_right_c(yv12_fb_new,
+ eb_dst[0],
+ eb_dst[1],
+ eb_dst[2]);
+
+ eb_dst[0] += recon_y_stride * 16;
+ eb_dst[1] += recon_uv_stride * 8;
+ eb_dst[2] += recon_uv_stride * 8;
+ }
+
+ lf_dst[0] += recon_y_stride * 16;
+ lf_dst[1] += recon_uv_stride * 8;
+ lf_dst[2] += recon_uv_stride * 8;
+ lf_mic += pc->mb_cols;
+ lf_mic++; /* Skip border mb */
+ }
+ }
+ else
+ {
+ if(mb_row > 0)
+ {
+ /**/
+ yv12_extend_frame_left_right_c(yv12_fb_new,
+ eb_dst[0],
+ eb_dst[1],
+ eb_dst[2]);
+ eb_dst[0] += recon_y_stride * 16;
+ eb_dst[1] += recon_uv_stride * 8;
+ eb_dst[2] += recon_uv_stride * 8;
+ }
+ }
+ }
+
+ if(pc->filter_level)
+ {
+ if (pc->filter_type == NORMAL_LOOPFILTER)
+ vp8_loop_filter_row_normal(pc, lf_mic, mb_row-1, recon_y_stride,
+ recon_uv_stride, lf_dst[0], lf_dst[1],
+ lf_dst[2]);
+ else
+ vp8_loop_filter_row_simple(pc, lf_mic, mb_row-1, recon_y_stride,
+ recon_uv_stride, lf_dst[0], lf_dst[1],
+ lf_dst[2]);
+
+ yv12_extend_frame_left_right_c(yv12_fb_new,
+ eb_dst[0],
+ eb_dst[1],
+ eb_dst[2]);
+ eb_dst[0] += recon_y_stride * 16;
+ eb_dst[1] += recon_uv_stride * 8;
+ eb_dst[2] += recon_uv_stride * 8;
+ }
+ yv12_extend_frame_left_right_c(yv12_fb_new,
+ eb_dst[0],
+ eb_dst[1],
+ eb_dst[2]);
+ yv12_extend_frame_top_c(yv12_fb_new);
+ yv12_extend_frame_bottom_c(yv12_fb_new);
+
+}
+
+static unsigned int read_partition_size(VP8D_COMP *pbi,
+ const unsigned char *cx_size)
+{
+ unsigned char temp[3];
+ if (pbi->decrypt_cb)
+ {
+ pbi->decrypt_cb(pbi->decrypt_state, cx_size, temp, 3);
+ cx_size = temp;
+ }
+ return cx_size[0] + (cx_size[1] << 8) + (cx_size[2] << 16);
+}
+
+static int read_is_valid(const unsigned char *start,
+ size_t len,
+ const unsigned char *end)
+{
+ return (start + len > start && start + len <= end);
+}
+
+static unsigned int read_available_partition_size(
+ VP8D_COMP *pbi,
+ const unsigned char *token_part_sizes,
+ const unsigned char *fragment_start,
+ const unsigned char *first_fragment_end,
+ const unsigned char *fragment_end,
+ int i,
+ int num_part)
+{
+ VP8_COMMON* pc = &pbi->common;
+ const unsigned char *partition_size_ptr = token_part_sizes + i * 3;
+ unsigned int partition_size = 0;
+ ptrdiff_t bytes_left = fragment_end - fragment_start;
+ /* Calculate the length of this partition. The last partition
+ * size is implicit. If the partition size can't be read, then
+ * either use the remaining data in the buffer (for EC mode)
+ * or throw an error.
+ */
+ if (i < num_part - 1)
+ {
+ if (read_is_valid(partition_size_ptr, 3, first_fragment_end))
+ partition_size = read_partition_size(pbi, partition_size_ptr);
+ else if (pbi->ec_active)
+ partition_size = (unsigned int)bytes_left;
+ else
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Truncated partition size data");
+ }
+ else
+ partition_size = (unsigned int)bytes_left;
+
+ /* Validate the calculated partition length. If the buffer
+ * described by the partition can't be fully read, then restrict
+ * it to the portion that can be (for EC mode) or throw an error.
+ */
+ if (!read_is_valid(fragment_start, partition_size, fragment_end))
+ {
+ if (pbi->ec_active)
+ partition_size = (unsigned int)bytes_left;
+ else
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt partition "
+ "%d length", i + 1);
+ }
+ return partition_size;
+}
+
+
+static void setup_token_decoder(VP8D_COMP *pbi,
+ const unsigned char* token_part_sizes)
+{
+ vp8_reader *bool_decoder = &pbi->mbc[0];
+ unsigned int partition_idx;
+ unsigned int fragment_idx;
+ unsigned int num_token_partitions;
+ const unsigned char *first_fragment_end = pbi->fragments.ptrs[0] +
+ pbi->fragments.sizes[0];
+
+ TOKEN_PARTITION multi_token_partition =
+ (TOKEN_PARTITION)vp8_read_literal(&pbi->mbc[8], 2);
+ if (!vp8dx_bool_error(&pbi->mbc[8]))
+ pbi->common.multi_token_partition = multi_token_partition;
+ num_token_partitions = 1 << pbi->common.multi_token_partition;
+
+ /* Check for partitions within the fragments and unpack the fragments
+ * so that each fragment pointer points to its corresponding partition. */
+ for (fragment_idx = 0; fragment_idx < pbi->fragments.count; ++fragment_idx)
+ {
+ unsigned int fragment_size = pbi->fragments.sizes[fragment_idx];
+ const unsigned char *fragment_end = pbi->fragments.ptrs[fragment_idx] +
+ fragment_size;
+ /* Special case for handling the first partition since we have already
+ * read its size. */
+ if (fragment_idx == 0)
+ {
+ /* Size of first partition + token partition sizes element */
+ ptrdiff_t ext_first_part_size = token_part_sizes -
+ pbi->fragments.ptrs[0] + 3 * (num_token_partitions - 1);
+ fragment_size -= (unsigned int)ext_first_part_size;
+ if (fragment_size > 0)
+ {
+ pbi->fragments.sizes[0] = (unsigned int)ext_first_part_size;
+ /* The fragment contains an additional partition. Move to
+ * next. */
+ fragment_idx++;
+ pbi->fragments.ptrs[fragment_idx] = pbi->fragments.ptrs[0] +
+ pbi->fragments.sizes[0];
+ }
+ }
+ /* Split the chunk into partitions read from the bitstream */
+ while (fragment_size > 0)
+ {
+ ptrdiff_t partition_size = read_available_partition_size(
+ pbi,
+ token_part_sizes,
+ pbi->fragments.ptrs[fragment_idx],
+ first_fragment_end,
+ fragment_end,
+ fragment_idx - 1,
+ num_token_partitions);
+ pbi->fragments.sizes[fragment_idx] = (unsigned int)partition_size;
+ fragment_size -= (unsigned int)partition_size;
+ assert(fragment_idx <= num_token_partitions);
+ if (fragment_size > 0)
+ {
+ /* The fragment contains an additional partition.
+ * Move to next. */
+ fragment_idx++;
+ pbi->fragments.ptrs[fragment_idx] =
+ pbi->fragments.ptrs[fragment_idx - 1] + partition_size;
+ }
+ }
+ }
+
+ pbi->fragments.count = num_token_partitions + 1;
+
+ for (partition_idx = 1; partition_idx < pbi->fragments.count; ++partition_idx)
+ {
+ if (vp8dx_start_decode(bool_decoder,
+ pbi->fragments.ptrs[partition_idx],
+ pbi->fragments.sizes[partition_idx],
+ pbi->decrypt_cb, pbi->decrypt_state))
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate bool decoder %d",
+ partition_idx);
+
+ bool_decoder++;
+ }
+
+#if CONFIG_MULTITHREAD
+ /* Clamp number of decoder threads */
+ if (pbi->decoding_thread_count > num_token_partitions - 1)
+ pbi->decoding_thread_count = num_token_partitions - 1;
+#endif
+}
+
+
+static void init_frame(VP8D_COMP *pbi)
+{
+ VP8_COMMON *const pc = & pbi->common;
+ MACROBLOCKD *const xd = & pbi->mb;
+
+ if (pc->frame_type == KEY_FRAME)
+ {
+ /* Various keyframe initializations */
+ memcpy(pc->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+
+ vp8_init_mbmode_probs(pc);
+
+ vp8_default_coef_probs(pc);
+
+ /* reset the segment feature data to 0 with delta coding (Default state). */
+ memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
+ xd->mb_segement_abs_delta = SEGMENT_DELTADATA;
+
+ /* reset the mode ref deltasa for loop filter */
+ memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
+ memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
+
+ /* All buffers are implicitly updated on key frames. */
+ pc->refresh_golden_frame = 1;
+ pc->refresh_alt_ref_frame = 1;
+ pc->copy_buffer_to_gf = 0;
+ pc->copy_buffer_to_arf = 0;
+
+ /* Note that Golden and Altref modes cannot be used on a key frame so
+ * ref_frame_sign_bias[] is undefined and meaningless
+ */
+ pc->ref_frame_sign_bias[GOLDEN_FRAME] = 0;
+ pc->ref_frame_sign_bias[ALTREF_FRAME] = 0;
+ }
+ else
+ {
+ /* To enable choice of different interploation filters */
+ if (!pc->use_bilinear_mc_filter)
+ {
+ xd->subpixel_predict = vp8_sixtap_predict4x4;
+ xd->subpixel_predict8x4 = vp8_sixtap_predict8x4;
+ xd->subpixel_predict8x8 = vp8_sixtap_predict8x8;
+ xd->subpixel_predict16x16 = vp8_sixtap_predict16x16;
+ }
+ else
+ {
+ xd->subpixel_predict = vp8_bilinear_predict4x4;
+ xd->subpixel_predict8x4 = vp8_bilinear_predict8x4;
+ xd->subpixel_predict8x8 = vp8_bilinear_predict8x8;
+ xd->subpixel_predict16x16 = vp8_bilinear_predict16x16;
+ }
+
+ if (pbi->decoded_key_frame && pbi->ec_enabled && !pbi->ec_active)
+ pbi->ec_active = 1;
+ }
+
+ xd->left_context = &pc->left_context;
+ xd->mode_info_context = pc->mi;
+ xd->frame_type = pc->frame_type;
+ xd->mode_info_context->mbmi.mode = DC_PRED;
+ xd->mode_info_stride = pc->mode_info_stride;
+ xd->corrupted = 0; /* init without corruption */
+
+ xd->fullpixel_mask = 0xffffffff;
+ if(pc->full_pixel)
+ xd->fullpixel_mask = 0xfffffff8;
+
+}
+
+int vp8_decode_frame(VP8D_COMP *pbi)
+{
+ vp8_reader *const bc = &pbi->mbc[8];
+ VP8_COMMON *const pc = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
+ const unsigned char *data = pbi->fragments.ptrs[0];
+ const unsigned int data_sz = pbi->fragments.sizes[0];
+ const unsigned char *data_end = data + data_sz;
+ ptrdiff_t first_partition_length_in_bytes;
+
+ int i, j, k, l;
+ const int *const mb_feature_data_bits = vp8_mb_feature_data_bits;
+ int corrupt_tokens = 0;
+ int prev_independent_partitions = pbi->independent_partitions;
+
+ YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME];
+
+ /* start with no corruption of current frame */
+ xd->corrupted = 0;
+ yv12_fb_new->corrupted = 0;
+
+ if (data_end - data < 3)
+ {
+ if (!pbi->ec_active)
+ {
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Truncated packet");
+ }
+
+ /* Declare the missing frame as an inter frame since it will
+ be handled as an inter frame when we have estimated its
+ motion vectors. */
+ pc->frame_type = INTER_FRAME;
+ pc->version = 0;
+ pc->show_frame = 1;
+ first_partition_length_in_bytes = 0;
+ }
+ else
+ {
+ unsigned char clear_buffer[10];
+ const unsigned char *clear = data;
+ if (pbi->decrypt_cb)
+ {
+ int n = (int)VPXMIN(sizeof(clear_buffer), data_sz);
+ pbi->decrypt_cb(pbi->decrypt_state, data, clear_buffer, n);
+ clear = clear_buffer;
+ }
+
+ pc->frame_type = (FRAME_TYPE)(clear[0] & 1);
+ pc->version = (clear[0] >> 1) & 7;
+ pc->show_frame = (clear[0] >> 4) & 1;
+ first_partition_length_in_bytes =
+ (clear[0] | (clear[1] << 8) | (clear[2] << 16)) >> 5;
+
+ if (!pbi->ec_active &&
+ (data + first_partition_length_in_bytes > data_end
+ || data + first_partition_length_in_bytes < data))
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt partition 0 length");
+
+ data += 3;
+ clear += 3;
+
+ vp8_setup_version(pc);
+
+
+ if (pc->frame_type == KEY_FRAME)
+ {
+ /* vet via sync code */
+ /* When error concealment is enabled we should only check the sync
+ * code if we have enough bits available
+ */
+ if (!pbi->ec_active || data + 3 < data_end)
+ {
+ if (clear[0] != 0x9d || clear[1] != 0x01 || clear[2] != 0x2a)
+ vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM,
+ "Invalid frame sync code");
+ }
+
+ /* If error concealment is enabled we should only parse the new size
+ * if we have enough data. Otherwise we will end up with the wrong
+ * size.
+ */
+ if (!pbi->ec_active || data + 6 < data_end)
+ {
+ pc->Width = (clear[3] | (clear[4] << 8)) & 0x3fff;
+ pc->horiz_scale = clear[4] >> 6;
+ pc->Height = (clear[5] | (clear[6] << 8)) & 0x3fff;
+ pc->vert_scale = clear[6] >> 6;
+ }
+ data += 7;
+ }
+ else
+ {
+ memcpy(&xd->pre, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
+ memcpy(&xd->dst, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
+ }
+ }
+ if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME))
+ {
+ return -1;
+ }
+
+ init_frame(pbi);
+
+ if (vp8dx_start_decode(bc, data, (unsigned int)(data_end - data),
+ pbi->decrypt_cb, pbi->decrypt_state))
+ vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate bool decoder 0");
+ if (pc->frame_type == KEY_FRAME) {
+ (void)vp8_read_bit(bc); // colorspace
+ pc->clamp_type = (CLAMP_TYPE)vp8_read_bit(bc);
+ }
+
+ /* Is segmentation enabled */
+ xd->segmentation_enabled = (unsigned char)vp8_read_bit(bc);
+
+ if (xd->segmentation_enabled)
+ {
+ /* Signal whether or not the segmentation map is being explicitly updated this frame. */
+ xd->update_mb_segmentation_map = (unsigned char)vp8_read_bit(bc);
+ xd->update_mb_segmentation_data = (unsigned char)vp8_read_bit(bc);
+
+ if (xd->update_mb_segmentation_data)
+ {
+ xd->mb_segement_abs_delta = (unsigned char)vp8_read_bit(bc);
+
+ memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
+
+ /* For each segmentation feature (Quant and loop filter level) */
+ for (i = 0; i < MB_LVL_MAX; i++)
+ {
+ for (j = 0; j < MAX_MB_SEGMENTS; j++)
+ {
+ /* Frame level data */
+ if (vp8_read_bit(bc))
+ {
+ xd->segment_feature_data[i][j] = (signed char)vp8_read_literal(bc, mb_feature_data_bits[i]);
+
+ if (vp8_read_bit(bc))
+ xd->segment_feature_data[i][j] = -xd->segment_feature_data[i][j];
+ }
+ else
+ xd->segment_feature_data[i][j] = 0;
+ }
+ }
+ }
+
+ if (xd->update_mb_segmentation_map)
+ {
+ /* Which macro block level features are enabled */
+ memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
+
+ /* Read the probs used to decode the segment id for each macro block. */
+ for (i = 0; i < MB_FEATURE_TREE_PROBS; i++)
+ {
+ /* If not explicitly set value is defaulted to 255 by memset above */
+ if (vp8_read_bit(bc))
+ xd->mb_segment_tree_probs[i] = (vp8_prob)vp8_read_literal(bc, 8);
+ }
+ }
+ }
+ else
+ {
+ /* No segmentation updates on this frame */
+ xd->update_mb_segmentation_map = 0;
+ xd->update_mb_segmentation_data = 0;
+ }
+
+ /* Read the loop filter level and type */
+ pc->filter_type = (LOOPFILTERTYPE) vp8_read_bit(bc);
+ pc->filter_level = vp8_read_literal(bc, 6);
+ pc->sharpness_level = vp8_read_literal(bc, 3);
+
+ /* Read in loop filter deltas applied at the MB level based on mode or ref frame. */
+ xd->mode_ref_lf_delta_update = 0;
+ xd->mode_ref_lf_delta_enabled = (unsigned char)vp8_read_bit(bc);
+
+ if (xd->mode_ref_lf_delta_enabled)
+ {
+ /* Do the deltas need to be updated */
+ xd->mode_ref_lf_delta_update = (unsigned char)vp8_read_bit(bc);
+
+ if (xd->mode_ref_lf_delta_update)
+ {
+ /* Send update */
+ for (i = 0; i < MAX_REF_LF_DELTAS; i++)
+ {
+ if (vp8_read_bit(bc))
+ {
+ /*sign = vp8_read_bit( bc );*/
+ xd->ref_lf_deltas[i] = (signed char)vp8_read_literal(bc, 6);
+
+ if (vp8_read_bit(bc)) /* Apply sign */
+ xd->ref_lf_deltas[i] = xd->ref_lf_deltas[i] * -1;
+ }
+ }
+
+ /* Send update */
+ for (i = 0; i < MAX_MODE_LF_DELTAS; i++)
+ {
+ if (vp8_read_bit(bc))
+ {
+ /*sign = vp8_read_bit( bc );*/
+ xd->mode_lf_deltas[i] = (signed char)vp8_read_literal(bc, 6);
+
+ if (vp8_read_bit(bc)) /* Apply sign */
+ xd->mode_lf_deltas[i] = xd->mode_lf_deltas[i] * -1;
+ }
+ }
+ }
+ }
+
+ setup_token_decoder(pbi, data + first_partition_length_in_bytes);
+
+ xd->current_bc = &pbi->mbc[0];
+
+ /* Read the default quantizers. */
+ {
+ int Q, q_update;
+
+ Q = vp8_read_literal(bc, 7); /* AC 1st order Q = default */
+ pc->base_qindex = Q;
+ q_update = 0;
+ pc->y1dc_delta_q = get_delta_q(bc, pc->y1dc_delta_q, &q_update);
+ pc->y2dc_delta_q = get_delta_q(bc, pc->y2dc_delta_q, &q_update);
+ pc->y2ac_delta_q = get_delta_q(bc, pc->y2ac_delta_q, &q_update);
+ pc->uvdc_delta_q = get_delta_q(bc, pc->uvdc_delta_q, &q_update);
+ pc->uvac_delta_q = get_delta_q(bc, pc->uvac_delta_q, &q_update);
+
+ if (q_update)
+ vp8cx_init_de_quantizer(pbi);
+
+ /* MB level dequantizer setup */
+ vp8_mb_init_dequantizer(pbi, &pbi->mb);
+ }
+
+ /* Determine if the golden frame or ARF buffer should be updated and how.
+ * For all non key frames the GF and ARF refresh flags and sign bias
+ * flags must be set explicitly.
+ */
+ if (pc->frame_type != KEY_FRAME)
+ {
+ /* Should the GF or ARF be updated from the current frame */
+ pc->refresh_golden_frame = vp8_read_bit(bc);
+#if CONFIG_ERROR_CONCEALMENT
+ /* Assume we shouldn't refresh golden if the bit is missing */
+ xd->corrupted |= vp8dx_bool_error(bc);
+ if (pbi->ec_active && xd->corrupted)
+ pc->refresh_golden_frame = 0;
+#endif
+
+ pc->refresh_alt_ref_frame = vp8_read_bit(bc);
+#if CONFIG_ERROR_CONCEALMENT
+ /* Assume we shouldn't refresh altref if the bit is missing */
+ xd->corrupted |= vp8dx_bool_error(bc);
+ if (pbi->ec_active && xd->corrupted)
+ pc->refresh_alt_ref_frame = 0;
+#endif
+
+ /* Buffer to buffer copy flags. */
+ pc->copy_buffer_to_gf = 0;
+
+ if (!pc->refresh_golden_frame)
+ pc->copy_buffer_to_gf = vp8_read_literal(bc, 2);
+
+#if CONFIG_ERROR_CONCEALMENT
+ /* Assume we shouldn't copy to the golden if the bit is missing */
+ xd->corrupted |= vp8dx_bool_error(bc);
+ if (pbi->ec_active && xd->corrupted)
+ pc->copy_buffer_to_gf = 0;
+#endif
+
+ pc->copy_buffer_to_arf = 0;
+
+ if (!pc->refresh_alt_ref_frame)
+ pc->copy_buffer_to_arf = vp8_read_literal(bc, 2);
+
+#if CONFIG_ERROR_CONCEALMENT
+ /* Assume we shouldn't copy to the alt-ref if the bit is missing */
+ xd->corrupted |= vp8dx_bool_error(bc);
+ if (pbi->ec_active && xd->corrupted)
+ pc->copy_buffer_to_arf = 0;
+#endif
+
+
+ pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp8_read_bit(bc);
+ pc->ref_frame_sign_bias[ALTREF_FRAME] = vp8_read_bit(bc);
+ }
+
+ pc->refresh_entropy_probs = vp8_read_bit(bc);
+#if CONFIG_ERROR_CONCEALMENT
+ /* Assume we shouldn't refresh the probabilities if the bit is
+ * missing */
+ xd->corrupted |= vp8dx_bool_error(bc);
+ if (pbi->ec_active && xd->corrupted)
+ pc->refresh_entropy_probs = 0;
+#endif
+ if (pc->refresh_entropy_probs == 0)
+ {
+ memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
+ }
+
+ pc->refresh_last_frame = pc->frame_type == KEY_FRAME || vp8_read_bit(bc);
+
+#if CONFIG_ERROR_CONCEALMENT
+ /* Assume we should refresh the last frame if the bit is missing */
+ xd->corrupted |= vp8dx_bool_error(bc);
+ if (pbi->ec_active && xd->corrupted)
+ pc->refresh_last_frame = 1;
+#endif
+
+ if (0)
+ {
+ FILE *z = fopen("decodestats.stt", "a");
+ fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n",
+ pc->current_video_frame,
+ pc->frame_type,
+ pc->refresh_golden_frame,
+ pc->refresh_alt_ref_frame,
+ pc->refresh_last_frame,
+ pc->base_qindex);
+ fclose(z);
+ }
+
+ {
+ pbi->independent_partitions = 1;
+
+ /* read coef probability tree */
+ for (i = 0; i < BLOCK_TYPES; i++)
+ for (j = 0; j < COEF_BANDS; j++)
+ for (k = 0; k < PREV_COEF_CONTEXTS; k++)
+ for (l = 0; l < ENTROPY_NODES; l++)
+ {
+
+ vp8_prob *const p = pc->fc.coef_probs [i][j][k] + l;
+
+ if (vp8_read(bc, vp8_coef_update_probs [i][j][k][l]))
+ {
+ *p = (vp8_prob)vp8_read_literal(bc, 8);
+
+ }
+ if (k > 0 && *p != pc->fc.coef_probs[i][j][k-1][l])
+ pbi->independent_partitions = 0;
+
+ }
+ }
+
+ /* clear out the coeff buffer */
+ memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+
+ vp8_decode_mode_mvs(pbi);
+
+#if CONFIG_ERROR_CONCEALMENT
+ if (pbi->ec_active &&
+ pbi->mvs_corrupt_from_mb < (unsigned int)pc->mb_cols * pc->mb_rows)
+ {
+ /* Motion vectors are missing in this frame. We will try to estimate
+ * them and then continue decoding the frame as usual */
+ vp8_estimate_missing_mvs(pbi);
+ }
+#endif
+
+ memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
+ pbi->frame_corrupt_residual = 0;
+
+#if CONFIG_MULTITHREAD
+ if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION)
+ {
+ unsigned int thread;
+ vp8mt_decode_mb_rows(pbi, xd);
+ vp8_yv12_extend_frame_borders(yv12_fb_new);
+ for (thread = 0; thread < pbi->decoding_thread_count; ++thread)
+ corrupt_tokens |= pbi->mb_row_di[thread].mbd.corrupted;
+ }
+ else
+#endif
+ {
+ decode_mb_rows(pbi);
+ corrupt_tokens |= xd->corrupted;
+ }
+
+ /* Collect information about decoder corruption. */
+ /* 1. Check first boolean decoder for errors. */
+ yv12_fb_new->corrupted = vp8dx_bool_error(bc);
+ /* 2. Check the macroblock information */
+ yv12_fb_new->corrupted |= corrupt_tokens;
+
+ if (!pbi->decoded_key_frame)
+ {
+ if (pc->frame_type == KEY_FRAME &&
+ !yv12_fb_new->corrupted)
+ pbi->decoded_key_frame = 1;
+ else
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME,
+ "A stream must start with a complete key frame");
+ }
+
+ /* vpx_log("Decoder: Frame Decoded, Size Roughly:%d bytes \n",bc->pos+pbi->bc2.pos); */
+
+ if (pc->refresh_entropy_probs == 0)
+ {
+ memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc));
+ pbi->independent_partitions = prev_independent_partitions;
+ }
+
+#ifdef PACKET_TESTING
+ {
+ FILE *f = fopen("decompressor.VP8", "ab");
+ unsigned int size = pbi->bc2.pos + pbi->bc.pos + 8;
+ fwrite((void *) &size, 4, 1, f);
+ fwrite((void *) pbi->Source, size, 1, f);
+ fclose(f);
+ }
+#endif
+
+ return 0;
+}
diff --git a/thirdparty/libvpx/vp8/decoder/decodemv.c b/thirdparty/libvpx/vp8/decoder/decodemv.c
new file mode 100644
index 0000000000..1d155e7e16
--- /dev/null
+++ b/thirdparty/libvpx/vp8/decoder/decodemv.c
@@ -0,0 +1,670 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "treereader.h"
+#include "vp8/common/entropymv.h"
+#include "vp8/common/entropymode.h"
+#include "onyxd_int.h"
+#include "vp8/common/findnearmv.h"
+
+#if CONFIG_DEBUG
+#include <assert.h>
+#endif
+static B_PREDICTION_MODE read_bmode(vp8_reader *bc, const vp8_prob *p)
+{
+ const int i = vp8_treed_read(bc, vp8_bmode_tree, p);
+
+ return (B_PREDICTION_MODE)i;
+}
+
+static MB_PREDICTION_MODE read_ymode(vp8_reader *bc, const vp8_prob *p)
+{
+ const int i = vp8_treed_read(bc, vp8_ymode_tree, p);
+
+ return (MB_PREDICTION_MODE)i;
+}
+
+static MB_PREDICTION_MODE read_kf_ymode(vp8_reader *bc, const vp8_prob *p)
+{
+ const int i = vp8_treed_read(bc, vp8_kf_ymode_tree, p);
+
+ return (MB_PREDICTION_MODE)i;
+}
+
+static MB_PREDICTION_MODE read_uv_mode(vp8_reader *bc, const vp8_prob *p)
+{
+ const int i = vp8_treed_read(bc, vp8_uv_mode_tree, p);
+
+ return (MB_PREDICTION_MODE)i;
+}
+
+static void read_kf_modes(VP8D_COMP *pbi, MODE_INFO *mi)
+{
+ vp8_reader *const bc = & pbi->mbc[8];
+ const int mis = pbi->common.mode_info_stride;
+
+ mi->mbmi.ref_frame = INTRA_FRAME;
+ mi->mbmi.mode = read_kf_ymode(bc, vp8_kf_ymode_prob);
+
+ if (mi->mbmi.mode == B_PRED)
+ {
+ int i = 0;
+ mi->mbmi.is_4x4 = 1;
+
+ do
+ {
+ const B_PREDICTION_MODE A = above_block_mode(mi, i, mis);
+ const B_PREDICTION_MODE L = left_block_mode(mi, i);
+
+ mi->bmi[i].as_mode =
+ read_bmode(bc, vp8_kf_bmode_prob [A] [L]);
+ }
+ while (++i < 16);
+ }
+
+ mi->mbmi.uv_mode = read_uv_mode(bc, vp8_kf_uv_mode_prob);
+}
+
+static int read_mvcomponent(vp8_reader *r, const MV_CONTEXT *mvc)
+{
+ const vp8_prob *const p = (const vp8_prob *) mvc;
+ int x = 0;
+
+ if (vp8_read(r, p [mvpis_short])) /* Large */
+ {
+ int i = 0;
+
+ do
+ {
+ x += vp8_read(r, p [MVPbits + i]) << i;
+ }
+ while (++i < 3);
+
+ i = mvlong_width - 1; /* Skip bit 3, which is sometimes implicit */
+
+ do
+ {
+ x += vp8_read(r, p [MVPbits + i]) << i;
+ }
+ while (--i > 3);
+
+ if (!(x & 0xFFF0) || vp8_read(r, p [MVPbits + 3]))
+ x += 8;
+ }
+ else /* small */
+ x = vp8_treed_read(r, vp8_small_mvtree, p + MVPshort);
+
+ if (x && vp8_read(r, p [MVPsign]))
+ x = -x;
+
+ return x;
+}
+
+static void read_mv(vp8_reader *r, MV *mv, const MV_CONTEXT *mvc)
+{
+ mv->row = (short)(read_mvcomponent(r, mvc) * 2);
+ mv->col = (short)(read_mvcomponent(r, ++mvc) * 2);
+}
+
+
+static void read_mvcontexts(vp8_reader *bc, MV_CONTEXT *mvc)
+{
+ int i = 0;
+
+ do
+ {
+ const vp8_prob *up = vp8_mv_update_probs[i].prob;
+ vp8_prob *p = (vp8_prob *)(mvc + i);
+ vp8_prob *const pstop = p + MVPcount;
+
+ do
+ {
+ if (vp8_read(bc, *up++))
+ {
+ const vp8_prob x = (vp8_prob)vp8_read_literal(bc, 7);
+
+ *p = x ? x << 1 : 1;
+ }
+ }
+ while (++p < pstop);
+ }
+ while (++i < 2);
+}
+
+static const unsigned char mbsplit_fill_count[4] = {8, 8, 4, 1};
+static const unsigned char mbsplit_fill_offset[4][16] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+ { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15},
+ { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15},
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
+};
+
+
+static void mb_mode_mv_init(VP8D_COMP *pbi)
+{
+ vp8_reader *const bc = & pbi->mbc[8];
+ MV_CONTEXT *const mvc = pbi->common.fc.mvc;
+
+#if CONFIG_ERROR_CONCEALMENT
+ /* Default is that no macroblock is corrupt, therefore we initialize
+ * mvs_corrupt_from_mb to something very big, which we can be sure is
+ * outside the frame. */
+ pbi->mvs_corrupt_from_mb = UINT_MAX;
+#endif
+ /* Read the mb_no_coeff_skip flag */
+ pbi->common.mb_no_coeff_skip = (int)vp8_read_bit(bc);
+
+ pbi->prob_skip_false = 0;
+ if (pbi->common.mb_no_coeff_skip)
+ pbi->prob_skip_false = (vp8_prob)vp8_read_literal(bc, 8);
+
+ if(pbi->common.frame_type != KEY_FRAME)
+ {
+ pbi->prob_intra = (vp8_prob)vp8_read_literal(bc, 8);
+ pbi->prob_last = (vp8_prob)vp8_read_literal(bc, 8);
+ pbi->prob_gf = (vp8_prob)vp8_read_literal(bc, 8);
+
+ if (vp8_read_bit(bc))
+ {
+ int i = 0;
+
+ do
+ {
+ pbi->common.fc.ymode_prob[i] =
+ (vp8_prob) vp8_read_literal(bc, 8);
+ }
+ while (++i < 4);
+ }
+
+ if (vp8_read_bit(bc))
+ {
+ int i = 0;
+
+ do
+ {
+ pbi->common.fc.uv_mode_prob[i] =
+ (vp8_prob) vp8_read_literal(bc, 8);
+ }
+ while (++i < 3);
+ }
+
+ read_mvcontexts(bc, mvc);
+ }
+}
+
+const vp8_prob vp8_sub_mv_ref_prob3 [8][VP8_SUBMVREFS-1] =
+{
+ { 147, 136, 18 }, /* SUBMVREF_NORMAL */
+ { 223, 1 , 34 }, /* SUBMVREF_LEFT_ABOVE_SAME */
+ { 106, 145, 1 }, /* SUBMVREF_LEFT_ZED */
+ { 208, 1 , 1 }, /* SUBMVREF_LEFT_ABOVE_ZED */
+ { 179, 121, 1 }, /* SUBMVREF_ABOVE_ZED */
+ { 223, 1 , 34 }, /* SUBMVREF_LEFT_ABOVE_SAME */
+ { 179, 121, 1 }, /* SUBMVREF_ABOVE_ZED */
+ { 208, 1 , 1 } /* SUBMVREF_LEFT_ABOVE_ZED */
+};
+
+static
+const vp8_prob * get_sub_mv_ref_prob(const int left, const int above)
+{
+ int lez = (left == 0);
+ int aez = (above == 0);
+ int lea = (left == above);
+ const vp8_prob * prob;
+
+ prob = vp8_sub_mv_ref_prob3[(aez << 2) |
+ (lez << 1) |
+ (lea)];
+
+ return prob;
+}
+
+static void decode_split_mv(vp8_reader *const bc, MODE_INFO *mi,
+ const MODE_INFO *left_mb, const MODE_INFO *above_mb,
+ MB_MODE_INFO *mbmi, int_mv best_mv,
+ MV_CONTEXT *const mvc, int mb_to_left_edge,
+ int mb_to_right_edge, int mb_to_top_edge,
+ int mb_to_bottom_edge)
+{
+ int s; /* split configuration (16x8, 8x16, 8x8, 4x4) */
+ int num_p; /* number of partitions in the split configuration
+ (see vp8_mbsplit_count) */
+ int j = 0;
+
+ s = 3;
+ num_p = 16;
+ if( vp8_read(bc, 110) )
+ {
+ s = 2;
+ num_p = 4;
+ if( vp8_read(bc, 111) )
+ {
+ s = vp8_read(bc, 150);
+ num_p = 2;
+ }
+ }
+
+ do /* for each subset j */
+ {
+ int_mv leftmv, abovemv;
+ int_mv blockmv;
+ int k; /* first block in subset j */
+
+ const vp8_prob *prob;
+ k = vp8_mbsplit_offset[s][j];
+
+ if (!(k & 3))
+ {
+ /* On L edge, get from MB to left of us */
+ if(left_mb->mbmi.mode != SPLITMV)
+ leftmv.as_int = left_mb->mbmi.mv.as_int;
+ else
+ leftmv.as_int = (left_mb->bmi + k + 4 - 1)->mv.as_int;
+ }
+ else
+ leftmv.as_int = (mi->bmi + k - 1)->mv.as_int;
+
+ if (!(k >> 2))
+ {
+ /* On top edge, get from MB above us */
+ if(above_mb->mbmi.mode != SPLITMV)
+ abovemv.as_int = above_mb->mbmi.mv.as_int;
+ else
+ abovemv.as_int = (above_mb->bmi + k + 16 - 4)->mv.as_int;
+ }
+ else
+ abovemv.as_int = (mi->bmi + k - 4)->mv.as_int;
+
+ prob = get_sub_mv_ref_prob(leftmv.as_int, abovemv.as_int);
+
+ if( vp8_read(bc, prob[0]) )
+ {
+ if( vp8_read(bc, prob[1]) )
+ {
+ blockmv.as_int = 0;
+ if( vp8_read(bc, prob[2]) )
+ {
+ blockmv.as_mv.row = read_mvcomponent(bc, &mvc[0]) * 2;
+ blockmv.as_mv.row += best_mv.as_mv.row;
+ blockmv.as_mv.col = read_mvcomponent(bc, &mvc[1]) * 2;
+ blockmv.as_mv.col += best_mv.as_mv.col;
+ }
+ }
+ else
+ {
+ blockmv.as_int = abovemv.as_int;
+ }
+ }
+ else
+ {
+ blockmv.as_int = leftmv.as_int;
+ }
+
+ mbmi->need_to_clamp_mvs |= vp8_check_mv_bounds(&blockmv,
+ mb_to_left_edge,
+ mb_to_right_edge,
+ mb_to_top_edge,
+ mb_to_bottom_edge);
+
+ {
+ /* Fill (uniform) modes, mvs of jth subset.
+ Must do it here because ensuing subsets can
+ refer back to us via "left" or "above". */
+ const unsigned char *fill_offset;
+ unsigned int fill_count = mbsplit_fill_count[s];
+
+ fill_offset = &mbsplit_fill_offset[s]
+ [(unsigned char)j * mbsplit_fill_count[s]];
+
+ do {
+ mi->bmi[ *fill_offset].mv.as_int = blockmv.as_int;
+ fill_offset++;
+ }while (--fill_count);
+ }
+
+ }
+ while (++j < num_p);
+
+ mbmi->partitioning = s;
+}
+
+static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi)
+{
+ vp8_reader *const bc = & pbi->mbc[8];
+ mbmi->ref_frame = (MV_REFERENCE_FRAME) vp8_read(bc, pbi->prob_intra);
+ if (mbmi->ref_frame) /* inter MB */
+ {
+ enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV};
+ int cnt[4];
+ int *cntx = cnt;
+ int_mv near_mvs[4];
+ int_mv *nmv = near_mvs;
+ const int mis = pbi->mb.mode_info_stride;
+ const MODE_INFO *above = mi - mis;
+ const MODE_INFO *left = mi - 1;
+ const MODE_INFO *aboveleft = above - 1;
+ int *ref_frame_sign_bias = pbi->common.ref_frame_sign_bias;
+
+ mbmi->need_to_clamp_mvs = 0;
+
+ if (vp8_read(bc, pbi->prob_last))
+ {
+ mbmi->ref_frame =
+ (MV_REFERENCE_FRAME)((int)(2 + vp8_read(bc, pbi->prob_gf)));
+ }
+
+ /* Zero accumulators */
+ nmv[0].as_int = nmv[1].as_int = nmv[2].as_int = 0;
+ cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0;
+
+ /* Process above */
+ if (above->mbmi.ref_frame != INTRA_FRAME)
+ {
+ if (above->mbmi.mv.as_int)
+ {
+ (++nmv)->as_int = above->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame],
+ mbmi->ref_frame, nmv, ref_frame_sign_bias);
+ ++cntx;
+ }
+
+ *cntx += 2;
+ }
+
+ /* Process left */
+ if (left->mbmi.ref_frame != INTRA_FRAME)
+ {
+ if (left->mbmi.mv.as_int)
+ {
+ int_mv this_mv;
+
+ this_mv.as_int = left->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame],
+ mbmi->ref_frame, &this_mv, ref_frame_sign_bias);
+
+ if (this_mv.as_int != nmv->as_int)
+ {
+ (++nmv)->as_int = this_mv.as_int;
+ ++cntx;
+ }
+
+ *cntx += 2;
+ }
+ else
+ cnt[CNT_INTRA] += 2;
+ }
+
+ /* Process above left */
+ if (aboveleft->mbmi.ref_frame != INTRA_FRAME)
+ {
+ if (aboveleft->mbmi.mv.as_int)
+ {
+ int_mv this_mv;
+
+ this_mv.as_int = aboveleft->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame],
+ mbmi->ref_frame, &this_mv, ref_frame_sign_bias);
+
+ if (this_mv.as_int != nmv->as_int)
+ {
+ (++nmv)->as_int = this_mv.as_int;
+ ++cntx;
+ }
+
+ *cntx += 1;
+ }
+ else
+ cnt[CNT_INTRA] += 1;
+ }
+
+ if( vp8_read(bc, vp8_mode_contexts [cnt[CNT_INTRA]] [0]) )
+ {
+
+ /* If we have three distinct MV's ... */
+ /* See if above-left MV can be merged with NEAREST */
+ cnt[CNT_NEAREST] += ( (cnt[CNT_SPLITMV] > 0) &
+ (nmv->as_int == near_mvs[CNT_NEAREST].as_int));
+
+ /* Swap near and nearest if necessary */
+ if (cnt[CNT_NEAR] > cnt[CNT_NEAREST])
+ {
+ int tmp;
+ tmp = cnt[CNT_NEAREST];
+ cnt[CNT_NEAREST] = cnt[CNT_NEAR];
+ cnt[CNT_NEAR] = tmp;
+ tmp = near_mvs[CNT_NEAREST].as_int;
+ near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
+ near_mvs[CNT_NEAR].as_int = tmp;
+ }
+
+ if( vp8_read(bc, vp8_mode_contexts [cnt[CNT_NEAREST]] [1]) )
+ {
+
+ if( vp8_read(bc, vp8_mode_contexts [cnt[CNT_NEAR]] [2]) )
+ {
+ int mb_to_top_edge;
+ int mb_to_bottom_edge;
+ int mb_to_left_edge;
+ int mb_to_right_edge;
+ MV_CONTEXT *const mvc = pbi->common.fc.mvc;
+ int near_index;
+
+ mb_to_top_edge = pbi->mb.mb_to_top_edge;
+ mb_to_bottom_edge = pbi->mb.mb_to_bottom_edge;
+ mb_to_top_edge -= LEFT_TOP_MARGIN;
+ mb_to_bottom_edge += RIGHT_BOTTOM_MARGIN;
+ mb_to_right_edge = pbi->mb.mb_to_right_edge;
+ mb_to_right_edge += RIGHT_BOTTOM_MARGIN;
+ mb_to_left_edge = pbi->mb.mb_to_left_edge;
+ mb_to_left_edge -= LEFT_TOP_MARGIN;
+
+ /* Use near_mvs[0] to store the "best" MV */
+ near_index = CNT_INTRA +
+ (cnt[CNT_NEAREST] >= cnt[CNT_INTRA]);
+
+ vp8_clamp_mv2(&near_mvs[near_index], &pbi->mb);
+
+ cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV)
+ + (left->mbmi.mode == SPLITMV)) * 2
+ + (aboveleft->mbmi.mode == SPLITMV);
+
+ if( vp8_read(bc, vp8_mode_contexts [cnt[CNT_SPLITMV]] [3]) )
+ {
+ decode_split_mv(bc, mi, left, above,
+ mbmi,
+ near_mvs[near_index],
+ mvc, mb_to_left_edge,
+ mb_to_right_edge,
+ mb_to_top_edge,
+ mb_to_bottom_edge);
+ mbmi->mv.as_int = mi->bmi[15].mv.as_int;
+ mbmi->mode = SPLITMV;
+ mbmi->is_4x4 = 1;
+ }
+ else
+ {
+ int_mv *const mbmi_mv = & mbmi->mv;
+ read_mv(bc, &mbmi_mv->as_mv, (const MV_CONTEXT *) mvc);
+ mbmi_mv->as_mv.row += near_mvs[near_index].as_mv.row;
+ mbmi_mv->as_mv.col += near_mvs[near_index].as_mv.col;
+
+ /* Don't need to check this on NEARMV and NEARESTMV
+ * modes since those modes clamp the MV. The NEWMV mode
+ * does not, so signal to the prediction stage whether
+ * special handling may be required.
+ */
+ mbmi->need_to_clamp_mvs =
+ vp8_check_mv_bounds(mbmi_mv, mb_to_left_edge,
+ mb_to_right_edge,
+ mb_to_top_edge,
+ mb_to_bottom_edge);
+ mbmi->mode = NEWMV;
+ }
+ }
+ else
+ {
+ mbmi->mode = NEARMV;
+ mbmi->mv.as_int = near_mvs[CNT_NEAR].as_int;
+ vp8_clamp_mv2(&mbmi->mv, &pbi->mb);
+ }
+ }
+ else
+ {
+ mbmi->mode = NEARESTMV;
+ mbmi->mv.as_int = near_mvs[CNT_NEAREST].as_int;
+ vp8_clamp_mv2(&mbmi->mv, &pbi->mb);
+ }
+ }
+ else
+ {
+ mbmi->mode = ZEROMV;
+ mbmi->mv.as_int = 0;
+ }
+
+#if CONFIG_ERROR_CONCEALMENT
+ if(pbi->ec_enabled && (mbmi->mode != SPLITMV))
+ {
+ mi->bmi[ 0].mv.as_int =
+ mi->bmi[ 1].mv.as_int =
+ mi->bmi[ 2].mv.as_int =
+ mi->bmi[ 3].mv.as_int =
+ mi->bmi[ 4].mv.as_int =
+ mi->bmi[ 5].mv.as_int =
+ mi->bmi[ 6].mv.as_int =
+ mi->bmi[ 7].mv.as_int =
+ mi->bmi[ 8].mv.as_int =
+ mi->bmi[ 9].mv.as_int =
+ mi->bmi[10].mv.as_int =
+ mi->bmi[11].mv.as_int =
+ mi->bmi[12].mv.as_int =
+ mi->bmi[13].mv.as_int =
+ mi->bmi[14].mv.as_int =
+ mi->bmi[15].mv.as_int = mbmi->mv.as_int;
+ }
+#endif
+ }
+ else
+ {
+ /* required for left and above block mv */
+ mbmi->mv.as_int = 0;
+
+ /* MB is intra coded */
+ if ((mbmi->mode = read_ymode(bc, pbi->common.fc.ymode_prob)) == B_PRED)
+ {
+ int j = 0;
+ mbmi->is_4x4 = 1;
+ do
+ {
+ mi->bmi[j].as_mode = read_bmode(bc, pbi->common.fc.bmode_prob);
+ }
+ while (++j < 16);
+ }
+
+ mbmi->uv_mode = read_uv_mode(bc, pbi->common.fc.uv_mode_prob);
+ }
+
+}
+
+static void read_mb_features(vp8_reader *r, MB_MODE_INFO *mi, MACROBLOCKD *x)
+{
+ /* Is segmentation enabled */
+ if (x->segmentation_enabled && x->update_mb_segmentation_map)
+ {
+ /* If so then read the segment id. */
+ if (vp8_read(r, x->mb_segment_tree_probs[0]))
+ mi->segment_id =
+ (unsigned char)(2 + vp8_read(r, x->mb_segment_tree_probs[2]));
+ else
+ mi->segment_id =
+ (unsigned char)(vp8_read(r, x->mb_segment_tree_probs[1]));
+ }
+}
+
+static void decode_mb_mode_mvs(VP8D_COMP *pbi, MODE_INFO *mi,
+ MB_MODE_INFO *mbmi)
+{
+ (void)mbmi;
+
+ /* Read the Macroblock segmentation map if it is being updated explicitly
+ * this frame (reset to 0 above by default)
+ * By default on a key frame reset all MBs to segment 0
+ */
+ if (pbi->mb.update_mb_segmentation_map)
+ read_mb_features(&pbi->mbc[8], &mi->mbmi, &pbi->mb);
+ else if(pbi->common.frame_type == KEY_FRAME)
+ mi->mbmi.segment_id = 0;
+
+ /* Read the macroblock coeff skip flag if this feature is in use,
+ * else default to 0 */
+ if (pbi->common.mb_no_coeff_skip)
+ mi->mbmi.mb_skip_coeff = vp8_read(&pbi->mbc[8], pbi->prob_skip_false);
+ else
+ mi->mbmi.mb_skip_coeff = 0;
+
+ mi->mbmi.is_4x4 = 0;
+ if(pbi->common.frame_type == KEY_FRAME)
+ read_kf_modes(pbi, mi);
+ else
+ read_mb_modes_mv(pbi, mi, &mi->mbmi);
+
+}
+
+void vp8_decode_mode_mvs(VP8D_COMP *pbi)
+{
+ MODE_INFO *mi = pbi->common.mi;
+ int mb_row = -1;
+ int mb_to_right_edge_start;
+
+ mb_mode_mv_init(pbi);
+
+ pbi->mb.mb_to_top_edge = 0;
+ pbi->mb.mb_to_bottom_edge = ((pbi->common.mb_rows - 1) * 16) << 3;
+ mb_to_right_edge_start = ((pbi->common.mb_cols - 1) * 16) << 3;
+
+ while (++mb_row < pbi->common.mb_rows)
+ {
+ int mb_col = -1;
+
+ pbi->mb.mb_to_left_edge = 0;
+ pbi->mb.mb_to_right_edge = mb_to_right_edge_start;
+
+ while (++mb_col < pbi->common.mb_cols)
+ {
+#if CONFIG_ERROR_CONCEALMENT
+ int mb_num = mb_row * pbi->common.mb_cols + mb_col;
+#endif
+
+ decode_mb_mode_mvs(pbi, mi, &mi->mbmi);
+
+#if CONFIG_ERROR_CONCEALMENT
+ /* look for corruption. set mvs_corrupt_from_mb to the current
+ * mb_num if the frame is corrupt from this macroblock. */
+ if (vp8dx_bool_error(&pbi->mbc[8]) && mb_num <
+ (int)pbi->mvs_corrupt_from_mb)
+ {
+ pbi->mvs_corrupt_from_mb = mb_num;
+ /* no need to continue since the partition is corrupt from
+ * here on.
+ */
+ return;
+ }
+#endif
+
+ pbi->mb.mb_to_left_edge -= (16 << 3);
+ pbi->mb.mb_to_right_edge -= (16 << 3);
+ mi++; /* next macroblock */
+ }
+ pbi->mb.mb_to_top_edge -= (16 << 3);
+ pbi->mb.mb_to_bottom_edge -= (16 << 3);
+
+ mi++; /* skip left predictor each row */
+ }
+}
diff --git a/thirdparty/libvpx/vp8/decoder/decodemv.h b/thirdparty/libvpx/vp8/decoder/decodemv.h
new file mode 100644
index 0000000000..f33b07351d
--- /dev/null
+++ b/thirdparty/libvpx/vp8/decoder/decodemv.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_DECODER_DECODEMV_H_
+#define VP8_DECODER_DECODEMV_H_
+
+#include "onyxd_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_decode_mode_mvs(VP8D_COMP *);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_DECODER_DECODEMV_H_
diff --git a/thirdparty/libvpx/vp8/decoder/decoderthreading.h b/thirdparty/libvpx/vp8/decoder/decoderthreading.h
new file mode 100644
index 0000000000..c563cf6e93
--- /dev/null
+++ b/thirdparty/libvpx/vp8/decoder/decoderthreading.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_DECODER_DECODERTHREADING_H_
+#define VP8_DECODER_DECODERTHREADING_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_MULTITHREAD
+void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd);
+void vp8_decoder_remove_threads(VP8D_COMP *pbi);
+void vp8_decoder_create_threads(VP8D_COMP *pbi);
+void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows);
+void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows);
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_DECODER_DECODERTHREADING_H_
diff --git a/thirdparty/libvpx/vp8/decoder/detokenize.c b/thirdparty/libvpx/vp8/decoder/detokenize.c
new file mode 100644
index 0000000000..fcc7533c50
--- /dev/null
+++ b/thirdparty/libvpx/vp8/decoder/detokenize.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/blockd.h"
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "detokenize.h"
+
+void vp8_reset_mb_tokens_context(MACROBLOCKD *x)
+{
+ ENTROPY_CONTEXT *a_ctx = ((ENTROPY_CONTEXT *)x->above_context);
+ ENTROPY_CONTEXT *l_ctx = ((ENTROPY_CONTEXT *)x->left_context);
+
+ memset(a_ctx, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
+ memset(l_ctx, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
+
+ /* Clear entropy contexts for Y2 blocks */
+ if (!x->mode_info_context->mbmi.is_4x4)
+ {
+ a_ctx[8] = l_ctx[8] = 0;
+ }
+}
+
+/*
+ ------------------------------------------------------------------------------
+ Residual decoding (Paragraph 13.2 / 13.3)
+*/
+static const uint8_t kBands[16 + 1] = {
+ 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
+ 0 /* extra entry as sentinel */
+};
+
+static const uint8_t kCat3[] = { 173, 148, 140, 0 };
+static const uint8_t kCat4[] = { 176, 155, 140, 135, 0 };
+static const uint8_t kCat5[] = { 180, 157, 141, 134, 130, 0 };
+static const uint8_t kCat6[] =
+ { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
+static const uint8_t* const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 };
+static const uint8_t kZigzag[16] = {
+ 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
+};
+
+#define VP8GetBit vp8dx_decode_bool
+#define NUM_PROBAS 11
+#define NUM_CTX 3
+
+/* for const-casting */
+typedef const uint8_t (*ProbaArray)[NUM_CTX][NUM_PROBAS];
+
+static int GetSigned(BOOL_DECODER *br, int value_to_sign)
+{
+ int split = (br->range + 1) >> 1;
+ VP8_BD_VALUE bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8);
+ int v;
+
+ if(br->count < 0)
+ vp8dx_bool_decoder_fill(br);
+
+ if ( br->value < bigsplit )
+ {
+ br->range = split;
+ v= value_to_sign;
+ }
+ else
+ {
+ br->range = br->range-split;
+ br->value = br->value-bigsplit;
+ v = -value_to_sign;
+ }
+ br->range +=br->range;
+ br->value +=br->value;
+ br->count--;
+
+ return v;
+}
+/*
+ Returns the position of the last non-zero coeff plus one
+ (and 0 if there's no coeff at all)
+*/
+static int GetCoeffs(BOOL_DECODER *br, ProbaArray prob,
+ int ctx, int n, int16_t* out)
+{
+ const uint8_t* p = prob[n][ctx];
+ if (!VP8GetBit(br, p[0]))
+ { /* first EOB is more a 'CBP' bit. */
+ return 0;
+ }
+ while (1)
+ {
+ ++n;
+ if (!VP8GetBit(br, p[1]))
+ {
+ p = prob[kBands[n]][0];
+ }
+ else
+ { /* non zero coeff */
+ int v, j;
+ if (!VP8GetBit(br, p[2]))
+ {
+ p = prob[kBands[n]][1];
+ v = 1;
+ }
+ else
+ {
+ if (!VP8GetBit(br, p[3]))
+ {
+ if (!VP8GetBit(br, p[4]))
+ {
+ v = 2;
+ }
+ else
+ {
+ v = 3 + VP8GetBit(br, p[5]);
+ }
+ }
+ else
+ {
+ if (!VP8GetBit(br, p[6]))
+ {
+ if (!VP8GetBit(br, p[7]))
+ {
+ v = 5 + VP8GetBit(br, 159);
+ } else
+ {
+ v = 7 + 2 * VP8GetBit(br, 165);
+ v += VP8GetBit(br, 145);
+ }
+ }
+ else
+ {
+ const uint8_t* tab;
+ const int bit1 = VP8GetBit(br, p[8]);
+ const int bit0 = VP8GetBit(br, p[9 + bit1]);
+ const int cat = 2 * bit1 + bit0;
+ v = 0;
+ for (tab = kCat3456[cat]; *tab; ++tab)
+ {
+ v += v + VP8GetBit(br, *tab);
+ }
+ v += 3 + (8 << cat);
+ }
+ }
+ p = prob[kBands[n]][2];
+ }
+ j = kZigzag[n - 1];
+
+ out[j] = GetSigned(br, v);
+
+ if (n == 16 || !VP8GetBit(br, p[0]))
+ { /* EOB */
+ return n;
+ }
+ }
+ if (n == 16)
+ {
+ return 16;
+ }
+ }
+}
+
+int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
+{
+ BOOL_DECODER *bc = x->current_bc;
+ const FRAME_CONTEXT * const fc = &dx->common.fc;
+ char *eobs = x->eobs;
+
+ int i;
+ int nonzeros;
+ int eobtotal = 0;
+
+ short *qcoeff_ptr;
+ ProbaArray coef_probs;
+ ENTROPY_CONTEXT *a_ctx = ((ENTROPY_CONTEXT *)x->above_context);
+ ENTROPY_CONTEXT *l_ctx = ((ENTROPY_CONTEXT *)x->left_context);
+ ENTROPY_CONTEXT *a;
+ ENTROPY_CONTEXT *l;
+ int skip_dc = 0;
+
+ qcoeff_ptr = &x->qcoeff[0];
+
+ if (!x->mode_info_context->mbmi.is_4x4)
+ {
+ a = a_ctx + 8;
+ l = l_ctx + 8;
+
+ coef_probs = fc->coef_probs [1];
+
+ nonzeros = GetCoeffs(bc, coef_probs, (*a + *l), 0, qcoeff_ptr + 24 * 16);
+ *a = *l = (nonzeros > 0);
+
+ eobs[24] = nonzeros;
+ eobtotal += nonzeros - 16;
+
+ coef_probs = fc->coef_probs [0];
+ skip_dc = 1;
+ }
+ else
+ {
+ coef_probs = fc->coef_probs [3];
+ skip_dc = 0;
+ }
+
+ for (i = 0; i < 16; ++i)
+ {
+ a = a_ctx + (i&3);
+ l = l_ctx + ((i&0xc)>>2);
+
+ nonzeros = GetCoeffs(bc, coef_probs, (*a + *l), skip_dc, qcoeff_ptr);
+ *a = *l = (nonzeros > 0);
+
+ nonzeros += skip_dc;
+ eobs[i] = nonzeros;
+ eobtotal += nonzeros;
+ qcoeff_ptr += 16;
+ }
+
+ coef_probs = fc->coef_probs [2];
+
+ a_ctx += 4;
+ l_ctx += 4;
+ for (i = 16; i < 24; ++i)
+ {
+ a = a_ctx + ((i > 19)<<1) + (i&1);
+ l = l_ctx + ((i > 19)<<1) + ((i&3)>1);
+
+ nonzeros = GetCoeffs(bc, coef_probs, (*a + *l), 0, qcoeff_ptr);
+ *a = *l = (nonzeros > 0);
+
+ eobs[i] = nonzeros;
+ eobtotal += nonzeros;
+ qcoeff_ptr += 16;
+ }
+
+ return eobtotal;
+}
+
diff --git a/thirdparty/libvpx/vp8/decoder/detokenize.h b/thirdparty/libvpx/vp8/decoder/detokenize.h
new file mode 100644
index 0000000000..f0b125444f
--- /dev/null
+++ b/thirdparty/libvpx/vp8/decoder/detokenize.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_DECODER_DETOKENIZE_H_
+#define VP8_DECODER_DETOKENIZE_H_
+
+#include "onyxd_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_reset_mb_tokens_context(MACROBLOCKD *x);
+int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_DECODER_DETOKENIZE_H_
diff --git a/thirdparty/libvpx/vp8/decoder/onyxd_if.c b/thirdparty/libvpx/vp8/decoder/onyxd_if.c
new file mode 100644
index 0000000000..3468268a2a
--- /dev/null
+++ b/thirdparty/libvpx/vp8/decoder/onyxd_if.c
@@ -0,0 +1,521 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/onyxc_int.h"
+#if CONFIG_POSTPROC
+#include "vp8/common/postproc.h"
+#endif
+#include "vp8/common/onyxd.h"
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/alloccommon.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/swapyv12buffer.h"
+#include "vp8/common/threading.h"
+#include "decoderthreading.h"
+#include <stdio.h>
+#include <assert.h>
+
+#include "vp8/common/quant_common.h"
+#include "vp8/common/reconintra.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+#include "vpx_scale/vpx_scale.h"
+#include "vp8/common/systemdependent.h"
+#include "vpx_ports/vpx_once.h"
+#include "vpx_ports/vpx_timer.h"
+#include "detokenize.h"
+#if CONFIG_ERROR_CONCEALMENT
+#include "error_concealment.h"
+#endif
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
+
+extern void vp8_init_loop_filter(VP8_COMMON *cm);
+extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
+static int get_free_fb (VP8_COMMON *cm);
+static void ref_cnt_fb (int *buf, int *idx, int new_idx);
+
+static void initialize_dec(void) {
+ static volatile int init_done = 0;
+
+ if (!init_done)
+ {
+ vpx_dsp_rtcd();
+ vp8_init_intra_predictors();
+ init_done = 1;
+ }
+}
+
+static void remove_decompressor(VP8D_COMP *pbi)
+{
+#if CONFIG_ERROR_CONCEALMENT
+ vp8_de_alloc_overlap_lists(pbi);
+#endif
+ vp8_remove_common(&pbi->common);
+ vpx_free(pbi);
+}
+
+static struct VP8D_COMP * create_decompressor(VP8D_CONFIG *oxcf)
+{
+ VP8D_COMP *pbi = vpx_memalign(32, sizeof(VP8D_COMP));
+
+ if (!pbi)
+ return NULL;
+
+ memset(pbi, 0, sizeof(VP8D_COMP));
+
+ if (setjmp(pbi->common.error.jmp))
+ {
+ pbi->common.error.setjmp = 0;
+ remove_decompressor(pbi);
+ return 0;
+ }
+
+ pbi->common.error.setjmp = 1;
+
+ vp8_create_common(&pbi->common);
+
+ pbi->common.current_video_frame = 0;
+ pbi->ready_for_new_data = 1;
+
+ /* vp8cx_init_de_quantizer() is first called here. Add check in frame_init_dequantizer() to avoid
+ * unnecessary calling of vp8cx_init_de_quantizer() for every frame.
+ */
+ vp8cx_init_de_quantizer(pbi);
+
+ vp8_loop_filter_init(&pbi->common);
+
+ pbi->common.error.setjmp = 0;
+
+#if CONFIG_ERROR_CONCEALMENT
+ pbi->ec_enabled = oxcf->error_concealment;
+ pbi->overlaps = NULL;
+#else
+ (void)oxcf;
+ pbi->ec_enabled = 0;
+#endif
+ /* Error concealment is activated after a key frame has been
+ * decoded without errors when error concealment is enabled.
+ */
+ pbi->ec_active = 0;
+
+ pbi->decoded_key_frame = 0;
+
+ /* Independent partitions is activated when a frame updates the
+ * token probability table to have equal probabilities over the
+ * PREV_COEF context.
+ */
+ pbi->independent_partitions = 0;
+
+ vp8_setup_block_dptrs(&pbi->mb);
+
+ once(initialize_dec);
+
+ return pbi;
+}
+
+vpx_codec_err_t vp8dx_get_reference(VP8D_COMP *pbi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+{
+ VP8_COMMON *cm = &pbi->common;
+ int ref_fb_idx;
+
+ if (ref_frame_flag == VP8_LAST_FRAME)
+ ref_fb_idx = cm->lst_fb_idx;
+ else if (ref_frame_flag == VP8_GOLD_FRAME)
+ ref_fb_idx = cm->gld_fb_idx;
+ else if (ref_frame_flag == VP8_ALTR_FRAME)
+ ref_fb_idx = cm->alt_fb_idx;
+ else{
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+ "Invalid reference frame");
+ return pbi->common.error.error_code;
+ }
+
+ if(cm->yv12_fb[ref_fb_idx].y_height != sd->y_height ||
+ cm->yv12_fb[ref_fb_idx].y_width != sd->y_width ||
+ cm->yv12_fb[ref_fb_idx].uv_height != sd->uv_height ||
+ cm->yv12_fb[ref_fb_idx].uv_width != sd->uv_width){
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+ "Incorrect buffer dimensions");
+ }
+ else
+ vp8_yv12_copy_frame(&cm->yv12_fb[ref_fb_idx], sd);
+
+ return pbi->common.error.error_code;
+}
+
+
+vpx_codec_err_t vp8dx_set_reference(VP8D_COMP *pbi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+{
+ VP8_COMMON *cm = &pbi->common;
+ int *ref_fb_ptr = NULL;
+ int free_fb;
+
+ if (ref_frame_flag == VP8_LAST_FRAME)
+ ref_fb_ptr = &cm->lst_fb_idx;
+ else if (ref_frame_flag == VP8_GOLD_FRAME)
+ ref_fb_ptr = &cm->gld_fb_idx;
+ else if (ref_frame_flag == VP8_ALTR_FRAME)
+ ref_fb_ptr = &cm->alt_fb_idx;
+ else{
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+ "Invalid reference frame");
+ return pbi->common.error.error_code;
+ }
+
+ if(cm->yv12_fb[*ref_fb_ptr].y_height != sd->y_height ||
+ cm->yv12_fb[*ref_fb_ptr].y_width != sd->y_width ||
+ cm->yv12_fb[*ref_fb_ptr].uv_height != sd->uv_height ||
+ cm->yv12_fb[*ref_fb_ptr].uv_width != sd->uv_width){
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+ "Incorrect buffer dimensions");
+ }
+ else{
+ /* Find an empty frame buffer. */
+ free_fb = get_free_fb(cm);
+ /* Decrease fb_idx_ref_cnt since it will be increased again in
+ * ref_cnt_fb() below. */
+ cm->fb_idx_ref_cnt[free_fb]--;
+
+ /* Manage the reference counters and copy image. */
+ ref_cnt_fb (cm->fb_idx_ref_cnt, ref_fb_ptr, free_fb);
+ vp8_yv12_copy_frame(sd, &cm->yv12_fb[*ref_fb_ptr]);
+ }
+
+ return pbi->common.error.error_code;
+}
+
+static int get_free_fb (VP8_COMMON *cm)
+{
+ int i;
+ for (i = 0; i < NUM_YV12_BUFFERS; i++)
+ if (cm->fb_idx_ref_cnt[i] == 0)
+ break;
+
+ assert(i < NUM_YV12_BUFFERS);
+ cm->fb_idx_ref_cnt[i] = 1;
+ return i;
+}
+
+static void ref_cnt_fb (int *buf, int *idx, int new_idx)
+{
+ if (buf[*idx] > 0)
+ buf[*idx]--;
+
+ *idx = new_idx;
+
+ buf[new_idx]++;
+}
+
+/* If any buffer copy / swapping is signalled it should be done here. */
+static int swap_frame_buffers (VP8_COMMON *cm)
+{
+ int err = 0;
+
+ /* The alternate reference frame or golden frame can be updated
+ * using the new, last, or golden/alt ref frame. If it
+ * is updated using the newly decoded frame it is a refresh.
+ * An update using the last or golden/alt ref frame is a copy.
+ */
+ if (cm->copy_buffer_to_arf)
+ {
+ int new_fb = 0;
+
+ if (cm->copy_buffer_to_arf == 1)
+ new_fb = cm->lst_fb_idx;
+ else if (cm->copy_buffer_to_arf == 2)
+ new_fb = cm->gld_fb_idx;
+ else
+ err = -1;
+
+ ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->alt_fb_idx, new_fb);
+ }
+
+ if (cm->copy_buffer_to_gf)
+ {
+ int new_fb = 0;
+
+ if (cm->copy_buffer_to_gf == 1)
+ new_fb = cm->lst_fb_idx;
+ else if (cm->copy_buffer_to_gf == 2)
+ new_fb = cm->alt_fb_idx;
+ else
+ err = -1;
+
+ ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->gld_fb_idx, new_fb);
+ }
+
+ if (cm->refresh_golden_frame)
+ ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->gld_fb_idx, cm->new_fb_idx);
+
+ if (cm->refresh_alt_ref_frame)
+ ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->alt_fb_idx, cm->new_fb_idx);
+
+ if (cm->refresh_last_frame)
+ {
+ ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->lst_fb_idx, cm->new_fb_idx);
+
+ cm->frame_to_show = &cm->yv12_fb[cm->lst_fb_idx];
+ }
+ else
+ cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
+
+ cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+
+ return err;
+}
+
+static int check_fragments_for_errors(VP8D_COMP *pbi)
+{
+ if (!pbi->ec_active &&
+ pbi->fragments.count <= 1 && pbi->fragments.sizes[0] == 0)
+ {
+ VP8_COMMON *cm = &pbi->common;
+
+ /* If error concealment is disabled we won't signal missing frames
+ * to the decoder.
+ */
+ if (cm->fb_idx_ref_cnt[cm->lst_fb_idx] > 1)
+ {
+ /* The last reference shares buffer with another reference
+ * buffer. Move it to its own buffer before setting it as
+ * corrupt, otherwise we will make multiple buffers corrupt.
+ */
+ const int prev_idx = cm->lst_fb_idx;
+ cm->fb_idx_ref_cnt[prev_idx]--;
+ cm->lst_fb_idx = get_free_fb(cm);
+ vp8_yv12_copy_frame(&cm->yv12_fb[prev_idx],
+ &cm->yv12_fb[cm->lst_fb_idx]);
+ }
+ /* This is used to signal that we are missing frames.
+ * We do not know if the missing frame(s) was supposed to update
+ * any of the reference buffers, but we act conservative and
+ * mark only the last buffer as corrupted.
+ */
+ cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
+
+ /* Signal that we have no frame to show. */
+ cm->show_frame = 0;
+
+ /* Nothing more to do. */
+ return 0;
+ }
+
+ return 1;
+}
+
+int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
+ const uint8_t *source,
+ int64_t time_stamp)
+{
+ VP8_COMMON *cm = &pbi->common;
+ int retcode = -1;
+ (void)size;
+ (void)source;
+
+ pbi->common.error.error_code = VPX_CODEC_OK;
+
+ retcode = check_fragments_for_errors(pbi);
+ if(retcode <= 0)
+ return retcode;
+
+ cm->new_fb_idx = get_free_fb (cm);
+
+ /* setup reference frames for vp8_decode_frame */
+ pbi->dec_fb_ref[INTRA_FRAME] = &cm->yv12_fb[cm->new_fb_idx];
+ pbi->dec_fb_ref[LAST_FRAME] = &cm->yv12_fb[cm->lst_fb_idx];
+ pbi->dec_fb_ref[GOLDEN_FRAME] = &cm->yv12_fb[cm->gld_fb_idx];
+ pbi->dec_fb_ref[ALTREF_FRAME] = &cm->yv12_fb[cm->alt_fb_idx];
+
+ if (setjmp(pbi->common.error.jmp))
+ {
+ /* We do not know if the missing frame(s) was supposed to update
+ * any of the reference buffers, but we act conservative and
+ * mark only the last buffer as corrupted.
+ */
+ cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
+
+ if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
+ cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+
+ goto decode_exit;
+ }
+
+ pbi->common.error.setjmp = 1;
+
+ retcode = vp8_decode_frame(pbi);
+
+ if (retcode < 0)
+ {
+ if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
+ cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+
+ pbi->common.error.error_code = VPX_CODEC_ERROR;
+ goto decode_exit;
+ }
+
+ if (swap_frame_buffers (cm))
+ {
+ pbi->common.error.error_code = VPX_CODEC_ERROR;
+ goto decode_exit;
+ }
+
+ vp8_clear_system_state();
+
+ if (cm->show_frame)
+ {
+ cm->current_video_frame++;
+ cm->show_frame_mi = cm->mi;
+ }
+
+ #if CONFIG_ERROR_CONCEALMENT
+ /* swap the mode infos to storage for future error concealment */
+ if (pbi->ec_enabled && pbi->common.prev_mi)
+ {
+ MODE_INFO* tmp = pbi->common.prev_mi;
+ int row, col;
+ pbi->common.prev_mi = pbi->common.mi;
+ pbi->common.mi = tmp;
+
+ /* Propagate the segment_ids to the next frame */
+ for (row = 0; row < pbi->common.mb_rows; ++row)
+ {
+ for (col = 0; col < pbi->common.mb_cols; ++col)
+ {
+ const int i = row*pbi->common.mode_info_stride + col;
+ pbi->common.mi[i].mbmi.segment_id =
+ pbi->common.prev_mi[i].mbmi.segment_id;
+ }
+ }
+ }
+#endif
+
+ pbi->ready_for_new_data = 0;
+ pbi->last_time_stamp = time_stamp;
+
+decode_exit:
+ pbi->common.error.setjmp = 0;
+ vp8_clear_system_state();
+ return retcode;
+}
+int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags)
+{
+ int ret = -1;
+
+ if (pbi->ready_for_new_data == 1)
+ return ret;
+
+ /* ie no raw frame to show!!! */
+ if (pbi->common.show_frame == 0)
+ return ret;
+
+ pbi->ready_for_new_data = 1;
+ *time_stamp = pbi->last_time_stamp;
+ *time_end_stamp = 0;
+
+#if CONFIG_POSTPROC
+ ret = vp8_post_proc_frame(&pbi->common, sd, flags);
+#else
+ (void)flags;
+
+ if (pbi->common.frame_to_show)
+ {
+ *sd = *pbi->common.frame_to_show;
+ sd->y_width = pbi->common.Width;
+ sd->y_height = pbi->common.Height;
+ sd->uv_height = pbi->common.Height / 2;
+ ret = 0;
+ }
+ else
+ {
+ ret = -1;
+ }
+
+#endif /*!CONFIG_POSTPROC*/
+ vp8_clear_system_state();
+ return ret;
+}
+
+
+/* This function as written isn't decoder specific, but the encoder has
+ * much faster ways of computing this, so it's ok for it to live in a
+ * decode specific file.
+ */
+int vp8dx_references_buffer( VP8_COMMON *oci, int ref_frame )
+{
+ const MODE_INFO *mi = oci->mi;
+ int mb_row, mb_col;
+
+ for (mb_row = 0; mb_row < oci->mb_rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < oci->mb_cols; mb_col++,mi++)
+ {
+ if( mi->mbmi.ref_frame == ref_frame)
+ return 1;
+ }
+ mi++;
+ }
+ return 0;
+
+}
+
+int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf)
+{
+ if(!fb->use_frame_threads)
+ {
+ /* decoder instance for single thread mode */
+ fb->pbi[0] = create_decompressor(oxcf);
+ if(!fb->pbi[0])
+ return VPX_CODEC_ERROR;
+
+#if CONFIG_MULTITHREAD
+ /* enable row-based threading only when use_frame_threads
+ * is disabled */
+ fb->pbi[0]->max_threads = oxcf->max_threads;
+ vp8_decoder_create_threads(fb->pbi[0]);
+#endif
+ }
+ else
+ {
+ /* TODO : create frame threads and decoder instances for each
+ * thread here */
+ }
+
+ return VPX_CODEC_OK;
+}
+
+int vp8_remove_decoder_instances(struct frame_buffers *fb)
+{
+ if(!fb->use_frame_threads)
+ {
+ VP8D_COMP *pbi = fb->pbi[0];
+
+ if (!pbi)
+ return VPX_CODEC_ERROR;
+#if CONFIG_MULTITHREAD
+ if (pbi->b_multithreaded_rd)
+ vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows);
+ vp8_decoder_remove_threads(pbi);
+#endif
+
+ /* decoder instance for single thread mode */
+ remove_decompressor(pbi);
+ }
+ else
+ {
+ /* TODO : remove frame threads and decoder instances for each
+ * thread here */
+ }
+
+ return VPX_CODEC_OK;
+}
diff --git a/thirdparty/libvpx/vp8/decoder/onyxd_int.h b/thirdparty/libvpx/vp8/decoder/onyxd_int.h
new file mode 100644
index 0000000000..313fe01c07
--- /dev/null
+++ b/thirdparty/libvpx/vp8/decoder/onyxd_int.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_DECODER_ONYXD_INT_H_
+#define VP8_DECODER_ONYXD_INT_H_
+
+#include "vpx_config.h"
+#include "vp8/common/onyxd.h"
+#include "treereader.h"
+#include "vp8/common/onyxc_int.h"
+#include "vp8/common/threading.h"
+
+#if CONFIG_ERROR_CONCEALMENT
+#include "ec_types.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct
+{
+ int ithread;
+ void *ptr1;
+ void *ptr2;
+} DECODETHREAD_DATA;
+
+typedef struct
+{
+ MACROBLOCKD mbd;
+} MB_ROW_DEC;
+
+
+typedef struct
+{
+ int enabled;
+ unsigned int count;
+ const unsigned char *ptrs[MAX_PARTITIONS];
+ unsigned int sizes[MAX_PARTITIONS];
+} FRAGMENT_DATA;
+
+#define MAX_FB_MT_DEC 32
+
+struct frame_buffers
+{
+ /*
+ * this struct will be populated with frame buffer management
+ * info in future commits. */
+
+ /* enable/disable frame-based threading */
+ int use_frame_threads;
+
+ /* decoder instances */
+ struct VP8D_COMP *pbi[MAX_FB_MT_DEC];
+
+};
+
+typedef struct VP8D_COMP
+{
+ DECLARE_ALIGNED(16, MACROBLOCKD, mb);
+
+ YV12_BUFFER_CONFIG *dec_fb_ref[NUM_YV12_BUFFERS];
+
+ DECLARE_ALIGNED(16, VP8_COMMON, common);
+
+ /* the last partition will be used for the modes/mvs */
+ vp8_reader mbc[MAX_PARTITIONS];
+
+ VP8D_CONFIG oxcf;
+
+ FRAGMENT_DATA fragments;
+
+#if CONFIG_MULTITHREAD
+ /* variable for threading */
+
+ int b_multithreaded_rd;
+ int max_threads;
+ int current_mb_col_main;
+ unsigned int decoding_thread_count;
+ int allocated_decoding_thread_count;
+
+ int mt_baseline_filter_level[MAX_MB_SEGMENTS];
+ int sync_range;
+ int *mt_current_mb_col; /* Each row remembers its already decoded column. */
+ pthread_mutex_t *pmutex;
+ pthread_mutex_t mt_mutex; /* mutex for b_multithreaded_rd */
+
+ unsigned char **mt_yabove_row; /* mb_rows x width */
+ unsigned char **mt_uabove_row;
+ unsigned char **mt_vabove_row;
+ unsigned char **mt_yleft_col; /* mb_rows x 16 */
+ unsigned char **mt_uleft_col; /* mb_rows x 8 */
+ unsigned char **mt_vleft_col; /* mb_rows x 8 */
+
+ MB_ROW_DEC *mb_row_di;
+ DECODETHREAD_DATA *de_thread_data;
+
+ pthread_t *h_decoding_thread;
+ sem_t *h_event_start_decoding;
+ sem_t h_event_end_decoding;
+ /* end of threading data */
+#endif
+
+ int64_t last_time_stamp;
+ int ready_for_new_data;
+
+ vp8_prob prob_intra;
+ vp8_prob prob_last;
+ vp8_prob prob_gf;
+ vp8_prob prob_skip_false;
+
+#if CONFIG_ERROR_CONCEALMENT
+ MB_OVERLAP *overlaps;
+ /* the mb num from which modes and mvs (first partition) are corrupt */
+ unsigned int mvs_corrupt_from_mb;
+#endif
+ int ec_enabled;
+ int ec_active;
+ int decoded_key_frame;
+ int independent_partitions;
+ int frame_corrupt_residual;
+
+ vpx_decrypt_cb decrypt_cb;
+ void *decrypt_state;
+} VP8D_COMP;
+
+int vp8_decode_frame(VP8D_COMP *cpi);
+
+int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf);
+int vp8_remove_decoder_instances(struct frame_buffers *fb);
+
+#if CONFIG_DEBUG
+#define CHECK_MEM_ERROR(lval,expr) do {\
+ lval = (expr); \
+ if(!lval) \
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
+ "Failed to allocate "#lval" at %s:%d", \
+ __FILE__,__LINE__);\
+ } while(0)
+#else
+#define CHECK_MEM_ERROR(lval,expr) do {\
+ lval = (expr); \
+ if(!lval) \
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
+ "Failed to allocate "#lval);\
+ } while(0)
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_DECODER_ONYXD_INT_H_
diff --git a/thirdparty/libvpx/vp8/decoder/threading.c b/thirdparty/libvpx/vp8/decoder/threading.c
new file mode 100644
index 0000000000..3c1b8387ec
--- /dev/null
+++ b/thirdparty/libvpx/vp8/decoder/threading.c
@@ -0,0 +1,928 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#if !defined(WIN32) && CONFIG_OS_SUPPORT == 1
+# include <unistd.h>
+#endif
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/threading.h"
+
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/extend.h"
+#include "vpx_ports/vpx_timer.h"
+#include "detokenize.h"
+#include "vp8/common/reconintra4x4.h"
+#include "vp8/common/reconinter.h"
+#include "vp8/common/reconintra.h"
+#include "vp8/common/setupintrarecon.h"
+#if CONFIG_ERROR_CONCEALMENT
+#include "error_concealment.h"
+#endif
+
+#define CALLOC_ARRAY(p, n) CHECK_MEM_ERROR((p), vpx_calloc(sizeof(*(p)), (n)))
+#define CALLOC_ARRAY_ALIGNED(p, n, algn) do { \
+ CHECK_MEM_ERROR((p), vpx_memalign((algn), sizeof(*(p)) * (n))); \
+ memset((p), 0, (n) * sizeof(*(p))); \
+} while (0)
+
+
+void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
+
+static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
+{
+ VP8_COMMON *const pc = & pbi->common;
+ int i;
+
+ for (i = 0; i < count; i++)
+ {
+ MACROBLOCKD *mbd = &mbrd[i].mbd;
+ mbd->subpixel_predict = xd->subpixel_predict;
+ mbd->subpixel_predict8x4 = xd->subpixel_predict8x4;
+ mbd->subpixel_predict8x8 = xd->subpixel_predict8x8;
+ mbd->subpixel_predict16x16 = xd->subpixel_predict16x16;
+
+ mbd->frame_type = pc->frame_type;
+ mbd->pre = xd->pre;
+ mbd->dst = xd->dst;
+
+ mbd->segmentation_enabled = xd->segmentation_enabled;
+ mbd->mb_segement_abs_delta = xd->mb_segement_abs_delta;
+ memcpy(mbd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));
+
+ /*signed char ref_lf_deltas[MAX_REF_LF_DELTAS];*/
+ memcpy(mbd->ref_lf_deltas, xd->ref_lf_deltas, sizeof(xd->ref_lf_deltas));
+ /*signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];*/
+ memcpy(mbd->mode_lf_deltas, xd->mode_lf_deltas, sizeof(xd->mode_lf_deltas));
+ /*unsigned char mode_ref_lf_delta_enabled;
+ unsigned char mode_ref_lf_delta_update;*/
+ mbd->mode_ref_lf_delta_enabled = xd->mode_ref_lf_delta_enabled;
+ mbd->mode_ref_lf_delta_update = xd->mode_ref_lf_delta_update;
+
+ mbd->current_bc = &pbi->mbc[0];
+
+ memcpy(mbd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc));
+ memcpy(mbd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
+ memcpy(mbd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
+ memcpy(mbd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
+
+ mbd->fullpixel_mask = 0xffffffff;
+
+ if (pc->full_pixel)
+ mbd->fullpixel_mask = 0xfffffff8;
+
+ }
+
+ for (i = 0; i < pc->mb_rows; i++)
+ pbi->mt_current_mb_col[i] = -1;
+}
+
+static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
+ unsigned int mb_idx)
+{
+ MB_PREDICTION_MODE mode;
+ int i;
+#if CONFIG_ERROR_CONCEALMENT
+ int corruption_detected = 0;
+#else
+ (void)mb_idx;
+#endif
+
+ if (xd->mode_info_context->mbmi.mb_skip_coeff)
+ {
+ vp8_reset_mb_tokens_context(xd);
+ }
+ else if (!vp8dx_bool_error(xd->current_bc))
+ {
+ int eobtotal;
+ eobtotal = vp8_decode_mb_tokens(pbi, xd);
+
+ /* Special case: Force the loopfilter to skip when eobtotal is zero */
+ xd->mode_info_context->mbmi.mb_skip_coeff = (eobtotal==0);
+ }
+
+ mode = xd->mode_info_context->mbmi.mode;
+
+ if (xd->segmentation_enabled)
+ vp8_mb_init_dequantizer(pbi, xd);
+
+
+#if CONFIG_ERROR_CONCEALMENT
+
+ if(pbi->ec_active)
+ {
+ int throw_residual;
+ /* When we have independent partitions we can apply residual even
+ * though other partitions within the frame are corrupt.
+ */
+ throw_residual = (!pbi->independent_partitions &&
+ pbi->frame_corrupt_residual);
+ throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc));
+
+ if ((mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual))
+ {
+ /* MB with corrupt residuals or corrupt mode/motion vectors.
+ * Better to use the predictor as reconstruction.
+ */
+ pbi->frame_corrupt_residual = 1;
+ memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+
+ corruption_detected = 1;
+
+ /* force idct to be skipped for B_PRED and use the
+ * prediction only for reconstruction
+ * */
+ memset(xd->eobs, 0, 25);
+ }
+ }
+#endif
+
+ /* do prediction */
+ if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+ {
+ vp8_build_intra_predictors_mbuv_s(xd,
+ xd->recon_above[1],
+ xd->recon_above[2],
+ xd->recon_left[1],
+ xd->recon_left[2],
+ xd->recon_left_stride[1],
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride);
+
+ if (mode != B_PRED)
+ {
+ vp8_build_intra_predictors_mby_s(xd,
+ xd->recon_above[0],
+ xd->recon_left[0],
+ xd->recon_left_stride[0],
+ xd->dst.y_buffer,
+ xd->dst.y_stride);
+ }
+ else
+ {
+ short *DQC = xd->dequant_y1;
+ int dst_stride = xd->dst.y_stride;
+
+ /* clear out residual eob info */
+ if(xd->mode_info_context->mbmi.mb_skip_coeff)
+ memset(xd->eobs, 0, 25);
+
+ intra_prediction_down_copy(xd, xd->recon_above[0] + 16);
+
+ for (i = 0; i < 16; i++)
+ {
+ BLOCKD *b = &xd->block[i];
+ unsigned char *dst = xd->dst.y_buffer + b->offset;
+ B_PREDICTION_MODE b_mode =
+ xd->mode_info_context->bmi[i].as_mode;
+ unsigned char *Above;
+ unsigned char *yleft;
+ int left_stride;
+ unsigned char top_left;
+
+ /*Caution: For some b_mode, it needs 8 pixels (4 above + 4 above-right).*/
+ if (i < 4 && pbi->common.filter_level)
+ Above = xd->recon_above[0] + b->offset;
+ else
+ Above = dst - dst_stride;
+
+ if (i%4==0 && pbi->common.filter_level)
+ {
+ yleft = xd->recon_left[0] + i;
+ left_stride = 1;
+ }
+ else
+ {
+ yleft = dst - 1;
+ left_stride = dst_stride;
+ }
+
+ if ((i==4 || i==8 || i==12) && pbi->common.filter_level)
+ top_left = *(xd->recon_left[0] + i - 1);
+ else
+ top_left = Above[-1];
+
+ vp8_intra4x4_predict(Above, yleft, left_stride,
+ b_mode, dst, dst_stride, top_left);
+
+ if (xd->eobs[i] )
+ {
+ if (xd->eobs[i] > 1)
+ {
+ vp8_dequant_idct_add(b->qcoeff, DQC, dst, dst_stride);
+ }
+ else
+ {
+ vp8_dc_only_idct_add(b->qcoeff[0] * DQC[0],
+ dst, dst_stride, dst, dst_stride);
+ memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ vp8_build_inter_predictors_mb(xd);
+ }
+
+
+#if CONFIG_ERROR_CONCEALMENT
+ if (corruption_detected)
+ {
+ return;
+ }
+#endif
+
+ if(!xd->mode_info_context->mbmi.mb_skip_coeff)
+ {
+ /* dequantization and idct */
+ if (mode != B_PRED)
+ {
+ short *DQC = xd->dequant_y1;
+
+ if (mode != SPLITMV)
+ {
+ BLOCKD *b = &xd->block[24];
+
+ /* do 2nd order transform on the dc block */
+ if (xd->eobs[24] > 1)
+ {
+ vp8_dequantize_b(b, xd->dequant_y2);
+
+ vp8_short_inv_walsh4x4(&b->dqcoeff[0],
+ xd->qcoeff);
+ memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0]));
+ }
+ else
+ {
+ b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
+ vp8_short_inv_walsh4x4_1(&b->dqcoeff[0],
+ xd->qcoeff);
+ memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
+ }
+
+ /* override the dc dequant constant in order to preserve the
+ * dc components
+ */
+ DQC = xd->dequant_y1_dc;
+ }
+
+ vp8_dequant_idct_add_y_block
+ (xd->qcoeff, DQC,
+ xd->dst.y_buffer,
+ xd->dst.y_stride, xd->eobs);
+ }
+
+ vp8_dequant_idct_add_uv_block
+ (xd->qcoeff+16*16, xd->dequant_uv,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride, xd->eobs+16);
+ }
+}
+
+static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, int start_mb_row)
+{
+ const int *last_row_current_mb_col;
+ int *current_mb_col;
+ int mb_row;
+ VP8_COMMON *pc = &pbi->common;
+ const int nsync = pbi->sync_range;
+ const int first_row_no_sync_above = pc->mb_cols + nsync;
+ int num_part = 1 << pbi->common.multi_token_partition;
+ int last_mb_row = start_mb_row;
+
+ YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME];
+ YV12_BUFFER_CONFIG *yv12_fb_lst = pbi->dec_fb_ref[LAST_FRAME];
+
+ int recon_y_stride = yv12_fb_new->y_stride;
+ int recon_uv_stride = yv12_fb_new->uv_stride;
+
+ unsigned char *ref_buffer[MAX_REF_FRAMES][3];
+ unsigned char *dst_buffer[3];
+ int i;
+ int ref_fb_corrupted[MAX_REF_FRAMES];
+
+ ref_fb_corrupted[INTRA_FRAME] = 0;
+
+ for(i = 1; i < MAX_REF_FRAMES; i++)
+ {
+ YV12_BUFFER_CONFIG *this_fb = pbi->dec_fb_ref[i];
+
+ ref_buffer[i][0] = this_fb->y_buffer;
+ ref_buffer[i][1] = this_fb->u_buffer;
+ ref_buffer[i][2] = this_fb->v_buffer;
+
+ ref_fb_corrupted[i] = this_fb->corrupted;
+ }
+
+ dst_buffer[0] = yv12_fb_new->y_buffer;
+ dst_buffer[1] = yv12_fb_new->u_buffer;
+ dst_buffer[2] = yv12_fb_new->v_buffer;
+
+ xd->up_available = (start_mb_row != 0);
+
+ xd->mode_info_context = pc->mi + pc->mode_info_stride * start_mb_row;
+ xd->mode_info_stride = pc->mode_info_stride;
+
+ for (mb_row = start_mb_row; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1))
+ {
+ int recon_yoffset, recon_uvoffset;
+ int mb_col;
+ int filter_level;
+ loop_filter_info_n *lfi_n = &pc->lf_info;
+
+ /* save last row processed by this thread */
+ last_mb_row = mb_row;
+ /* select bool coder for current partition */
+ xd->current_bc = &pbi->mbc[mb_row%num_part];
+
+ if (mb_row > 0)
+ last_row_current_mb_col = &pbi->mt_current_mb_col[mb_row -1];
+ else
+ last_row_current_mb_col = &first_row_no_sync_above;
+
+ current_mb_col = &pbi->mt_current_mb_col[mb_row];
+
+ recon_yoffset = mb_row * recon_y_stride * 16;
+ recon_uvoffset = mb_row * recon_uv_stride * 8;
+
+ /* reset contexts */
+ xd->above_context = pc->above_context;
+ memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ xd->left_available = 0;
+
+ xd->mb_to_top_edge = -((mb_row * 16)) << 3;
+ xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+ if (pbi->common.filter_level)
+ {
+ xd->recon_above[0] = pbi->mt_yabove_row[mb_row] + 0*16 +32;
+ xd->recon_above[1] = pbi->mt_uabove_row[mb_row] + 0*8 +16;
+ xd->recon_above[2] = pbi->mt_vabove_row[mb_row] + 0*8 +16;
+
+ xd->recon_left[0] = pbi->mt_yleft_col[mb_row];
+ xd->recon_left[1] = pbi->mt_uleft_col[mb_row];
+ xd->recon_left[2] = pbi->mt_vleft_col[mb_row];
+
+ /* TODO: move to outside row loop */
+ xd->recon_left_stride[0] = 1;
+ xd->recon_left_stride[1] = 1;
+ }
+ else
+ {
+ xd->recon_above[0] = dst_buffer[0] + recon_yoffset;
+ xd->recon_above[1] = dst_buffer[1] + recon_uvoffset;
+ xd->recon_above[2] = dst_buffer[2] + recon_uvoffset;
+
+ xd->recon_left[0] = xd->recon_above[0] - 1;
+ xd->recon_left[1] = xd->recon_above[1] - 1;
+ xd->recon_left[2] = xd->recon_above[2] - 1;
+
+ xd->recon_above[0] -= xd->dst.y_stride;
+ xd->recon_above[1] -= xd->dst.uv_stride;
+ xd->recon_above[2] -= xd->dst.uv_stride;
+
+ /* TODO: move to outside row loop */
+ xd->recon_left_stride[0] = xd->dst.y_stride;
+ xd->recon_left_stride[1] = xd->dst.uv_stride;
+
+ setup_intra_recon_left(xd->recon_left[0], xd->recon_left[1],
+ xd->recon_left[2], xd->dst.y_stride,
+ xd->dst.uv_stride);
+ }
+
+ for (mb_col = 0; mb_col < pc->mb_cols; mb_col++) {
+ if (((mb_col - 1) % nsync) == 0) {
+ pthread_mutex_t *mutex = &pbi->pmutex[mb_row];
+ protected_write(mutex, current_mb_col, mb_col - 1);
+ }
+
+ if (mb_row && !(mb_col & (nsync - 1))) {
+ pthread_mutex_t *mutex = &pbi->pmutex[mb_row-1];
+ sync_read(mutex, mb_col, last_row_current_mb_col, nsync);
+ }
+
+ /* Distance of MB to the various image edges.
+ * These are specified to 8th pel as they are always
+ * compared to values that are in 1/8th pel units.
+ */
+ xd->mb_to_left_edge = -((mb_col * 16) << 3);
+ xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+
+ #if CONFIG_ERROR_CONCEALMENT
+ {
+ int corrupt_residual =
+ (!pbi->independent_partitions &&
+ pbi->frame_corrupt_residual) ||
+ vp8dx_bool_error(xd->current_bc);
+ if (pbi->ec_active &&
+ (xd->mode_info_context->mbmi.ref_frame ==
+ INTRA_FRAME) &&
+ corrupt_residual)
+ {
+ /* We have an intra block with corrupt
+ * coefficients, better to conceal with an inter
+ * block.
+ * Interpolate MVs from neighboring MBs
+ *
+ * Note that for the first mb with corrupt
+ * residual in a frame, we might not discover
+ * that before decoding the residual. That
+ * happens after this check, and therefore no
+ * inter concealment will be done.
+ */
+ vp8_interpolate_motion(xd,
+ mb_row, mb_col,
+ pc->mb_rows, pc->mb_cols);
+ }
+ }
+ #endif
+
+
+ xd->dst.y_buffer = dst_buffer[0] + recon_yoffset;
+ xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset;
+ xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset;
+
+ xd->pre.y_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][0] + recon_yoffset;
+ xd->pre.u_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][1] + recon_uvoffset;
+ xd->pre.v_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][2] + recon_uvoffset;
+
+ /* propagate errors from reference frames */
+ xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame];
+
+ mt_decode_macroblock(pbi, xd, 0);
+
+ xd->left_available = 1;
+
+ /* check if the boolean decoder has suffered an error */
+ xd->corrupted |= vp8dx_bool_error(xd->current_bc);
+
+ xd->recon_above[0] += 16;
+ xd->recon_above[1] += 8;
+ xd->recon_above[2] += 8;
+
+ if (!pbi->common.filter_level)
+ {
+ xd->recon_left[0] += 16;
+ xd->recon_left[1] += 8;
+ xd->recon_left[2] += 8;
+ }
+
+ if (pbi->common.filter_level)
+ {
+ int skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED &&
+ xd->mode_info_context->mbmi.mode != SPLITMV &&
+ xd->mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[xd->mode_info_context->mbmi.mode];
+ const int seg = xd->mode_info_context->mbmi.segment_id;
+ const int ref_frame = xd->mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+ if( mb_row != pc->mb_rows-1 )
+ {
+ /* Save decoded MB last row data for next-row decoding */
+ memcpy((pbi->mt_yabove_row[mb_row + 1] + 32 + mb_col*16), (xd->dst.y_buffer + 15 * recon_y_stride), 16);
+ memcpy((pbi->mt_uabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.u_buffer + 7 * recon_uv_stride), 8);
+ memcpy((pbi->mt_vabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.v_buffer + 7 * recon_uv_stride), 8);
+ }
+
+ /* save left_col for next MB decoding */
+ if(mb_col != pc->mb_cols-1)
+ {
+ MODE_INFO *next = xd->mode_info_context +1;
+
+ if (next->mbmi.ref_frame == INTRA_FRAME)
+ {
+ for (i = 0; i < 16; i++)
+ pbi->mt_yleft_col[mb_row][i] = xd->dst.y_buffer [i* recon_y_stride + 15];
+ for (i = 0; i < 8; i++)
+ {
+ pbi->mt_uleft_col[mb_row][i] = xd->dst.u_buffer [i* recon_uv_stride + 7];
+ pbi->mt_vleft_col[mb_row][i] = xd->dst.v_buffer [i* recon_uv_stride + 7];
+ }
+ }
+ }
+
+ /* loopfilter on this macroblock. */
+ if (filter_level)
+ {
+ if(pc->filter_type == NORMAL_LOOPFILTER)
+ {
+ loop_filter_info lfi;
+ FRAME_TYPE frame_type = pc->frame_type;
+ const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+ lfi.mblim = lfi_n->mblim[filter_level];
+ lfi.blim = lfi_n->blim[filter_level];
+ lfi.lim = lfi_n->lim[filter_level];
+ lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+ if (mb_col > 0)
+ vp8_loop_filter_mbv
+ (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bv
+ (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_mbh
+ (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bh
+ (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
+ }
+ else
+ {
+ if (mb_col > 0)
+ vp8_loop_filter_simple_mbv
+ (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bv
+ (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_simple_mbh
+ (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bh
+ (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);
+ }
+ }
+
+ }
+
+ recon_yoffset += 16;
+ recon_uvoffset += 8;
+
+ ++xd->mode_info_context; /* next mb */
+
+ xd->above_context++;
+ }
+
+ /* adjust to the next row of mbs */
+ if (pbi->common.filter_level)
+ {
+ if(mb_row != pc->mb_rows-1)
+ {
+ int lasty = yv12_fb_lst->y_width + VP8BORDERINPIXELS;
+ int lastuv = (yv12_fb_lst->y_width>>1) + (VP8BORDERINPIXELS>>1);
+
+ for (i = 0; i < 4; i++)
+ {
+ pbi->mt_yabove_row[mb_row +1][lasty + i] = pbi->mt_yabove_row[mb_row +1][lasty -1];
+ pbi->mt_uabove_row[mb_row +1][lastuv + i] = pbi->mt_uabove_row[mb_row +1][lastuv -1];
+ pbi->mt_vabove_row[mb_row +1][lastuv + i] = pbi->mt_vabove_row[mb_row +1][lastuv -1];
+ }
+ }
+ }
+ else
+ vp8_extend_mb_row(yv12_fb_new, xd->dst.y_buffer + 16,
+ xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+
+ /* last MB of row is ready just after extension is done */
+ protected_write(&pbi->pmutex[mb_row], current_mb_col, mb_col + nsync);
+
+ ++xd->mode_info_context; /* skip prediction column */
+ xd->up_available = 1;
+
+ /* since we have multithread */
+ xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;
+ }
+
+ /* signal end of frame decoding if this thread processed the last mb_row */
+ if (last_mb_row == (pc->mb_rows - 1))
+ sem_post(&pbi->h_event_end_decoding);
+
+}
+
+
+static THREAD_FUNCTION thread_decoding_proc(void *p_data)
+{
+ int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
+ VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1);
+ MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2);
+ ENTROPY_CONTEXT_PLANES mb_row_left_context;
+
+ while (1)
+ {
+ if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0)
+ break;
+
+ if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0)
+ {
+ if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0)
+ break;
+ else
+ {
+ MACROBLOCKD *xd = &mbrd->mbd;
+ xd->left_context = &mb_row_left_context;
+
+ mt_decode_mb_rows(pbi, xd, ithread+1);
+ }
+ }
+ }
+
+ return 0 ;
+}
+
+
+void vp8_decoder_create_threads(VP8D_COMP *pbi)
+{
+ int core_count = 0;
+ unsigned int ithread;
+
+ pbi->b_multithreaded_rd = 0;
+ pbi->allocated_decoding_thread_count = 0;
+ pthread_mutex_init(&pbi->mt_mutex, NULL);
+
+ /* limit decoding threads to the max number of token partitions */
+ core_count = (pbi->max_threads > 8) ? 8 : pbi->max_threads;
+
+ /* limit decoding threads to the available cores */
+ if (core_count > pbi->common.processor_core_count)
+ core_count = pbi->common.processor_core_count;
+
+ if (core_count > 1)
+ {
+ pbi->b_multithreaded_rd = 1;
+ pbi->decoding_thread_count = core_count - 1;
+
+ CALLOC_ARRAY(pbi->h_decoding_thread, pbi->decoding_thread_count);
+ CALLOC_ARRAY(pbi->h_event_start_decoding, pbi->decoding_thread_count);
+ CALLOC_ARRAY_ALIGNED(pbi->mb_row_di, pbi->decoding_thread_count, 32);
+ CALLOC_ARRAY(pbi->de_thread_data, pbi->decoding_thread_count);
+
+ for (ithread = 0; ithread < pbi->decoding_thread_count; ithread++)
+ {
+ sem_init(&pbi->h_event_start_decoding[ithread], 0, 0);
+
+ vp8_setup_block_dptrs(&pbi->mb_row_di[ithread].mbd);
+
+ pbi->de_thread_data[ithread].ithread = ithread;
+ pbi->de_thread_data[ithread].ptr1 = (void *)pbi;
+ pbi->de_thread_data[ithread].ptr2 = (void *) &pbi->mb_row_di[ithread];
+
+ pthread_create(&pbi->h_decoding_thread[ithread], 0, thread_decoding_proc, (&pbi->de_thread_data[ithread]));
+ }
+
+ sem_init(&pbi->h_event_end_decoding, 0, 0);
+
+ pbi->allocated_decoding_thread_count = pbi->decoding_thread_count;
+ }
+}
+
+
+void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows)
+{
+ int i;
+
+ if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd))
+ {
+ /* De-allocate mutex */
+ if (pbi->pmutex != NULL) {
+ for (i = 0; i < mb_rows; i++) {
+ pthread_mutex_destroy(&pbi->pmutex[i]);
+ }
+ vpx_free(pbi->pmutex);
+ pbi->pmutex = NULL;
+ }
+
+ vpx_free(pbi->mt_current_mb_col);
+ pbi->mt_current_mb_col = NULL ;
+
+ /* Free above_row buffers. */
+ if (pbi->mt_yabove_row)
+ {
+ for (i=0; i< mb_rows; i++)
+ {
+ vpx_free(pbi->mt_yabove_row[i]);
+ pbi->mt_yabove_row[i] = NULL ;
+ }
+ vpx_free(pbi->mt_yabove_row);
+ pbi->mt_yabove_row = NULL ;
+ }
+
+ if (pbi->mt_uabove_row)
+ {
+ for (i=0; i< mb_rows; i++)
+ {
+ vpx_free(pbi->mt_uabove_row[i]);
+ pbi->mt_uabove_row[i] = NULL ;
+ }
+ vpx_free(pbi->mt_uabove_row);
+ pbi->mt_uabove_row = NULL ;
+ }
+
+ if (pbi->mt_vabove_row)
+ {
+ for (i=0; i< mb_rows; i++)
+ {
+ vpx_free(pbi->mt_vabove_row[i]);
+ pbi->mt_vabove_row[i] = NULL ;
+ }
+ vpx_free(pbi->mt_vabove_row);
+ pbi->mt_vabove_row = NULL ;
+ }
+
+ /* Free left_col buffers. */
+ if (pbi->mt_yleft_col)
+ {
+ for (i=0; i< mb_rows; i++)
+ {
+ vpx_free(pbi->mt_yleft_col[i]);
+ pbi->mt_yleft_col[i] = NULL ;
+ }
+ vpx_free(pbi->mt_yleft_col);
+ pbi->mt_yleft_col = NULL ;
+ }
+
+ if (pbi->mt_uleft_col)
+ {
+ for (i=0; i< mb_rows; i++)
+ {
+ vpx_free(pbi->mt_uleft_col[i]);
+ pbi->mt_uleft_col[i] = NULL ;
+ }
+ vpx_free(pbi->mt_uleft_col);
+ pbi->mt_uleft_col = NULL ;
+ }
+
+ if (pbi->mt_vleft_col)
+ {
+ for (i=0; i< mb_rows; i++)
+ {
+ vpx_free(pbi->mt_vleft_col[i]);
+ pbi->mt_vleft_col[i] = NULL ;
+ }
+ vpx_free(pbi->mt_vleft_col);
+ pbi->mt_vleft_col = NULL ;
+ }
+ }
+}
+
+
+void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
+{
+ VP8_COMMON *const pc = & pbi->common;
+ int i;
+ int uv_width;
+
+ if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd))
+ {
+ vp8mt_de_alloc_temp_buffers(pbi, prev_mb_rows);
+
+ /* our internal buffers are always multiples of 16 */
+ if ((width & 0xf) != 0)
+ width += 16 - (width & 0xf);
+
+ if (width < 640) pbi->sync_range = 1;
+ else if (width <= 1280) pbi->sync_range = 8;
+ else if (width <= 2560) pbi->sync_range =16;
+ else pbi->sync_range = 32;
+
+ uv_width = width >>1;
+
+ /* Allocate mutex */
+ CHECK_MEM_ERROR(pbi->pmutex, vpx_malloc(sizeof(*pbi->pmutex) *
+ pc->mb_rows));
+ if (pbi->pmutex) {
+ for (i = 0; i < pc->mb_rows; i++) {
+ pthread_mutex_init(&pbi->pmutex[i], NULL);
+ }
+ }
+
+ /* Allocate an int for each mb row. */
+ CALLOC_ARRAY(pbi->mt_current_mb_col, pc->mb_rows);
+
+ /* Allocate memory for above_row buffers. */
+ CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows);
+ for (i = 0; i < pc->mb_rows; i++)
+ CHECK_MEM_ERROR(pbi->mt_yabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (width + (VP8BORDERINPIXELS<<1))));
+
+ CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows);
+ for (i = 0; i < pc->mb_rows; i++)
+ CHECK_MEM_ERROR(pbi->mt_uabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
+
+ CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows);
+ for (i = 0; i < pc->mb_rows; i++)
+ CHECK_MEM_ERROR(pbi->mt_vabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
+
+ /* Allocate memory for left_col buffers. */
+ CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows);
+ for (i = 0; i < pc->mb_rows; i++)
+ CHECK_MEM_ERROR(pbi->mt_yleft_col[i], vpx_calloc(sizeof(unsigned char) * 16, 1));
+
+ CALLOC_ARRAY(pbi->mt_uleft_col, pc->mb_rows);
+ for (i = 0; i < pc->mb_rows; i++)
+ CHECK_MEM_ERROR(pbi->mt_uleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));
+
+ CALLOC_ARRAY(pbi->mt_vleft_col, pc->mb_rows);
+ for (i = 0; i < pc->mb_rows; i++)
+ CHECK_MEM_ERROR(pbi->mt_vleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));
+ }
+}
+
+
+void vp8_decoder_remove_threads(VP8D_COMP *pbi)
+{
+ /* shutdown MB Decoding thread; */
+ if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd))
+ {
+ int i;
+
+ protected_write(&pbi->mt_mutex, &pbi->b_multithreaded_rd, 0);
+
+ /* allow all threads to exit */
+ for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
+ {
+ sem_post(&pbi->h_event_start_decoding[i]);
+ pthread_join(pbi->h_decoding_thread[i], NULL);
+ }
+
+ for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
+ {
+ sem_destroy(&pbi->h_event_start_decoding[i]);
+ }
+
+ sem_destroy(&pbi->h_event_end_decoding);
+
+ vpx_free(pbi->h_decoding_thread);
+ pbi->h_decoding_thread = NULL;
+
+ vpx_free(pbi->h_event_start_decoding);
+ pbi->h_event_start_decoding = NULL;
+
+ vpx_free(pbi->mb_row_di);
+ pbi->mb_row_di = NULL ;
+
+ vpx_free(pbi->de_thread_data);
+ pbi->de_thread_data = NULL;
+ }
+ pthread_mutex_destroy(&pbi->mt_mutex);
+}
+
+void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
+{
+ VP8_COMMON *pc = &pbi->common;
+ unsigned int i;
+ int j;
+
+ int filter_level = pc->filter_level;
+ YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME];
+
+ if (filter_level)
+ {
+ /* Set above_row buffer to 127 for decoding first MB row */
+ memset(pbi->mt_yabove_row[0] + VP8BORDERINPIXELS-1, 127, yv12_fb_new->y_width + 5);
+ memset(pbi->mt_uabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (yv12_fb_new->y_width>>1) +5);
+ memset(pbi->mt_vabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (yv12_fb_new->y_width>>1) +5);
+
+ for (j=1; j<pc->mb_rows; j++)
+ {
+ memset(pbi->mt_yabove_row[j] + VP8BORDERINPIXELS-1, (unsigned char)129, 1);
+ memset(pbi->mt_uabove_row[j] + (VP8BORDERINPIXELS>>1)-1, (unsigned char)129, 1);
+ memset(pbi->mt_vabove_row[j] + (VP8BORDERINPIXELS>>1)-1, (unsigned char)129, 1);
+ }
+
+ /* Set left_col to 129 initially */
+ for (j=0; j<pc->mb_rows; j++)
+ {
+ memset(pbi->mt_yleft_col[j], (unsigned char)129, 16);
+ memset(pbi->mt_uleft_col[j], (unsigned char)129, 8);
+ memset(pbi->mt_vleft_col[j], (unsigned char)129, 8);
+ }
+
+ /* Initialize the loop filter for this frame. */
+ vp8_loop_filter_frame_init(pc, &pbi->mb, filter_level);
+ }
+ else
+ vp8_setup_intra_recon_top_line(yv12_fb_new);
+
+ setup_decoding_thread_data(pbi, xd, pbi->mb_row_di, pbi->decoding_thread_count);
+
+ for (i = 0; i < pbi->decoding_thread_count; i++)
+ sem_post(&pbi->h_event_start_decoding[i]);
+
+ mt_decode_mb_rows(pbi, xd, 0);
+
+ sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */
+}
diff --git a/thirdparty/libvpx/vp8/decoder/treereader.h b/thirdparty/libvpx/vp8/decoder/treereader.h
new file mode 100644
index 0000000000..f7d23c3698
--- /dev/null
+++ b/thirdparty/libvpx/vp8/decoder/treereader.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_DECODER_TREEREADER_H_
+#define VP8_DECODER_TREEREADER_H_
+
+#include "./vpx_config.h"
+#include "vp8/common/treecoder.h"
+#include "dboolhuff.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef BOOL_DECODER vp8_reader;
+
+#define vp8_read vp8dx_decode_bool
+#define vp8_read_literal vp8_decode_value
+#define vp8_read_bit(R) vp8_read(R, vp8_prob_half)
+
+
+/* Intent of tree data structure is to make decoding trivial. */
+
+static INLINE int vp8_treed_read(
+ vp8_reader *const r, /* !!! must return a 0 or 1 !!! */
+ vp8_tree t,
+ const vp8_prob *const p
+)
+{
+ register vp8_tree_index i = 0;
+
+ while ((i = t[ i + vp8_read(r, p[i>>1])]) > 0) ;
+
+ return -i;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_DECODER_TREEREADER_H_
diff --git a/thirdparty/libvpx/vp8/vp8_dx_iface.c b/thirdparty/libvpx/vp8/vp8_dx_iface.c
new file mode 100644
index 0000000000..fc9288d62b
--- /dev/null
+++ b/thirdparty/libvpx/vp8/vp8_dx_iface.c
@@ -0,0 +1,828 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+#include "vpx/vpx_decoder.h"
+#include "vpx/vp8dx.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_version.h"
+#include "common/alloccommon.h"
+#include "common/common.h"
+#include "common/onyxd.h"
+#include "decoder/onyxd_int.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#if CONFIG_ERROR_CONCEALMENT
+#include "decoder/error_concealment.h"
+#endif
+#include "decoder/decoderthreading.h"
+
+#define VP8_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0)
+#define VP8_CAP_ERROR_CONCEALMENT (CONFIG_ERROR_CONCEALMENT ? \
+ VPX_CODEC_CAP_ERROR_CONCEALMENT : 0)
+
+typedef vpx_codec_stream_info_t vp8_stream_info_t;
+
+/* Structures for handling memory allocations */
+typedef enum
+{
+ VP8_SEG_ALG_PRIV = 256,
+ VP8_SEG_MAX
+} mem_seg_id_t;
+#define NELEMENTS(x) ((int)(sizeof(x)/sizeof(x[0])))
+
+struct vpx_codec_alg_priv
+{
+ vpx_codec_priv_t base;
+ vpx_codec_dec_cfg_t cfg;
+ vp8_stream_info_t si;
+ int decoder_init;
+ int postproc_cfg_set;
+ vp8_postproc_cfg_t postproc_cfg;
+#if CONFIG_POSTPROC_VISUALIZER
+ unsigned int dbg_postproc_flag;
+ int dbg_color_ref_frame_flag;
+ int dbg_color_mb_modes_flag;
+ int dbg_color_b_modes_flag;
+ int dbg_display_mv_flag;
+#endif
+ vpx_decrypt_cb decrypt_cb;
+ void *decrypt_state;
+ vpx_image_t img;
+ int img_setup;
+ struct frame_buffers yv12_frame_buffers;
+ void *user_priv;
+ FRAGMENT_DATA fragments;
+};
+
+static int vp8_init_ctx(vpx_codec_ctx_t *ctx)
+{
+ vpx_codec_alg_priv_t *priv =
+ (vpx_codec_alg_priv_t *)vpx_calloc(1, sizeof(*priv));
+ if (!priv) return 1;
+
+ ctx->priv = (vpx_codec_priv_t *)priv;
+ ctx->priv->init_flags = ctx->init_flags;
+
+ priv->si.sz = sizeof(priv->si);
+ priv->decrypt_cb = NULL;
+ priv->decrypt_state = NULL;
+
+ if (ctx->config.dec)
+ {
+ /* Update the reference to the config structure to an internal copy. */
+ priv->cfg = *ctx->config.dec;
+ ctx->config.dec = &priv->cfg;
+ }
+
+ return 0;
+}
+
+static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
+ vpx_codec_priv_enc_mr_cfg_t *data)
+{
+ vpx_codec_err_t res = VPX_CODEC_OK;
+ vpx_codec_alg_priv_t *priv = NULL;
+ (void) data;
+
+ vp8_rtcd();
+ vpx_dsp_rtcd();
+ vpx_scale_rtcd();
+
+ /* This function only allocates space for the vpx_codec_alg_priv_t
+ * structure. More memory may be required at the time the stream
+ * information becomes known.
+ */
+ if (!ctx->priv) {
+ if (vp8_init_ctx(ctx)) return VPX_CODEC_MEM_ERROR;
+ priv = (vpx_codec_alg_priv_t *)ctx->priv;
+
+ /* initialize number of fragments to zero */
+ priv->fragments.count = 0;
+ /* is input fragments enabled? */
+ priv->fragments.enabled =
+ (priv->base.init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS);
+
+ /*post processing level initialized to do nothing */
+ } else {
+ priv = (vpx_codec_alg_priv_t *)ctx->priv;
+ }
+
+ priv->yv12_frame_buffers.use_frame_threads =
+ (ctx->priv->init_flags & VPX_CODEC_USE_FRAME_THREADING);
+
+ /* for now, disable frame threading */
+ priv->yv12_frame_buffers.use_frame_threads = 0;
+
+ if (priv->yv12_frame_buffers.use_frame_threads &&
+ ((ctx->priv->init_flags & VPX_CODEC_USE_ERROR_CONCEALMENT) ||
+ (ctx->priv->init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS))) {
+ /* row-based threading, error concealment, and input fragments will
+ * not be supported when using frame-based threading */
+ res = VPX_CODEC_INVALID_PARAM;
+ }
+
+ return res;
+}
+
+static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx)
+{
+ vp8_remove_decoder_instances(&ctx->yv12_frame_buffers);
+
+ vpx_free(ctx);
+
+ return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data,
+ unsigned int data_sz,
+ vpx_codec_stream_info_t *si,
+ vpx_decrypt_cb decrypt_cb,
+ void *decrypt_state)
+{
+ vpx_codec_err_t res = VPX_CODEC_OK;
+
+ assert(data != NULL);
+
+ if(data + data_sz <= data)
+ {
+ res = VPX_CODEC_INVALID_PARAM;
+ }
+ else
+ {
+ /* Parse uncompresssed part of key frame header.
+ * 3 bytes:- including version, frame type and an offset
+ * 3 bytes:- sync code (0x9d, 0x01, 0x2a)
+ * 4 bytes:- including image width and height in the lowest 14 bits
+ * of each 2-byte value.
+ */
+ uint8_t clear_buffer[10];
+ const uint8_t *clear = data;
+ if (decrypt_cb)
+ {
+ int n = VPXMIN(sizeof(clear_buffer), data_sz);
+ decrypt_cb(decrypt_state, data, clear_buffer, n);
+ clear = clear_buffer;
+ }
+ si->is_kf = 0;
+
+ if (data_sz >= 10 && !(clear[0] & 0x01)) /* I-Frame */
+ {
+ si->is_kf = 1;
+
+ /* vet via sync code */
+ if (clear[3] != 0x9d || clear[4] != 0x01 || clear[5] != 0x2a)
+ return VPX_CODEC_UNSUP_BITSTREAM;
+
+ si->w = (clear[6] | (clear[7] << 8)) & 0x3fff;
+ si->h = (clear[8] | (clear[9] << 8)) & 0x3fff;
+
+ /*printf("w=%d, h=%d\n", si->w, si->h);*/
+ if (!(si->h | si->w))
+ res = VPX_CODEC_UNSUP_BITSTREAM;
+ }
+ else
+ {
+ res = VPX_CODEC_UNSUP_BITSTREAM;
+ }
+ }
+
+ return res;
+}
+
+static vpx_codec_err_t vp8_peek_si(const uint8_t *data,
+ unsigned int data_sz,
+ vpx_codec_stream_info_t *si) {
+ return vp8_peek_si_internal(data, data_sz, si, NULL, NULL);
+}
+
+static vpx_codec_err_t vp8_get_si(vpx_codec_alg_priv_t *ctx,
+ vpx_codec_stream_info_t *si)
+{
+
+ unsigned int sz;
+
+ if (si->sz >= sizeof(vp8_stream_info_t))
+ sz = sizeof(vp8_stream_info_t);
+ else
+ sz = sizeof(vpx_codec_stream_info_t);
+
+ memcpy(si, &ctx->si, sz);
+ si->sz = sz;
+
+ return VPX_CODEC_OK;
+}
+
+
+static vpx_codec_err_t
+update_error_state(vpx_codec_alg_priv_t *ctx,
+ const struct vpx_internal_error_info *error)
+{
+ vpx_codec_err_t res;
+
+ if ((res = error->error_code))
+ ctx->base.err_detail = error->has_detail
+ ? error->detail
+ : NULL;
+
+ return res;
+}
+
+static void yuvconfig2image(vpx_image_t *img,
+ const YV12_BUFFER_CONFIG *yv12,
+ void *user_priv)
+{
+ /** vpx_img_wrap() doesn't allow specifying independent strides for
+ * the Y, U, and V planes, nor other alignment adjustments that
+ * might be representable by a YV12_BUFFER_CONFIG, so we just
+ * initialize all the fields.*/
+ img->fmt = VPX_IMG_FMT_I420;
+ img->w = yv12->y_stride;
+ img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15;
+ img->d_w = img->r_w = yv12->y_width;
+ img->d_h = img->r_h = yv12->y_height;
+ img->x_chroma_shift = 1;
+ img->y_chroma_shift = 1;
+ img->planes[VPX_PLANE_Y] = yv12->y_buffer;
+ img->planes[VPX_PLANE_U] = yv12->u_buffer;
+ img->planes[VPX_PLANE_V] = yv12->v_buffer;
+ img->planes[VPX_PLANE_ALPHA] = NULL;
+ img->stride[VPX_PLANE_Y] = yv12->y_stride;
+ img->stride[VPX_PLANE_U] = yv12->uv_stride;
+ img->stride[VPX_PLANE_V] = yv12->uv_stride;
+ img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
+ img->bit_depth = 8;
+ img->bps = 12;
+ img->user_priv = user_priv;
+ img->img_data = yv12->buffer_alloc;
+ img->img_data_owner = 0;
+ img->self_allocd = 0;
+}
+
+static int
+update_fragments(vpx_codec_alg_priv_t *ctx,
+ const uint8_t *data,
+ unsigned int data_sz,
+ vpx_codec_err_t *res)
+{
+ *res = VPX_CODEC_OK;
+
+ if (ctx->fragments.count == 0)
+ {
+ /* New frame, reset fragment pointers and sizes */
+ memset((void*)ctx->fragments.ptrs, 0, sizeof(ctx->fragments.ptrs));
+ memset(ctx->fragments.sizes, 0, sizeof(ctx->fragments.sizes));
+ }
+ if (ctx->fragments.enabled && !(data == NULL && data_sz == 0))
+ {
+ /* Store a pointer to this fragment and return. We haven't
+ * received the complete frame yet, so we will wait with decoding.
+ */
+ ctx->fragments.ptrs[ctx->fragments.count] = data;
+ ctx->fragments.sizes[ctx->fragments.count] = data_sz;
+ ctx->fragments.count++;
+ if (ctx->fragments.count > (1 << EIGHT_PARTITION) + 1)
+ {
+ ctx->fragments.count = 0;
+ *res = VPX_CODEC_INVALID_PARAM;
+ return -1;
+ }
+ return 0;
+ }
+
+ if (!ctx->fragments.enabled && (data == NULL && data_sz == 0))
+ {
+ return 0;
+ }
+
+ if (!ctx->fragments.enabled)
+ {
+ ctx->fragments.ptrs[0] = data;
+ ctx->fragments.sizes[0] = data_sz;
+ ctx->fragments.count = 1;
+ }
+
+ return 1;
+}
+
+static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
+ const uint8_t *data,
+ unsigned int data_sz,
+ void *user_priv,
+ long deadline)
+{
+ vpx_codec_err_t res = VPX_CODEC_OK;
+ unsigned int resolution_change = 0;
+ unsigned int w, h;
+
+ if (!ctx->fragments.enabled && (data == NULL && data_sz == 0))
+ {
+ return 0;
+ }
+
+ /* Update the input fragment data */
+ if(update_fragments(ctx, data, data_sz, &res) <= 0)
+ return res;
+
+ /* Determine the stream parameters. Note that we rely on peek_si to
+ * validate that we have a buffer that does not wrap around the top
+ * of the heap.
+ */
+ w = ctx->si.w;
+ h = ctx->si.h;
+
+ res = vp8_peek_si_internal(ctx->fragments.ptrs[0], ctx->fragments.sizes[0],
+ &ctx->si, ctx->decrypt_cb, ctx->decrypt_state);
+
+ if((res == VPX_CODEC_UNSUP_BITSTREAM) && !ctx->si.is_kf)
+ {
+ /* the peek function returns an error for non keyframes, however for
+ * this case, it is not an error */
+ res = VPX_CODEC_OK;
+ }
+
+ if(!ctx->decoder_init && !ctx->si.is_kf)
+ res = VPX_CODEC_UNSUP_BITSTREAM;
+
+ if ((ctx->si.h != h) || (ctx->si.w != w))
+ resolution_change = 1;
+
+ /* Initialize the decoder instance on the first frame*/
+ if (!res && !ctx->decoder_init)
+ {
+ VP8D_CONFIG oxcf;
+
+ oxcf.Width = ctx->si.w;
+ oxcf.Height = ctx->si.h;
+ oxcf.Version = 9;
+ oxcf.postprocess = 0;
+ oxcf.max_threads = ctx->cfg.threads;
+ oxcf.error_concealment =
+ (ctx->base.init_flags & VPX_CODEC_USE_ERROR_CONCEALMENT);
+
+ /* If postprocessing was enabled by the application and a
+ * configuration has not been provided, default it.
+ */
+ if (!ctx->postproc_cfg_set
+ && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) {
+ ctx->postproc_cfg.post_proc_flag =
+ VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE;
+ ctx->postproc_cfg.deblocking_level = 4;
+ ctx->postproc_cfg.noise_level = 0;
+ }
+
+ res = vp8_create_decoder_instances(&ctx->yv12_frame_buffers, &oxcf);
+ ctx->decoder_init = 1;
+ }
+
+ /* Set these even if already initialized. The caller may have changed the
+ * decrypt config between frames.
+ */
+ if (ctx->decoder_init) {
+ ctx->yv12_frame_buffers.pbi[0]->decrypt_cb = ctx->decrypt_cb;
+ ctx->yv12_frame_buffers.pbi[0]->decrypt_state = ctx->decrypt_state;
+ }
+
+ if (!res)
+ {
+ VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0];
+ if (resolution_change)
+ {
+ VP8_COMMON *const pc = & pbi->common;
+ MACROBLOCKD *const xd = & pbi->mb;
+#if CONFIG_MULTITHREAD
+ int i;
+#endif
+ pc->Width = ctx->si.w;
+ pc->Height = ctx->si.h;
+ {
+ int prev_mb_rows = pc->mb_rows;
+
+ if (setjmp(pbi->common.error.jmp))
+ {
+ pbi->common.error.setjmp = 0;
+ vp8_clear_system_state();
+ /* same return value as used in vp8dx_receive_compressed_data */
+ return -1;
+ }
+
+ pbi->common.error.setjmp = 1;
+
+ if (pc->Width <= 0)
+ {
+ pc->Width = w;
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Invalid frame width");
+ }
+
+ if (pc->Height <= 0)
+ {
+ pc->Height = h;
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Invalid frame height");
+ }
+
+ if (vp8_alloc_frame_buffers(pc, pc->Width, pc->Height))
+ vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffers");
+
+ xd->pre = pc->yv12_fb[pc->lst_fb_idx];
+ xd->dst = pc->yv12_fb[pc->new_fb_idx];
+
+#if CONFIG_MULTITHREAD
+ for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
+ {
+ pbi->mb_row_di[i].mbd.dst = pc->yv12_fb[pc->new_fb_idx];
+ vp8_build_block_doffsets(&pbi->mb_row_di[i].mbd);
+ }
+#endif
+ vp8_build_block_doffsets(&pbi->mb);
+
+ /* allocate memory for last frame MODE_INFO array */
+#if CONFIG_ERROR_CONCEALMENT
+
+ if (pbi->ec_enabled)
+ {
+ /* old prev_mip was released by vp8_de_alloc_frame_buffers()
+ * called in vp8_alloc_frame_buffers() */
+ pc->prev_mip = vpx_calloc(
+ (pc->mb_cols + 1) * (pc->mb_rows + 1),
+ sizeof(MODE_INFO));
+
+ if (!pc->prev_mip)
+ {
+ vp8_de_alloc_frame_buffers(pc);
+ vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate"
+ "last frame MODE_INFO array");
+ }
+
+ pc->prev_mi = pc->prev_mip + pc->mode_info_stride + 1;
+
+ if (vp8_alloc_overlap_lists(pbi))
+ vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate overlap lists "
+ "for error concealment");
+ }
+
+#endif
+
+#if CONFIG_MULTITHREAD
+ if (pbi->b_multithreaded_rd)
+ vp8mt_alloc_temp_buffers(pbi, pc->Width, prev_mb_rows);
+#else
+ (void)prev_mb_rows;
+#endif
+ }
+
+ pbi->common.error.setjmp = 0;
+
+ /* required to get past the first get_free_fb() call */
+ pbi->common.fb_idx_ref_cnt[0] = 0;
+ }
+
+ /* update the pbi fragment data */
+ pbi->fragments = ctx->fragments;
+
+ ctx->user_priv = user_priv;
+ if (vp8dx_receive_compressed_data(pbi, data_sz, data, deadline))
+ {
+ res = update_error_state(ctx, &pbi->common.error);
+ }
+
+ /* get ready for the next series of fragments */
+ ctx->fragments.count = 0;
+ }
+
+ return res;
+}
+
+static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx,
+ vpx_codec_iter_t *iter)
+{
+ vpx_image_t *img = NULL;
+
+ /* iter acts as a flip flop, so an image is only returned on the first
+ * call to get_frame.
+ */
+ if (!(*iter) && ctx->yv12_frame_buffers.pbi[0])
+ {
+ YV12_BUFFER_CONFIG sd;
+ int64_t time_stamp = 0, time_end_stamp = 0;
+ vp8_ppflags_t flags;
+ vp8_zero(flags);
+
+ if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
+ {
+ flags.post_proc_flag= ctx->postproc_cfg.post_proc_flag
+#if CONFIG_POSTPROC_VISUALIZER
+
+ | ((ctx->dbg_color_ref_frame_flag != 0) ? VP8D_DEBUG_CLR_FRM_REF_BLKS : 0)
+ | ((ctx->dbg_color_mb_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0)
+ | ((ctx->dbg_color_b_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0)
+ | ((ctx->dbg_display_mv_flag != 0) ? VP8D_DEBUG_DRAW_MV : 0)
+#endif
+ ;
+ flags.deblocking_level = ctx->postproc_cfg.deblocking_level;
+ flags.noise_level = ctx->postproc_cfg.noise_level;
+#if CONFIG_POSTPROC_VISUALIZER
+ flags.display_ref_frame_flag= ctx->dbg_color_ref_frame_flag;
+ flags.display_mb_modes_flag = ctx->dbg_color_mb_modes_flag;
+ flags.display_b_modes_flag = ctx->dbg_color_b_modes_flag;
+ flags.display_mv_flag = ctx->dbg_display_mv_flag;
+#endif
+ }
+
+ if (0 == vp8dx_get_raw_frame(ctx->yv12_frame_buffers.pbi[0], &sd,
+ &time_stamp, &time_end_stamp, &flags))
+ {
+ yuvconfig2image(&ctx->img, &sd, ctx->user_priv);
+
+ img = &ctx->img;
+ *iter = img;
+ }
+ }
+
+ return img;
+}
+
+static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
+ YV12_BUFFER_CONFIG *yv12)
+{
+ const int y_w = img->d_w;
+ const int y_h = img->d_h;
+ const int uv_w = (img->d_w + 1) / 2;
+ const int uv_h = (img->d_h + 1) / 2;
+ vpx_codec_err_t res = VPX_CODEC_OK;
+ yv12->y_buffer = img->planes[VPX_PLANE_Y];
+ yv12->u_buffer = img->planes[VPX_PLANE_U];
+ yv12->v_buffer = img->planes[VPX_PLANE_V];
+
+ yv12->y_crop_width = y_w;
+ yv12->y_crop_height = y_h;
+ yv12->y_width = y_w;
+ yv12->y_height = y_h;
+ yv12->uv_crop_width = uv_w;
+ yv12->uv_crop_height = uv_h;
+ yv12->uv_width = uv_w;
+ yv12->uv_height = uv_h;
+
+ yv12->y_stride = img->stride[VPX_PLANE_Y];
+ yv12->uv_stride = img->stride[VPX_PLANE_U];
+
+ yv12->border = (img->stride[VPX_PLANE_Y] - img->d_w) / 2;
+ return res;
+}
+
+
+static vpx_codec_err_t vp8_set_reference(vpx_codec_alg_priv_t *ctx,
+ va_list args)
+{
+
+ vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+
+ if (data && !ctx->yv12_frame_buffers.use_frame_threads)
+ {
+ vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+ YV12_BUFFER_CONFIG sd;
+
+ image2yuvconfig(&frame->img, &sd);
+
+ return vp8dx_set_reference(ctx->yv12_frame_buffers.pbi[0],
+ frame->frame_type, &sd);
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+
+}
+
+static vpx_codec_err_t vp8_get_reference(vpx_codec_alg_priv_t *ctx,
+ va_list args)
+{
+
+ vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+
+ if (data && !ctx->yv12_frame_buffers.use_frame_threads)
+ {
+ vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+ YV12_BUFFER_CONFIG sd;
+
+ image2yuvconfig(&frame->img, &sd);
+
+ return vp8dx_get_reference(ctx->yv12_frame_buffers.pbi[0],
+ frame->frame_type, &sd);
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+
+}
+
+static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx,
+ va_list args)
+{
+#if CONFIG_POSTPROC
+ vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
+
+ if (data)
+ {
+ ctx->postproc_cfg_set = 1;
+ ctx->postproc_cfg = *((vp8_postproc_cfg_t *)data);
+ return VPX_CODEC_OK;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+
+#else
+ (void)ctx;
+ (void)args;
+ return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+
+static vpx_codec_err_t vp8_set_dbg_color_ref_frame(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
+ ctx->dbg_color_ref_frame_flag = va_arg(args, int);
+ return VPX_CODEC_OK;
+#else
+ (void)ctx;
+ (void)args;
+ return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+static vpx_codec_err_t vp8_set_dbg_color_mb_modes(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
+ ctx->dbg_color_mb_modes_flag = va_arg(args, int);
+ return VPX_CODEC_OK;
+#else
+ (void)ctx;
+ (void)args;
+ return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+static vpx_codec_err_t vp8_set_dbg_color_b_modes(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
+ ctx->dbg_color_b_modes_flag = va_arg(args, int);
+ return VPX_CODEC_OK;
+#else
+ (void)ctx;
+ (void)args;
+ return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+static vpx_codec_err_t vp8_set_dbg_display_mv(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
+ ctx->dbg_display_mv_flag = va_arg(args, int);
+ return VPX_CODEC_OK;
+#else
+ (void)ctx;
+ (void)args;
+ return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
+ va_list args)
+{
+ int *update_info = va_arg(args, int *);
+
+ if (update_info && !ctx->yv12_frame_buffers.use_frame_threads)
+ {
+ VP8D_COMP *pbi = (VP8D_COMP *)ctx->yv12_frame_buffers.pbi[0];
+
+ *update_info = pbi->common.refresh_alt_ref_frame * (int) VP8_ALTR_FRAME
+ + pbi->common.refresh_golden_frame * (int) VP8_GOLD_FRAME
+ + pbi->common.refresh_last_frame * (int) VP8_LAST_FRAME;
+
+ return VPX_CODEC_OK;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+}
+
+extern int vp8dx_references_buffer( VP8_COMMON *oci, int ref_frame );
+static vpx_codec_err_t vp8_get_last_ref_frame(vpx_codec_alg_priv_t *ctx,
+ va_list args)
+{
+ int *ref_info = va_arg(args, int *);
+
+ if (ref_info && !ctx->yv12_frame_buffers.use_frame_threads)
+ {
+ VP8D_COMP *pbi = (VP8D_COMP *)ctx->yv12_frame_buffers.pbi[0];
+ VP8_COMMON *oci = &pbi->common;
+ *ref_info =
+ (vp8dx_references_buffer( oci, ALTREF_FRAME )?VP8_ALTR_FRAME:0) |
+ (vp8dx_references_buffer( oci, GOLDEN_FRAME )?VP8_GOLD_FRAME:0) |
+ (vp8dx_references_buffer( oci, LAST_FRAME )?VP8_LAST_FRAME:0);
+
+ return VPX_CODEC_OK;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+}
+
+static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
+ va_list args)
+{
+
+ int *corrupted = va_arg(args, int *);
+ VP8D_COMP *pbi = (VP8D_COMP *)ctx->yv12_frame_buffers.pbi[0];
+
+ if (corrupted && pbi)
+ {
+ const YV12_BUFFER_CONFIG *const frame = pbi->common.frame_to_show;
+ if (frame == NULL) return VPX_CODEC_ERROR;
+ *corrupted = frame->corrupted;
+ return VPX_CODEC_OK;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+
+}
+
+static vpx_codec_err_t vp8_set_decryptor(vpx_codec_alg_priv_t *ctx,
+ va_list args)
+{
+ vpx_decrypt_init *init = va_arg(args, vpx_decrypt_init *);
+
+ if (init)
+ {
+ ctx->decrypt_cb = init->decrypt_cb;
+ ctx->decrypt_state = init->decrypt_state;
+ }
+ else
+ {
+ ctx->decrypt_cb = NULL;
+ ctx->decrypt_state = NULL;
+ }
+ return VPX_CODEC_OK;
+}
+
+vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] =
+{
+ {VP8_SET_REFERENCE, vp8_set_reference},
+ {VP8_COPY_REFERENCE, vp8_get_reference},
+ {VP8_SET_POSTPROC, vp8_set_postproc},
+ {VP8_SET_DBG_COLOR_REF_FRAME, vp8_set_dbg_color_ref_frame},
+ {VP8_SET_DBG_COLOR_MB_MODES, vp8_set_dbg_color_mb_modes},
+ {VP8_SET_DBG_COLOR_B_MODES, vp8_set_dbg_color_b_modes},
+ {VP8_SET_DBG_DISPLAY_MV, vp8_set_dbg_display_mv},
+ {VP8D_GET_LAST_REF_UPDATES, vp8_get_last_ref_updates},
+ {VP8D_GET_FRAME_CORRUPTED, vp8_get_frame_corrupted},
+ {VP8D_GET_LAST_REF_USED, vp8_get_last_ref_frame},
+ {VPXD_SET_DECRYPTOR, vp8_set_decryptor},
+ { -1, NULL},
+};
+
+
+#ifndef VERSION_STRING
+#define VERSION_STRING
+#endif
+CODEC_INTERFACE(vpx_codec_vp8_dx) =
+{
+ "WebM Project VP8 Decoder" VERSION_STRING,
+ VPX_CODEC_INTERNAL_ABI_VERSION,
+ VPX_CODEC_CAP_DECODER | VP8_CAP_POSTPROC | VP8_CAP_ERROR_CONCEALMENT |
+ VPX_CODEC_CAP_INPUT_FRAGMENTS,
+ /* vpx_codec_caps_t caps; */
+ vp8_init, /* vpx_codec_init_fn_t init; */
+ vp8_destroy, /* vpx_codec_destroy_fn_t destroy; */
+ vp8_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
+ {
+ vp8_peek_si, /* vpx_codec_peek_si_fn_t peek_si; */
+ vp8_get_si, /* vpx_codec_get_si_fn_t get_si; */
+ vp8_decode, /* vpx_codec_decode_fn_t decode; */
+ vp8_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */
+ NULL,
+ },
+ { /* encoder functions */
+ 0,
+ NULL, /* vpx_codec_enc_cfg_map_t */
+ NULL, /* vpx_codec_encode_fn_t */
+ NULL, /* vpx_codec_get_cx_data_fn_t */
+ NULL, /* vpx_codec_enc_config_set_fn_t */
+ NULL, /* vpx_codec_get_global_headers_fn_t */
+ NULL, /* vpx_codec_get_preview_frame_fn_t */
+ NULL /* vpx_codec_enc_mr_get_mem_loc_fn_t */
+ }
+};