diff options
26 files changed, 5429 insertions, 47 deletions
diff --git a/thirdparty/libvpx/rtcd/vp8_rtcd_arm.h b/thirdparty/libvpx/rtcd/vp8_rtcd_arm.h new file mode 100644 index 0000000000..5c9b7aa392 --- /dev/null +++ b/thirdparty/libvpx/rtcd/vp8_rtcd_arm.h @@ -0,0 +1,240 @@ +#ifndef VP8_RTCD_H_ +#define VP8_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP8 + */ + +struct blockd; +struct loop_filter_info; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c + +void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_clear_system_state_c(); +#define vp8_clear_system_state vp8_clear_system_state_c + +void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem16x16_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); + +void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x4_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); + +void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x8_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); + +void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +void vp8_dc_only_idct_add_neon(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride); +void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *output, int stride); +RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *output, int stride); + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +RTCD_EXTERN void (*vp8_dequant_idct_add_uv_block)(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs); + +void vp8_dequantize_b_c(struct blockd*, short *dqc); +void vp8_dequantize_b_neon(struct blockd*, short *dqc); +RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *dqc); + +void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bhs_neon(unsigned char *y, int ystride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y, int ystride, const unsigned char *blimit); + +void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bvs_neon(unsigned char *y, int ystride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y, int ystride, const unsigned char *blimit); + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_mbhs_neon(unsigned char *y, int ystride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y, int ystride, const unsigned char *blimit); + +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_mbvs_neon(unsigned char *y, int ystride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y, int ystride, const unsigned char *blimit); + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); + +void vp8_short_inv_walsh4x4_c(short *input, short *output); +void vp8_short_inv_walsh4x4_neon(short *input, short *output); +RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *output); + +void vp8_short_inv_walsh4x4_1_c(short *input, short *output); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_c + +void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_neon; +#endif + vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_neon; +#endif + vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_neon; +#endif + vp8_copy_mem16x16 = vp8_copy_mem16x16_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_copy_mem16x16 = vp8_copy_mem16x16_neon; +#endif + vp8_copy_mem8x4 = vp8_copy_mem8x4_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_copy_mem8x4 = vp8_copy_mem8x4_neon; +#endif + vp8_copy_mem8x8 = vp8_copy_mem8x8_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_copy_mem8x8 = vp8_copy_mem8x8_neon; +#endif + vp8_dc_only_idct_add = vp8_dc_only_idct_add_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_dc_only_idct_add = vp8_dc_only_idct_add_neon; +#endif + vp8_dequant_idct_add = vp8_dequant_idct_add_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_dequant_idct_add = vp8_dequant_idct_add_neon; +#endif + vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_neon; +#endif + vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_neon; +#endif + vp8_dequantize_b = vp8_dequantize_b_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_dequantize_b = vp8_dequantize_b_neon; +#endif + vp8_loop_filter_bh = vp8_loop_filter_bh_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_loop_filter_bh = vp8_loop_filter_bh_neon; +#endif + vp8_loop_filter_bv = vp8_loop_filter_bv_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_loop_filter_bv = vp8_loop_filter_bv_neon; +#endif + vp8_loop_filter_mbh = vp8_loop_filter_mbh_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_loop_filter_mbh = vp8_loop_filter_mbh_neon; +#endif + vp8_loop_filter_mbv = vp8_loop_filter_mbv_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_loop_filter_mbv = vp8_loop_filter_mbv_neon; +#endif + vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_neon; +#endif + vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_neon; +#endif + vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_loop_filter_simple_mbh = vp8_loop_filter_mbhs_neon; +#endif + vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_loop_filter_simple_mbv = vp8_loop_filter_mbvs_neon; +#endif + vp8_short_idct4x4llm = vp8_short_idct4x4llm_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_short_idct4x4llm = vp8_short_idct4x4llm_neon; +#endif + vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_neon; +#endif + vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_neon; +#endif + vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_neon; +#endif + vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_neon; +#endif +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/thirdparty/libvpx/rtcd/vp8_rtcd_c.h b/thirdparty/libvpx/rtcd/vp8_rtcd_c.h new file mode 100644 index 0000000000..d1657ac09d --- /dev/null +++ b/thirdparty/libvpx/rtcd/vp8_rtcd_c.h @@ -0,0 +1,117 @@ +#ifndef VP8_RTCD_H_ +#define VP8_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP8 + */ + +struct blockd; +struct loop_filter_info; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_c + +void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c + +void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_c + +void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_c + +void vp8_clear_system_state_c(); +#define vp8_clear_system_state vp8_clear_system_state_c + +void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +#define vp8_copy_mem16x16 vp8_copy_mem16x16_c + +void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +#define vp8_copy_mem8x4 vp8_copy_mem8x4_c + +void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +#define vp8_copy_mem8x8 vp8_copy_mem8x8_c + +void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +#define vp8_dc_only_idct_add vp8_dc_only_idct_add_c + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride); +#define vp8_dequant_idct_add vp8_dequant_idct_add_c + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c + +void vp8_dequantize_b_c(struct blockd*, short *dqc); +#define vp8_dequantize_b vp8_dequantize_b_c + +void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bh vp8_loop_filter_bh_c + +void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bv vp8_loop_filter_bv_c + +void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbh vp8_loop_filter_mbh_c + +void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbv vp8_loop_filter_mbv_c + +void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_c + +void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_c + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_c + +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_c + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +#define vp8_short_idct4x4llm vp8_short_idct4x4llm_c + +void vp8_short_inv_walsh4x4_c(short *input, short *output); +#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_c + +void vp8_short_inv_walsh4x4_1_c(short *input, short *output); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_c + +void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_c + +void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_c + +void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_c + +void vp8_rtcd(void); + +#ifdef RTCD_C +static void setup_rtcd_internal(void) +{ +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/thirdparty/libvpx/rtcd/vp8_rtcd_x86.h b/thirdparty/libvpx/rtcd/vp8_rtcd_x86.h new file mode 100644 index 0000000000..cbe1f47b2a --- /dev/null +++ b/thirdparty/libvpx/rtcd/vp8_rtcd_x86.h @@ -0,0 +1,247 @@ +#ifndef VP8_RTCD_H_ +#define VP8_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP8 + */ + +struct blockd; +struct loop_filter_info; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_clear_system_state_c(); +void vpx_reset_mmx_state(); +RTCD_EXTERN void (*vp8_clear_system_state)(); + +void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); + +void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); + +void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); + +void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride); +void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride); +RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *output, int stride); + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +void vp8_dequant_idct_add_uv_block_mmx(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +RTCD_EXTERN void (*vp8_dequant_idct_add_uv_block)(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs); + +void vp8_dequantize_b_c(struct blockd*, short *dqc); +void vp8_dequantize_b_mmx(struct blockd*, short *dqc); +RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *dqc); + +void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bhs_mmx(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y, int ystride, const unsigned char *blimit); + +void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bvs_mmx(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y, int ystride, const unsigned char *blimit); + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y, int ystride, const unsigned char *blimit); + +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y, int ystride, const unsigned char *blimit); + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); + +void vp8_short_inv_walsh4x4_c(short *input, short *output); +void vp8_short_inv_walsh4x4_mmx(short *input, short *output); +void vp8_short_inv_walsh4x4_sse2(short *input, short *output); +RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *output); + +void vp8_short_inv_walsh4x4_1_c(short *input, short *output); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_c; + if (flags & HAS_MMX) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_mmx; + if (flags & HAS_SSE2) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_sse2; + if (flags & HAS_SSSE3) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_ssse3; + vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_c; + if (flags & HAS_MMX) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_mmx; + vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_c; + if (flags & HAS_MMX) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_mmx; + vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_c; + if (flags & HAS_MMX) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_mmx; + if (flags & HAS_SSE2) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_sse2; + if (flags & HAS_SSSE3) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_ssse3; + vp8_clear_system_state = vp8_clear_system_state_c; + if (flags & HAS_MMX) vp8_clear_system_state = vpx_reset_mmx_state; + vp8_copy_mem16x16 = vp8_copy_mem16x16_c; + if (flags & HAS_MMX) vp8_copy_mem16x16 = vp8_copy_mem16x16_mmx; + if (flags & HAS_SSE2) vp8_copy_mem16x16 = vp8_copy_mem16x16_sse2; + vp8_copy_mem8x4 = vp8_copy_mem8x4_c; + if (flags & HAS_MMX) vp8_copy_mem8x4 = vp8_copy_mem8x4_mmx; + vp8_copy_mem8x8 = vp8_copy_mem8x8_c; + if (flags & HAS_MMX) vp8_copy_mem8x8 = vp8_copy_mem8x8_mmx; + vp8_dc_only_idct_add = vp8_dc_only_idct_add_c; + if (flags & HAS_MMX) vp8_dc_only_idct_add = vp8_dc_only_idct_add_mmx; + vp8_dequant_idct_add = vp8_dequant_idct_add_c; + if (flags & HAS_MMX) vp8_dequant_idct_add = vp8_dequant_idct_add_mmx; + vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_c; + if (flags & HAS_MMX) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_mmx; + if (flags & HAS_SSE2) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2; + vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_c; + if (flags & HAS_MMX) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_mmx; + if (flags & HAS_SSE2) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_sse2; + vp8_dequantize_b = vp8_dequantize_b_c; + if (flags & HAS_MMX) vp8_dequantize_b = vp8_dequantize_b_mmx; + vp8_loop_filter_bh = vp8_loop_filter_bh_c; + if (flags & HAS_MMX) vp8_loop_filter_bh = vp8_loop_filter_bh_mmx; + if (flags & HAS_SSE2) vp8_loop_filter_bh = vp8_loop_filter_bh_sse2; + vp8_loop_filter_bv = vp8_loop_filter_bv_c; + if (flags & HAS_MMX) vp8_loop_filter_bv = vp8_loop_filter_bv_mmx; + if (flags & HAS_SSE2) vp8_loop_filter_bv = vp8_loop_filter_bv_sse2; + vp8_loop_filter_mbh = vp8_loop_filter_mbh_c; + if (flags & HAS_MMX) vp8_loop_filter_mbh = vp8_loop_filter_mbh_mmx; + if (flags & HAS_SSE2) vp8_loop_filter_mbh = vp8_loop_filter_mbh_sse2; + vp8_loop_filter_mbv = vp8_loop_filter_mbv_c; + if (flags & HAS_MMX) vp8_loop_filter_mbv = vp8_loop_filter_mbv_mmx; + if (flags & HAS_SSE2) vp8_loop_filter_mbv = vp8_loop_filter_mbv_sse2; + vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_c; + if (flags & HAS_MMX) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_mmx; + if (flags & HAS_SSE2) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_sse2; + vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_c; + if (flags & HAS_MMX) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_mmx; + if (flags & HAS_SSE2) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_sse2; + vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_c; + if (flags & HAS_MMX) vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_mmx; + if (flags & HAS_SSE2) vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_sse2; + vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_c; + if (flags & HAS_MMX) vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_mmx; + if (flags & HAS_SSE2) vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_sse2; + vp8_short_idct4x4llm = vp8_short_idct4x4llm_c; + if (flags & HAS_MMX) vp8_short_idct4x4llm = vp8_short_idct4x4llm_mmx; + vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_c; + if (flags & HAS_MMX) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_mmx; + if (flags & HAS_SSE2) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_sse2; + vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_c; + if (flags & HAS_MMX) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_mmx; + if (flags & HAS_SSE2) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3; + vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_c; + if (flags & HAS_MMX) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_mmx; + if (flags & HAS_SSSE3) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_ssse3; + vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_c; + if (flags & HAS_MMX) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_mmx; + if (flags & HAS_SSE2) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_ssse3; + vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_c; + if (flags & HAS_MMX) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_mmx; + if (flags & HAS_SSE2) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/thirdparty/libvpx/rtcd/vp9_rtcd_arm.h b/thirdparty/libvpx/rtcd/vp9_rtcd_arm.h new file mode 100644 index 0000000000..afdc7e179e --- /dev/null +++ b/thirdparty/libvpx/rtcd/vp9_rtcd_arm.h @@ -0,0 +1,54 @@ +#ifndef VP9_RTCD_H_ +#define VP9_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP9 + */ + +#include "vp9/common/vp9_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); +#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); + +void vp9_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + vp9_iht4x4_16_add = vp9_iht4x4_16_add_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp9_iht4x4_16_add = vp9_iht4x4_16_add_neon; +#endif + vp9_iht8x8_64_add = vp9_iht8x8_64_add_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp9_iht8x8_64_add = vp9_iht8x8_64_add_neon; +#endif +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/thirdparty/libvpx/rtcd/vp9_rtcd_c.h b/thirdparty/libvpx/rtcd/vp9_rtcd_c.h new file mode 100644 index 0000000000..329cb9d04c --- /dev/null +++ b/thirdparty/libvpx/rtcd/vp9_rtcd_c.h @@ -0,0 +1,41 @@ +#ifndef VP9_RTCD_H_ +#define VP9_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP9 + */ + +#include "vp9/common/vp9_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); +#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c + +void vp9_rtcd(void); + +#ifdef RTCD_C +static void setup_rtcd_internal(void) +{ +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/thirdparty/libvpx/rtcd/vp9_rtcd_x86.h b/thirdparty/libvpx/rtcd/vp9_rtcd_x86.h new file mode 100644 index 0000000000..8ce8067674 --- /dev/null +++ b/thirdparty/libvpx/rtcd/vp9_rtcd_x86.h @@ -0,0 +1,55 @@ +#ifndef VP9_RTCD_H_ +#define VP9_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP9 + */ + +#include "vp9/common/vp9_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); +void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); +RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); + +void vp9_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + vp9_iht16x16_256_add = vp9_iht16x16_256_add_c; + if (flags & HAS_SSE2) vp9_iht16x16_256_add = vp9_iht16x16_256_add_sse2; + + vp9_iht4x4_16_add = vp9_iht4x4_16_add_c; + if (flags & HAS_SSE2) vp9_iht4x4_16_add = vp9_iht4x4_16_add_sse2; + + vp9_iht8x8_64_add = vp9_iht8x8_64_add_c; + if (flags & HAS_SSE2) vp9_iht8x8_64_add = vp9_iht8x8_64_add_sse2; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/thirdparty/libvpx/rtcd/vpx_dsp_rtcd_arm.h b/thirdparty/libvpx/rtcd/vpx_dsp_rtcd_arm.h new file mode 100644 index 0000000000..df42f3d24a --- /dev/null +++ b/thirdparty/libvpx/rtcd/vpx_dsp_rtcd_arm.h @@ -0,0 +1,678 @@ +#ifndef VPX_DSP_RTCD_H_ +#define VPX_DSP_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * DSP + */ + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c + +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c + +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c + +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c + +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c + +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c + +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d135_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c + +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c + +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c + +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c + +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c + +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c + +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c + +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c + +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c + +void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c + +void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c + +void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c + +void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c + +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c + +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c + +void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c + +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c + +void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c + +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c + +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c + +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c + +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c + +void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c + +void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c + +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c + +void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c + +void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c + +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c + +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c + +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c + +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_horizontal_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_horizontal_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_lpf_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_edge_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_edge_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_edge_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_vertical_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_2d vpx_scaled_2d_c + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c + +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_horiz vpx_scaled_horiz_c + +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_vert vpx_scaled_vert_c + +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c + +void vpx_dsp_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + vpx_convolve8 = vpx_convolve8_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_convolve8 = vpx_convolve8_neon; +#endif + vpx_convolve8_avg = vpx_convolve8_avg_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_convolve8_avg = vpx_convolve8_avg_neon; +#endif + vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_neon; +#endif + vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_neon; +#endif + vpx_convolve8_horiz = vpx_convolve8_horiz_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_convolve8_horiz = vpx_convolve8_horiz_neon; +#endif + vpx_convolve8_vert = vpx_convolve8_vert_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_convolve8_vert = vpx_convolve8_vert_neon; +#endif + vpx_convolve_avg = vpx_convolve_avg_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_convolve_avg = vpx_convolve_avg_neon; +#endif + vpx_convolve_copy = vpx_convolve_copy_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_convolve_copy = vpx_convolve_copy_neon; +#endif + vpx_d135_predictor_4x4 = vpx_d135_predictor_4x4_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_d135_predictor_4x4 = vpx_d135_predictor_4x4_neon; +#endif + vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_neon; +#endif + vpx_d45_predictor_4x4 = vpx_d45_predictor_4x4_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_d45_predictor_4x4 = vpx_d45_predictor_4x4_neon; +#endif + vpx_d45_predictor_8x8 = vpx_d45_predictor_8x8_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_d45_predictor_8x8 = vpx_d45_predictor_8x8_neon; +#endif + vpx_dc_128_predictor_16x16 = vpx_dc_128_predictor_16x16_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_dc_128_predictor_16x16 = vpx_dc_128_predictor_16x16_neon; +#endif + vpx_dc_128_predictor_32x32 = vpx_dc_128_predictor_32x32_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_dc_128_predictor_32x32 = vpx_dc_128_predictor_32x32_neon; +#endif + vpx_dc_128_predictor_4x4 = vpx_dc_128_predictor_4x4_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_dc_128_predictor_4x4 = vpx_dc_128_predictor_4x4_neon; +#endif + vpx_dc_128_predictor_8x8 = vpx_dc_128_predictor_8x8_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_dc_128_predictor_8x8 = vpx_dc_128_predictor_8x8_neon; +#endif + vpx_dc_left_predictor_16x16 = vpx_dc_left_predictor_16x16_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_dc_left_predictor_16x16 = vpx_dc_left_predictor_16x16_neon; +#endif + vpx_dc_left_predictor_32x32 = vpx_dc_left_predictor_32x32_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_dc_left_predictor_32x32 = vpx_dc_left_predictor_32x32_neon; +#endif + vpx_dc_left_predictor_4x4 = vpx_dc_left_predictor_4x4_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_dc_left_predictor_4x4 = vpx_dc_left_predictor_4x4_neon; +#endif + vpx_dc_left_predictor_8x8 = vpx_dc_left_predictor_8x8_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_dc_left_predictor_8x8 = vpx_dc_left_predictor_8x8_neon; +#endif + vpx_dc_predictor_16x16 = vpx_dc_predictor_16x16_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_dc_predictor_16x16 = vpx_dc_predictor_16x16_neon; +#endif + vpx_dc_predictor_32x32 = vpx_dc_predictor_32x32_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_dc_predictor_32x32 = vpx_dc_predictor_32x32_neon; +#endif + vpx_dc_predictor_4x4 = vpx_dc_predictor_4x4_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_dc_predictor_4x4 = vpx_dc_predictor_4x4_neon; +#endif + vpx_dc_predictor_8x8 = vpx_dc_predictor_8x8_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_dc_predictor_8x8 = vpx_dc_predictor_8x8_neon; +#endif + vpx_dc_top_predictor_16x16 = vpx_dc_top_predictor_16x16_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_dc_top_predictor_16x16 = vpx_dc_top_predictor_16x16_neon; +#endif + vpx_dc_top_predictor_32x32 = vpx_dc_top_predictor_32x32_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_dc_top_predictor_32x32 = vpx_dc_top_predictor_32x32_neon; +#endif + vpx_dc_top_predictor_4x4 = vpx_dc_top_predictor_4x4_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_dc_top_predictor_4x4 = vpx_dc_top_predictor_4x4_neon; +#endif + vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_neon; +#endif + vpx_h_predictor_16x16 = vpx_h_predictor_16x16_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_h_predictor_16x16 = vpx_h_predictor_16x16_neon; +#endif + vpx_h_predictor_32x32 = vpx_h_predictor_32x32_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_h_predictor_32x32 = vpx_h_predictor_32x32_neon; +#endif + vpx_h_predictor_4x4 = vpx_h_predictor_4x4_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_h_predictor_4x4 = vpx_h_predictor_4x4_neon; +#endif + vpx_h_predictor_8x8 = vpx_h_predictor_8x8_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_h_predictor_8x8 = vpx_h_predictor_8x8_neon; +#endif + vpx_idct16x16_10_add = vpx_idct16x16_10_add_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_idct16x16_10_add = vpx_idct16x16_10_add_neon; +#endif + vpx_idct16x16_1_add = vpx_idct16x16_1_add_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_idct16x16_1_add = vpx_idct16x16_1_add_neon; +#endif + vpx_idct16x16_256_add = vpx_idct16x16_256_add_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_idct16x16_256_add = vpx_idct16x16_256_add_neon; +#endif + vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_neon; +#endif + vpx_idct32x32_135_add = vpx_idct32x32_135_add_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_idct32x32_135_add = vpx_idct32x32_1024_add_neon; +#endif + vpx_idct32x32_1_add = vpx_idct32x32_1_add_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_idct32x32_1_add = vpx_idct32x32_1_add_neon; +#endif + vpx_idct32x32_34_add = vpx_idct32x32_34_add_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_idct32x32_34_add = vpx_idct32x32_1024_add_neon; +#endif + vpx_idct4x4_16_add = vpx_idct4x4_16_add_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_idct4x4_16_add = vpx_idct4x4_16_add_neon; +#endif + vpx_idct4x4_1_add = vpx_idct4x4_1_add_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_idct4x4_1_add = vpx_idct4x4_1_add_neon; +#endif + vpx_idct8x8_12_add = vpx_idct8x8_12_add_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_idct8x8_12_add = vpx_idct8x8_12_add_neon; +#endif + vpx_idct8x8_1_add = vpx_idct8x8_1_add_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_idct8x8_1_add = vpx_idct8x8_1_add_neon; +#endif + vpx_idct8x8_64_add = vpx_idct8x8_64_add_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_idct8x8_64_add = vpx_idct8x8_64_add_neon; +#endif + vpx_lpf_horizontal_4 = vpx_lpf_horizontal_4_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_lpf_horizontal_4 = vpx_lpf_horizontal_4_neon; +#endif + vpx_lpf_horizontal_4_dual = vpx_lpf_horizontal_4_dual_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_lpf_horizontal_4_dual = vpx_lpf_horizontal_4_dual_neon; +#endif + vpx_lpf_horizontal_8 = vpx_lpf_horizontal_8_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_lpf_horizontal_8 = vpx_lpf_horizontal_8_neon; +#endif + vpx_lpf_horizontal_8_dual = vpx_lpf_horizontal_8_dual_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_lpf_horizontal_8_dual = vpx_lpf_horizontal_8_dual_neon; +#endif + vpx_lpf_horizontal_edge_16 = vpx_lpf_horizontal_edge_16_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_lpf_horizontal_edge_16 = vpx_lpf_horizontal_edge_16_neon; +#endif + vpx_lpf_horizontal_edge_8 = vpx_lpf_horizontal_edge_8_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_lpf_horizontal_edge_8 = vpx_lpf_horizontal_edge_8_neon; +#endif + vpx_lpf_vertical_16 = vpx_lpf_vertical_16_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_lpf_vertical_16 = vpx_lpf_vertical_16_neon; +#endif + vpx_lpf_vertical_16_dual = vpx_lpf_vertical_16_dual_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_lpf_vertical_16_dual = vpx_lpf_vertical_16_dual_neon; +#endif + vpx_lpf_vertical_4 = vpx_lpf_vertical_4_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_lpf_vertical_4 = vpx_lpf_vertical_4_neon; +#endif + vpx_lpf_vertical_4_dual = vpx_lpf_vertical_4_dual_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_lpf_vertical_4_dual = vpx_lpf_vertical_4_dual_neon; +#endif + vpx_lpf_vertical_8 = vpx_lpf_vertical_8_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_lpf_vertical_8 = vpx_lpf_vertical_8_neon; +#endif + vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_neon; +#endif + vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_neon; +#endif + vpx_tm_predictor_32x32 = vpx_tm_predictor_32x32_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_tm_predictor_32x32 = vpx_tm_predictor_32x32_neon; +#endif + vpx_tm_predictor_4x4 = vpx_tm_predictor_4x4_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_tm_predictor_4x4 = vpx_tm_predictor_4x4_neon; +#endif + vpx_tm_predictor_8x8 = vpx_tm_predictor_8x8_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_tm_predictor_8x8 = vpx_tm_predictor_8x8_neon; +#endif + vpx_v_predictor_16x16 = vpx_v_predictor_16x16_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_v_predictor_16x16 = vpx_v_predictor_16x16_neon; +#endif + vpx_v_predictor_32x32 = vpx_v_predictor_32x32_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_v_predictor_32x32 = vpx_v_predictor_32x32_neon; +#endif + vpx_v_predictor_4x4 = vpx_v_predictor_4x4_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_v_predictor_4x4 = vpx_v_predictor_4x4_neon; +#endif + vpx_v_predictor_8x8 = vpx_v_predictor_8x8_c; +#if HAVE_NEON + if (flags & HAS_NEON) vpx_v_predictor_8x8 = vpx_v_predictor_8x8_neon; +#endif +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/thirdparty/libvpx/rtcd/vpx_dsp_rtcd_c.h b/thirdparty/libvpx/rtcd/vpx_dsp_rtcd_c.h new file mode 100644 index 0000000000..9fcc2f0066 --- /dev/null +++ b/thirdparty/libvpx/rtcd/vpx_dsp_rtcd_c.h @@ -0,0 +1,355 @@ +#ifndef VPX_DSP_RTCD_H_ +#define VPX_DSP_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * DSP + */ + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_convolve8 vpx_convolve8_c + +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_convolve8_avg vpx_convolve8_avg_c + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_convolve8_horiz vpx_convolve8_horiz_c + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_convolve8_vert vpx_convolve8_vert_c + +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_convolve_avg vpx_convolve_avg_c + +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_convolve_copy vpx_convolve_copy_c + +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c + +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c + +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c + +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c + +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c + +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c + +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c + +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c + +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c + +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c + +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c + +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c + +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c + +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c + +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c + +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c + +void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c + +void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c + +void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c + +void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c + +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_c + +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c + +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_c + +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c + +void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c + +void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c + +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c + +void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c + +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c + +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c + +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c + +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c + +void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c + +void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c + +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c + +void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c + +void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c + +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_c + +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_c + +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_c + +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_c + +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_c + +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_c + +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_c + +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_c + +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_c + +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_c + +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_c + +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_c + +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_c + +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_c + +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_c + +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_c + +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_c + +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_c + +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_c + +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c + +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c + +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c + +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c + +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_idct32x32_135_add vpx_idct32x32_135_add_c + +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c + +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c + +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c + +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c + +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c + +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_c + +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_c + +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_c + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_c + +void vpx_lpf_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_edge_16 vpx_lpf_horizontal_edge_16_c + +void vpx_lpf_horizontal_edge_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_edge_8 vpx_lpf_horizontal_edge_8_c + +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_c + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_c + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_c + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_c + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_c + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_c + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_2d vpx_scaled_2d_c + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c + +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_horiz vpx_scaled_horiz_c + +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_vert vpx_scaled_vert_c + +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_c + +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_c + +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_c + +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_c + +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_c + +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_c + +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_c + +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_c + +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c + +void vpx_dsp_rtcd(void); + +#ifdef RTCD_C +static void setup_rtcd_internal(void) +{ +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/thirdparty/libvpx/rtcd/vpx_dsp_rtcd_x86.h b/thirdparty/libvpx/rtcd/vpx_dsp_rtcd_x86.h new file mode 100644 index 0000000000..82574e096c --- /dev/null +++ b/thirdparty/libvpx/rtcd/vpx_dsp_rtcd_x86.h @@ -0,0 +1,614 @@ +#ifndef VPX_DSP_RTCD_H_ +#define VPX_DSP_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * DSP + */ + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c + +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c + +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c + +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c + +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c + +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c + +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c + +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c + +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c + +void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c + +void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c + +void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c + +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c + +void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c + +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c + +void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c + +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c + +void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c + +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c + +void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c + +void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c + +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c + +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c + +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_horizontal_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_horizontal_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_lpf_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_edge_16_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_edge_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_edge_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_edge_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_edge_8_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_edge_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_vertical_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c + +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_horiz vpx_scaled_horiz_c + +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_vert vpx_scaled_vert_c + +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c + +void vpx_dsp_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + vpx_convolve8 = vpx_convolve8_c; + if (flags & HAS_SSE2) vpx_convolve8 = vpx_convolve8_sse2; + if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3; + if (flags & HAS_AVX2) vpx_convolve8 = vpx_convolve8_avx2; + vpx_convolve8_avg = vpx_convolve8_avg_c; + if (flags & HAS_SSE2) vpx_convolve8_avg = vpx_convolve8_avg_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_avg = vpx_convolve8_avg_ssse3; + vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_c; + if (flags & HAS_SSE2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_ssse3; + vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_c; + if (flags & HAS_SSE2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_ssse3; + vpx_convolve8_horiz = vpx_convolve8_horiz_c; + if (flags & HAS_SSE2) vpx_convolve8_horiz = vpx_convolve8_horiz_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_horiz = vpx_convolve8_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_horiz = vpx_convolve8_horiz_avx2; + vpx_convolve8_vert = vpx_convolve8_vert_c; + if (flags & HAS_SSE2) vpx_convolve8_vert = vpx_convolve8_vert_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_vert = vpx_convolve8_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_vert = vpx_convolve8_vert_avx2; + vpx_convolve_avg = vpx_convolve_avg_c; + if (flags & HAS_SSE2) vpx_convolve_avg = vpx_convolve_avg_sse2; + vpx_convolve_copy = vpx_convolve_copy_c; + if (flags & HAS_SSE2) vpx_convolve_copy = vpx_convolve_copy_sse2; + vpx_d153_predictor_16x16 = vpx_d153_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_16x16 = vpx_d153_predictor_16x16_ssse3; + vpx_d153_predictor_32x32 = vpx_d153_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_32x32 = vpx_d153_predictor_32x32_ssse3; + vpx_d153_predictor_4x4 = vpx_d153_predictor_4x4_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_4x4 = vpx_d153_predictor_4x4_ssse3; + vpx_d153_predictor_8x8 = vpx_d153_predictor_8x8_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_8x8 = vpx_d153_predictor_8x8_ssse3; + vpx_d207_predictor_16x16 = vpx_d207_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d207_predictor_16x16 = vpx_d207_predictor_16x16_ssse3; + vpx_d207_predictor_32x32 = vpx_d207_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d207_predictor_32x32 = vpx_d207_predictor_32x32_ssse3; + vpx_d207_predictor_4x4 = vpx_d207_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_d207_predictor_4x4 = vpx_d207_predictor_4x4_sse2; + vpx_d207_predictor_8x8 = vpx_d207_predictor_8x8_c; + if (flags & HAS_SSSE3) vpx_d207_predictor_8x8 = vpx_d207_predictor_8x8_ssse3; + vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_ssse3; + vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_ssse3; + vpx_d45_predictor_4x4 = vpx_d45_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_d45_predictor_4x4 = vpx_d45_predictor_4x4_sse2; + vpx_d45_predictor_8x8 = vpx_d45_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_d45_predictor_8x8 = vpx_d45_predictor_8x8_sse2; + vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_ssse3; + vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_ssse3; + vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_ssse3; + vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_ssse3; + vpx_dc_128_predictor_16x16 = vpx_dc_128_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_dc_128_predictor_16x16 = vpx_dc_128_predictor_16x16_sse2; + vpx_dc_128_predictor_32x32 = vpx_dc_128_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_dc_128_predictor_32x32 = vpx_dc_128_predictor_32x32_sse2; + vpx_dc_128_predictor_4x4 = vpx_dc_128_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_dc_128_predictor_4x4 = vpx_dc_128_predictor_4x4_sse2; + vpx_dc_128_predictor_8x8 = vpx_dc_128_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_dc_128_predictor_8x8 = vpx_dc_128_predictor_8x8_sse2; + vpx_dc_left_predictor_16x16 = vpx_dc_left_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_dc_left_predictor_16x16 = vpx_dc_left_predictor_16x16_sse2; + vpx_dc_left_predictor_32x32 = vpx_dc_left_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_dc_left_predictor_32x32 = vpx_dc_left_predictor_32x32_sse2; + vpx_dc_left_predictor_4x4 = vpx_dc_left_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_dc_left_predictor_4x4 = vpx_dc_left_predictor_4x4_sse2; + vpx_dc_left_predictor_8x8 = vpx_dc_left_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_dc_left_predictor_8x8 = vpx_dc_left_predictor_8x8_sse2; + vpx_dc_predictor_16x16 = vpx_dc_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_dc_predictor_16x16 = vpx_dc_predictor_16x16_sse2; + vpx_dc_predictor_32x32 = vpx_dc_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_dc_predictor_32x32 = vpx_dc_predictor_32x32_sse2; + vpx_dc_predictor_4x4 = vpx_dc_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_dc_predictor_4x4 = vpx_dc_predictor_4x4_sse2; + vpx_dc_predictor_8x8 = vpx_dc_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_dc_predictor_8x8 = vpx_dc_predictor_8x8_sse2; + vpx_dc_top_predictor_16x16 = vpx_dc_top_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_dc_top_predictor_16x16 = vpx_dc_top_predictor_16x16_sse2; + vpx_dc_top_predictor_32x32 = vpx_dc_top_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_dc_top_predictor_32x32 = vpx_dc_top_predictor_32x32_sse2; + vpx_dc_top_predictor_4x4 = vpx_dc_top_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_dc_top_predictor_4x4 = vpx_dc_top_predictor_4x4_sse2; + vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_sse2; + vpx_h_predictor_16x16 = vpx_h_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_h_predictor_16x16 = vpx_h_predictor_16x16_sse2; + vpx_h_predictor_32x32 = vpx_h_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_h_predictor_32x32 = vpx_h_predictor_32x32_sse2; + vpx_h_predictor_4x4 = vpx_h_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_h_predictor_4x4 = vpx_h_predictor_4x4_sse2; + vpx_h_predictor_8x8 = vpx_h_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_h_predictor_8x8 = vpx_h_predictor_8x8_sse2; + vpx_idct16x16_10_add = vpx_idct16x16_10_add_c; + if (flags & HAS_SSE2) vpx_idct16x16_10_add = vpx_idct16x16_10_add_sse2; + vpx_idct16x16_1_add = vpx_idct16x16_1_add_c; + if (flags & HAS_SSE2) vpx_idct16x16_1_add = vpx_idct16x16_1_add_sse2; + vpx_idct16x16_256_add = vpx_idct16x16_256_add_c; + if (flags & HAS_SSE2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_sse2; + vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_c; + if (flags & HAS_SSE2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2; + vpx_idct32x32_135_add = vpx_idct32x32_135_add_c; + if (flags & HAS_SSE2) vpx_idct32x32_135_add = vpx_idct32x32_1024_add_sse2; + vpx_idct32x32_1_add = vpx_idct32x32_1_add_c; + if (flags & HAS_SSE2) vpx_idct32x32_1_add = vpx_idct32x32_1_add_sse2; + vpx_idct32x32_34_add = vpx_idct32x32_34_add_c; + if (flags & HAS_SSE2) vpx_idct32x32_34_add = vpx_idct32x32_34_add_sse2; + vpx_idct4x4_16_add = vpx_idct4x4_16_add_c; + if (flags & HAS_SSE2) vpx_idct4x4_16_add = vpx_idct4x4_16_add_sse2; + vpx_idct4x4_1_add = vpx_idct4x4_1_add_c; + if (flags & HAS_SSE2) vpx_idct4x4_1_add = vpx_idct4x4_1_add_sse2; + vpx_idct8x8_12_add = vpx_idct8x8_12_add_c; + if (flags & HAS_SSE2) vpx_idct8x8_12_add = vpx_idct8x8_12_add_sse2; + vpx_idct8x8_1_add = vpx_idct8x8_1_add_c; + if (flags & HAS_SSE2) vpx_idct8x8_1_add = vpx_idct8x8_1_add_sse2; + vpx_idct8x8_64_add = vpx_idct8x8_64_add_c; + if (flags & HAS_SSE2) vpx_idct8x8_64_add = vpx_idct8x8_64_add_sse2; + vpx_iwht4x4_16_add = vpx_iwht4x4_16_add_c; + if (flags & HAS_SSE2) vpx_iwht4x4_16_add = vpx_iwht4x4_16_add_sse2; + vpx_lpf_horizontal_4 = vpx_lpf_horizontal_4_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_4 = vpx_lpf_horizontal_4_sse2; + vpx_lpf_horizontal_4_dual = vpx_lpf_horizontal_4_dual_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_4_dual = vpx_lpf_horizontal_4_dual_sse2; + vpx_lpf_horizontal_8 = vpx_lpf_horizontal_8_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_8 = vpx_lpf_horizontal_8_sse2; + vpx_lpf_horizontal_8_dual = vpx_lpf_horizontal_8_dual_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_8_dual = vpx_lpf_horizontal_8_dual_sse2; + vpx_lpf_horizontal_edge_16 = vpx_lpf_horizontal_edge_16_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_edge_16 = vpx_lpf_horizontal_edge_16_sse2; + if (flags & HAS_AVX2) vpx_lpf_horizontal_edge_16 = vpx_lpf_horizontal_edge_16_avx2; + vpx_lpf_horizontal_edge_8 = vpx_lpf_horizontal_edge_8_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_edge_8 = vpx_lpf_horizontal_edge_8_sse2; + if (flags & HAS_AVX2) vpx_lpf_horizontal_edge_8 = vpx_lpf_horizontal_edge_8_avx2; + vpx_lpf_vertical_16 = vpx_lpf_vertical_16_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_16 = vpx_lpf_vertical_16_sse2; + vpx_lpf_vertical_16_dual = vpx_lpf_vertical_16_dual_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_16_dual = vpx_lpf_vertical_16_dual_sse2; + vpx_lpf_vertical_4 = vpx_lpf_vertical_4_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_4 = vpx_lpf_vertical_4_sse2; + vpx_lpf_vertical_4_dual = vpx_lpf_vertical_4_dual_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_4_dual = vpx_lpf_vertical_4_dual_sse2; + vpx_lpf_vertical_8 = vpx_lpf_vertical_8_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_8 = vpx_lpf_vertical_8_sse2; + vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_sse2; + vpx_scaled_2d = vpx_scaled_2d_c; + if (flags & HAS_SSSE3) vpx_scaled_2d = vpx_scaled_2d_ssse3; + vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_sse2; + vpx_tm_predictor_32x32 = vpx_tm_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_tm_predictor_32x32 = vpx_tm_predictor_32x32_sse2; + vpx_tm_predictor_4x4 = vpx_tm_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_tm_predictor_4x4 = vpx_tm_predictor_4x4_sse2; + vpx_tm_predictor_8x8 = vpx_tm_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_tm_predictor_8x8 = vpx_tm_predictor_8x8_sse2; + vpx_v_predictor_16x16 = vpx_v_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_v_predictor_16x16 = vpx_v_predictor_16x16_sse2; + vpx_v_predictor_32x32 = vpx_v_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_v_predictor_32x32 = vpx_v_predictor_32x32_sse2; + vpx_v_predictor_4x4 = vpx_v_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_v_predictor_4x4 = vpx_v_predictor_4x4_sse2; + vpx_v_predictor_8x8 = vpx_v_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_v_predictor_8x8 = vpx_v_predictor_8x8_sse2; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/thirdparty/libvpx/vp8_rtcd.h b/thirdparty/libvpx/vp8_rtcd.h new file mode 100644 index 0000000000..c5eeb5e579 --- /dev/null +++ b/thirdparty/libvpx/vp8_rtcd.h @@ -0,0 +1,9 @@ +#include "vpx_config.h" + +#if defined(WEBM_X86ASM) && (ARCH_X86 || ARCH_X86_64) + #include "rtcd/vp8_rtcd_x86.h" +#elif defined(WEBM_ARMASM) && ARCH_ARM + #include "rtcd/vp8_rtcd_arm.h" +#else + #include "rtcd/vp8_rtcd_c.h" +#endif diff --git a/thirdparty/libvpx/vp9_rtcd.h b/thirdparty/libvpx/vp9_rtcd.h new file mode 100644 index 0000000000..cf2b463d63 --- /dev/null +++ b/thirdparty/libvpx/vp9_rtcd.h @@ -0,0 +1,9 @@ +#include "vpx_config.h" + +#if defined(WEBM_X86ASM) && (ARCH_X86 || ARCH_X86_64) + #include "rtcd/vp9_rtcd_x86.h" +#elif defined(WEBM_ARMASM) && ARCH_ARM + #include "rtcd/vp9_rtcd_arm.h" +#else + #include "rtcd/vp9_rtcd_c.h" +#endif diff --git a/thirdparty/libvpx/vpx_config.asm b/thirdparty/libvpx/vpx_config.asm new file mode 100644 index 0000000000..8f2119e7bc --- /dev/null +++ b/thirdparty/libvpx/vpx_config.asm @@ -0,0 +1,65 @@ +%ifdef X86_32 + ARCH_X86 equ 1 + ARCH_X86_64 equ 0 +%elifdef X86_64 + ARCH_X86 equ 0 + ARCH_X86_64 equ 1 +%endif + +HAVE_VPX_PORTS equ 1 +CONFIG_DEPENDENCY_TRACKING equ 0 +CONFIG_EXTERNAL_BUILD equ 0 +CONFIG_INSTALL_DOCS equ 0 +CONFIG_INSTALL_BINS equ 0 +CONFIG_INSTALL_LIBS equ 0 +CONFIG_INSTALL_SRCS equ 0 +CONFIG_USE_X86INC equ 1 +CONFIG_DEBUG equ 0 +CONFIG_GPROF equ 0 +CONFIG_GCOV equ 0 +CONFIG_RVCT equ 0 +CONFIG_PIC equ 1 ;TODO: autodetect +CONFIG_CODEC_SRCS equ 0 +CONFIG_DEBUG_LIBS equ 0 +CONFIG_DEQUANT_TOKENS equ 0 +CONFIG_DC_RECON equ 0 +CONFIG_RUNTIME_CPU_DETECT equ 1 +CONFIG_POSTPROC equ 0 +CONFIG_VP9_POSTPROC equ 0 +CONFIG_MULTITHREAD equ 1 +CONFIG_INTERNAL_STATS equ 0 +CONFIG_VP8_ENCODER equ 0 +CONFIG_VP8_DECODER equ 1 +CONFIG_VP9_ENCODER equ 0 +CONFIG_VP9_DECODER equ 1 +CONFIG_VP8 equ 1 +CONFIG_VP9 equ 1 +CONFIG_ENCODERS equ 0 +CONFIG_DECODERS equ 1 +CONFIG_STATIC_MSVCRT equ 0 +CONFIG_SPATIAL_RESAMPLING equ 0 +CONFIG_REALTIME_ONLY equ 0 +CONFIG_ONTHEFLY_BITPACKING equ 0 +CONFIG_ERROR_CONCEALMENT equ 0 +CONFIG_SHARED equ 0 +CONFIG_STATIC equ 0 +CONFIG_SMALL equ 0 +CONFIG_POSTPROC_VISUALIZER equ 0 +CONFIG_OS_SUPPORT equ 1 +CONFIG_UNIT_TESTS equ 0 +CONFIG_WEBM_IO equ 0 +CONFIG_LIBYUV equ 0 +CONFIG_DECODE_PERF_TESTS equ 0 +CONFIG_ENCODE_PERF_TESTS equ 0 +CONFIG_MULTI_RES_ENCODING equ 0 +CONFIG_TEMPORAL_DENOISING equ 1 +CONFIG_VP9_TEMPORAL_DENOISING equ 0 +CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 +CONFIG_VP9_HIGHBITDEPTH equ 0 +CONFIG_BETTER_HW_COMPATIBILITY equ 0 +CONFIG_EXPERIMENTAL equ 0 +CONFIG_SIZE_LIMIT equ 0 +CONFIG_SPATIAL_SVC equ 0 +CONFIG_FP_MB_STATS equ 0 +CONFIG_EMULATE_HARDWARE equ 0 +CONFIG_MISC_FIXES equ 0 diff --git a/thirdparty/libvpx/vpx_config.h b/thirdparty/libvpx/vpx_config.h new file mode 100644 index 0000000000..7058f0327b --- /dev/null +++ b/thirdparty/libvpx/vpx_config.h @@ -0,0 +1,124 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +/* This file automatically generated by configure. Do not edit! */ +#ifndef VPX_CONFIG_H +#define VPX_CONFIG_H +#define RESTRICT +#define INLINE inline + +#define HAVE_MIPS32 0 +#define HAVE_MEDIA 0 + +#if defined(__i386) || defined(__i386__) || defined(_M_IX86) + #define ARCH_X86 1 + #define ARCH_X86_64 0 + + #define ARCH_ARM 0 + #define HAVE_NEON 0 + #define HAVE_NEON_ASM 0 + + #define HAVE_MMX 1 + #define HAVE_SSE2 1 + #define HAVE_SSSE3 1 + #define HAVE_AVX2 1 +#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64) + #define ARCH_X86 0 + #define ARCH_X86_64 1 + + #define ARCH_ARM 0 + #define HAVE_NEON 0 + #define HAVE_NEON_ASM 0 + + #define HAVE_MMX 1 + #define HAVE_SSE2 1 + #define HAVE_SSSE3 1 + #define HAVE_AVX2 1 +#elif defined(__arm__) || defined(__TARGET_ARCH_ARM) || defined(_M_ARM) || defined(__aarch64__) + #define ARCH_X86 0 + #define ARCH_X86_64 0 + + #define ARCH_ARM 1 + #define HAVE_NEON 1 + #define HAVE_NEON_ASM 1 +#else + #define ARCH_X86 0 + #define ARCH_X86_64 0 + + #define ARCH_ARM 0 + #define HAVE_NEON 0 + #define HAVE_NEON_ASM 0 +#endif + +#define CONFIG_BIG_ENDIAN 0 //TODO: Autodetect + +#ifdef _WIN32 + #define HAVE_PTHREAD_H 0 + #define HAVE_UNISTD_H 0 +#else + #define HAVE_PTHREAD_H 1 + #define HAVE_UNISTD_H 1 +#endif + +/**/ + +#define HAVE_VPX_PORTS 1 +#define CONFIG_DEPENDENCY_TRACKING 0 +#define CONFIG_EXTERNAL_BUILD 0 +#define CONFIG_INSTALL_DOCS 0 +#define CONFIG_INSTALL_BINS 0 +#define CONFIG_INSTALL_LIBS 0 +#define CONFIG_INSTALL_SRCS 0 +#define CONFIG_DEBUG 0 +#define CONFIG_GPROF 0 +#define CONFIG_GCOV 0 +#define CONFIG_RVCT 0 +#define CONFIG_CODEC_SRCS 0 +#define CONFIG_DEBUG_LIBS 0 +#define CONFIG_DEQUANT_TOKENS 0 +#define CONFIG_DC_RECON 0 +#define CONFIG_RUNTIME_CPU_DETECT 1 +#define CONFIG_POSTPROC 0 +#define CONFIG_VP9_POSTPROC 0 +#define CONFIG_MULTITHREAD 1 +#define CONFIG_INTERNAL_STATS 0 +#define CONFIG_VP8_ENCODER 0 +#define CONFIG_VP8_DECODER 1 +#define CONFIG_VP9_ENCODER 0 +#define CONFIG_VP9_DECODER 1 +#define CONFIG_VP8 1 +#define CONFIG_VP9 1 +#define CONFIG_ENCODERS 0 +#define CONFIG_DECODERS 1 +#define CONFIG_STATIC_MSVCRT 0 +#define CONFIG_SPATIAL_RESAMPLING 0 +#define CONFIG_REALTIME_ONLY 0 +#define CONFIG_ONTHEFLY_BITPACKING 0 +#define CONFIG_ERROR_CONCEALMENT 0 +#define CONFIG_SHARED 0 +#define CONFIG_STATIC 0 +#define CONFIG_SMALL 0 +#define CONFIG_POSTPROC_VISUALIZER 0 +#define CONFIG_OS_SUPPORT 1 +#define CONFIG_UNIT_TESTS 0 +#define CONFIG_WEBM_IO 0 +#define CONFIG_LIBYUV 0 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 +#define CONFIG_MULTI_RES_ENCODING 0 +#define CONFIG_TEMPORAL_DENOISING 0 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_BETTER_HW_COMPATIBILITY 0 +#define CONFIG_EXPERIMENTAL 0 +#define CONFIG_SIZE_LIMIT 0 +#define CONFIG_SPATIAL_SVC 0 +#define CONFIG_FP_MB_STATS 0 +#define CONFIG_EMULATE_HARDWARE 0 +#define CONFIG_MISC_FIXES 0 +#endif /* VPX_CONFIG_H */ diff --git a/thirdparty/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm b/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/intrapred_neon_asm.asm index 115790d480..b2846c410b 100644 --- a/thirdparty/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm +++ b/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/intrapred_neon_asm.asm @@ -1,3 +1,5 @@ +; This file was created from a .asm file +; using the ads2armasm_ms.pl script. ; ; Copyright (c) 2014 The WebM project authors. All Rights Reserved. ; @@ -20,11 +22,10 @@ EXPORT |vpx_tm_predictor_8x8_neon| EXPORT |vpx_tm_predictor_16x16_neon| EXPORT |vpx_tm_predictor_32x32_neon| - ARM - REQUIRE8 - PRESERVE8 + + - AREA ||.text||, CODE, READONLY, ALIGN=2 + AREA |.text|, CODE, READONLY, ALIGN=2 ;void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, ; const uint8_t *above, @@ -42,6 +43,7 @@ vst1.32 {d0[0]}, [r0], r1 bx lr ENDP ; |vpx_v_predictor_4x4_neon| + ALIGN 4 ;void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, ; const uint8_t *above, @@ -63,6 +65,7 @@ vst1.8 {d0}, [r0], r1 bx lr ENDP ; |vpx_v_predictor_8x8_neon| + ALIGN 4 ;void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, ; const uint8_t *above, @@ -92,6 +95,7 @@ vst1.8 {q0}, [r0], r1 bx lr ENDP ; |vpx_v_predictor_16x16_neon| + ALIGN 4 ;void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, ; const uint8_t *above, @@ -125,6 +129,7 @@ loop_v bgt loop_v bx lr ENDP ; |vpx_v_predictor_32x32_neon| + ALIGN 4 ;void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, ; const uint8_t *above, @@ -146,6 +151,7 @@ loop_v vst1.32 {d0[0]}, [r0], r1 bx lr ENDP ; |vpx_h_predictor_4x4_neon| + ALIGN 4 ;void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, ; const uint8_t *above, @@ -175,6 +181,7 @@ loop_v vst1.64 {d0}, [r0], r1 bx lr ENDP ; |vpx_h_predictor_8x8_neon| + ALIGN 4 ;void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, ; const uint8_t *above, @@ -220,6 +227,7 @@ loop_v vst1.8 {q0}, [r0], r1 bx lr ENDP ; |vpx_h_predictor_16x16_neon| + ALIGN 4 ;void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, ; const uint8_t *above, @@ -286,6 +294,7 @@ loop_h bgt loop_h bx lr ENDP ; |vpx_h_predictor_32x32_neon| + ALIGN 4 ;void vpx_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride, ; const uint8_t *above, @@ -332,6 +341,7 @@ loop_h vst1.32 {d1[0]}, [r0], r1 bx lr ENDP ; |vpx_tm_predictor_4x4_neon| + ALIGN 4 ;void vpx_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride, ; const uint8_t *above, @@ -404,6 +414,7 @@ loop_h bx lr ENDP ; |vpx_tm_predictor_8x8_neon| + ALIGN 4 ;void vpx_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride, ; const uint8_t *above, @@ -497,6 +508,7 @@ loop_16x16_neon bx lr ENDP ; |vpx_tm_predictor_16x16_neon| + ALIGN 4 ;void vpx_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride, ; const uint8_t *above, @@ -626,5 +638,6 @@ loop_32x32_neon bx lr ENDP ; |vpx_tm_predictor_32x32_neon| + ALIGN 4 END diff --git a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_mb_neon.asm b/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/loopfilter_mb_neon.asm index d5da7a8409..9c3736faf8 100644 --- a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_mb_neon.asm +++ b/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/loopfilter_mb_neon.asm @@ -1,3 +1,5 @@ +; This file was created from a .asm file +; using the ads2armasm_ms.pl script. ; ; Copyright (c) 2013 The WebM project authors. All Rights Reserved. ; @@ -11,9 +13,8 @@ EXPORT |vpx_lpf_horizontal_edge_8_neon| EXPORT |vpx_lpf_horizontal_edge_16_neon| EXPORT |vpx_lpf_vertical_16_neon| - ARM - AREA ||.text||, CODE, READONLY, ALIGN=2 + AREA |.text|, CODE, READONLY, ALIGN=2 ; void mb_lpf_horizontal_edge(uint8_t *s, int p, ; const uint8_t *blimit, @@ -117,6 +118,7 @@ h_next pop {r4-r8, pc} ENDP ; |mb_lpf_horizontal_edge| + ALIGN 4 ; void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch, ; const uint8_t *blimit, @@ -131,6 +133,7 @@ h_next mov r12, #1 b mb_lpf_horizontal_edge ENDP ; |vpx_lpf_horizontal_edge_8_neon| + ALIGN 4 ; void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch, ; const uint8_t *blimit, @@ -145,6 +148,7 @@ h_next mov r12, #2 b mb_lpf_horizontal_edge ENDP ; |vpx_lpf_horizontal_edge_16_neon| + ALIGN 4 ; void vpx_lpf_vertical_16_neon(uint8_t *s, int p, ; const uint8_t *blimit, @@ -309,6 +313,7 @@ v_end pop {r4-r8, pc} ENDP ; |vpx_lpf_vertical_16_neon| + ALIGN 4 ; void vpx_wide_mbfilter_neon(); ; This is a helper function for the loopfilters. The invidual functions do the @@ -631,5 +636,6 @@ v_end bx lr ENDP ; |vpx_wide_mbfilter_neon| + ALIGN 4 END diff --git a/thirdparty/libvpx/vpx_dsp/arm/save_reg_neon.asm b/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/save_reg_neon.asm index c9ca10801d..4cf9988e65 100644 --- a/thirdparty/libvpx/vpx_dsp/arm/save_reg_neon.asm +++ b/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/save_reg_neon.asm @@ -1,3 +1,5 @@ +; This file was created from a .asm file +; using the ads2armasm_ms.pl script. ; ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. ; @@ -12,11 +14,10 @@ EXPORT |vpx_push_neon| EXPORT |vpx_pop_neon| - ARM - REQUIRE8 - PRESERVE8 + + - AREA ||.text||, CODE, READONLY, ALIGN=2 + AREA |.text|, CODE, READONLY, ALIGN=2 |vpx_push_neon| PROC vst1.i64 {d8, d9, d10, d11}, [r0]! @@ -24,6 +25,7 @@ bx lr ENDP + ALIGN 4 |vpx_pop_neon| PROC vld1.i64 {d8, d9, d10, d11}, [r0]! @@ -31,6 +33,7 @@ bx lr ENDP + ALIGN 4 END diff --git a/thirdparty/libvpx/vpx_dsp/arm/gas/intrapred_neon_asm.s b/thirdparty/libvpx/vpx_dsp/arm/gas/intrapred_neon_asm.s new file mode 100644 index 0000000000..3932227fc5 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/arm/gas/intrapred_neon_asm.s @@ -0,0 +1,658 @@ +@ This file was created from a .asm file +@ using the ads2gas.pl script. + .equ DO1STROUNDING, 0 +@ +@ Copyright (c) 2014 The WebM project authors. All Rights Reserved. +@ +@ Use of this source code is governed by a BSD-style license +@ that can be found in the LICENSE file in the root of the source +@ tree. An additional intellectual property rights grant can be found +@ in the file PATENTS. All contributing project authors may +@ be found in the AUTHORS file in the root of the source tree. +@ + + .global vpx_v_predictor_4x4_neon + .type vpx_v_predictor_4x4_neon, function + .global vpx_v_predictor_8x8_neon + .type vpx_v_predictor_8x8_neon, function + .global vpx_v_predictor_16x16_neon + .type vpx_v_predictor_16x16_neon, function + .global vpx_v_predictor_32x32_neon + .type vpx_v_predictor_32x32_neon, function + .global vpx_h_predictor_4x4_neon + .type vpx_h_predictor_4x4_neon, function + .global vpx_h_predictor_8x8_neon + .type vpx_h_predictor_8x8_neon, function + .global vpx_h_predictor_16x16_neon + .type vpx_h_predictor_16x16_neon, function + .global vpx_h_predictor_32x32_neon + .type vpx_h_predictor_32x32_neon, function + .global vpx_tm_predictor_4x4_neon + .type vpx_tm_predictor_4x4_neon, function + .global vpx_tm_predictor_8x8_neon + .type vpx_tm_predictor_8x8_neon, function + .global vpx_tm_predictor_16x16_neon + .type vpx_tm_predictor_16x16_neon, function + .global vpx_tm_predictor_32x32_neon + .type vpx_tm_predictor_32x32_neon, function + .arm + .eabi_attribute 24, 1 @Tag_ABI_align_needed + .eabi_attribute 25, 1 @Tag_ABI_align_preserved + +.text +.p2align 2 + +@void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, +@ const uint8_t *above, +@ const uint8_t *left) +@ r0 uint8_t *dst +@ r1 ptrdiff_t y_stride +@ r2 const uint8_t *above +@ r3 const uint8_t *left + +_vpx_v_predictor_4x4_neon: + vpx_v_predictor_4x4_neon: @ PROC + vld1.32 {d0[0]}, [r2] + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + bx lr + .size vpx_v_predictor_4x4_neon, .-vpx_v_predictor_4x4_neon @ ENDP @ |vpx_v_predictor_4x4_neon| + +@void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, +@ const uint8_t *above, +@ const uint8_t *left) +@ r0 uint8_t *dst +@ r1 ptrdiff_t y_stride +@ r2 const uint8_t *above +@ r3 const uint8_t *left + +_vpx_v_predictor_8x8_neon: + vpx_v_predictor_8x8_neon: @ PROC + vld1.8 {d0}, [r2] + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + bx lr + .size vpx_v_predictor_8x8_neon, .-vpx_v_predictor_8x8_neon @ ENDP @ |vpx_v_predictor_8x8_neon| + +@void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, +@ const uint8_t *above, +@ const uint8_t *left) +@ r0 uint8_t *dst +@ r1 ptrdiff_t y_stride +@ r2 const uint8_t *above +@ r3 const uint8_t *left + +_vpx_v_predictor_16x16_neon: + vpx_v_predictor_16x16_neon: @ PROC + vld1.8 {q0}, [r2] + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + bx lr + .size vpx_v_predictor_16x16_neon, .-vpx_v_predictor_16x16_neon @ ENDP @ |vpx_v_predictor_16x16_neon| + +@void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, +@ const uint8_t *above, +@ const uint8_t *left) +@ r0 uint8_t *dst +@ r1 ptrdiff_t y_stride +@ r2 const uint8_t *above +@ r3 const uint8_t *left + +_vpx_v_predictor_32x32_neon: + vpx_v_predictor_32x32_neon: @ PROC + vld1.8 {q0, q1}, [r2] + mov r2, #2 +loop_v: + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + subs r2, r2, #1 + bgt loop_v + bx lr + .size vpx_v_predictor_32x32_neon, .-vpx_v_predictor_32x32_neon @ ENDP @ |vpx_v_predictor_32x32_neon| + +@void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, +@ const uint8_t *above, +@ const uint8_t *left) +@ r0 uint8_t *dst +@ r1 ptrdiff_t y_stride +@ r2 const uint8_t *above +@ r3 const uint8_t *left + +_vpx_h_predictor_4x4_neon: + vpx_h_predictor_4x4_neon: @ PROC + vld1.32 {d1[0]}, [r3] + vdup.8 d0, d1[0] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[1] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[2] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[3] + vst1.32 {d0[0]}, [r0], r1 + bx lr + .size vpx_h_predictor_4x4_neon, .-vpx_h_predictor_4x4_neon @ ENDP @ |vpx_h_predictor_4x4_neon| + +@void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, +@ const uint8_t *above, +@ const uint8_t *left) +@ r0 uint8_t *dst +@ r1 ptrdiff_t y_stride +@ r2 const uint8_t *above +@ r3 const uint8_t *left + +_vpx_h_predictor_8x8_neon: + vpx_h_predictor_8x8_neon: @ PROC + vld1.64 {d1}, [r3] + vdup.8 d0, d1[0] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[1] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[2] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[3] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[4] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[5] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[6] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[7] + vst1.64 {d0}, [r0], r1 + bx lr + .size vpx_h_predictor_8x8_neon, .-vpx_h_predictor_8x8_neon @ ENDP @ |vpx_h_predictor_8x8_neon| + +@void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, +@ const uint8_t *above, +@ const uint8_t *left) +@ r0 uint8_t *dst +@ r1 ptrdiff_t y_stride +@ r2 const uint8_t *above +@ r3 const uint8_t *left + +_vpx_h_predictor_16x16_neon: + vpx_h_predictor_16x16_neon: @ PROC + vld1.8 {q1}, [r3] + vdup.8 q0, d2[0] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[1] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[2] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[3] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[4] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[5] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[6] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[7] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[0] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[1] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[2] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[3] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[4] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[5] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[6] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[7] + vst1.8 {q0}, [r0], r1 + bx lr + .size vpx_h_predictor_16x16_neon, .-vpx_h_predictor_16x16_neon @ ENDP @ |vpx_h_predictor_16x16_neon| + +@void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, +@ const uint8_t *above, +@ const uint8_t *left) +@ r0 uint8_t *dst +@ r1 ptrdiff_t y_stride +@ r2 const uint8_t *above +@ r3 const uint8_t *left + +_vpx_h_predictor_32x32_neon: + vpx_h_predictor_32x32_neon: @ PROC + sub r1, r1, #16 + mov r2, #2 +loop_h: + vld1.8 {q1}, [r3]! + vdup.8 q0, d2[0] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[1] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[2] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[3] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[4] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[5] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[6] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[7] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[0] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[1] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[2] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[3] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[4] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[5] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[6] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[7] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + subs r2, r2, #1 + bgt loop_h + bx lr + .size vpx_h_predictor_32x32_neon, .-vpx_h_predictor_32x32_neon @ ENDP @ |vpx_h_predictor_32x32_neon| + +@void vpx_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride, +@ const uint8_t *above, +@ const uint8_t *left) +@ r0 uint8_t *dst +@ r1 ptrdiff_t y_stride +@ r2 const uint8_t *above +@ r3 const uint8_t *left + +_vpx_tm_predictor_4x4_neon: + vpx_tm_predictor_4x4_neon: @ PROC + @ Load ytop_left = above[-1]; + sub r12, r2, #1 + vld1.u8 {d0[]}, [r12] + + @ Load above 4 pixels + vld1.32 {d2[0]}, [r2] + + @ Compute above - ytop_left + vsubl.u8 q3, d2, d0 + + @ Load left row by row and compute left + (above - ytop_left) + @ 1st row and 2nd row + vld1.u8 {d2[]}, [r3]! + vld1.u8 {d4[]}, [r3]! + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + + @ 3rd row and 4th row + vld1.u8 {d2[]}, [r3]! + vld1.u8 {d4[]}, [r3] + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + bx lr + .size vpx_tm_predictor_4x4_neon, .-vpx_tm_predictor_4x4_neon @ ENDP @ |vpx_tm_predictor_4x4_neon| + +@void vpx_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride, +@ const uint8_t *above, +@ const uint8_t *left) +@ r0 uint8_t *dst +@ r1 ptrdiff_t y_stride +@ r2 const uint8_t *above +@ r3 const uint8_t *left + +_vpx_tm_predictor_8x8_neon: + vpx_tm_predictor_8x8_neon: @ PROC + @ Load ytop_left = above[-1]; + sub r12, r2, #1 + vld1.8 {d0[]}, [r12] + + @ preload 8 left + vld1.8 {d30}, [r3] + + @ Load above 8 pixels + vld1.64 {d2}, [r2] + + vmovl.u8 q10, d30 + + @ Compute above - ytop_left + vsubl.u8 q3, d2, d0 + + @ Load left row by row and compute left + (above - ytop_left) + @ 1st row and 2nd row + vdup.16 q0, d20[0] + vdup.16 q1, d20[1] + vadd.s16 q0, q3, q0 + vadd.s16 q1, q3, q1 + + @ 3rd row and 4th row + vdup.16 q8, d20[2] + vdup.16 q9, d20[3] + vadd.s16 q8, q3, q8 + vadd.s16 q9, q3, q9 + + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 + + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + vst1.64 {d2}, [r0], r1 + vst1.64 {d3}, [r0], r1 + + @ 5th row and 6th row + vdup.16 q0, d21[0] + vdup.16 q1, d21[1] + vadd.s16 q0, q3, q0 + vadd.s16 q1, q3, q1 + + @ 7th row and 8th row + vdup.16 q8, d21[2] + vdup.16 q9, d21[3] + vadd.s16 q8, q3, q8 + vadd.s16 q9, q3, q9 + + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 + + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + vst1.64 {d2}, [r0], r1 + vst1.64 {d3}, [r0], r1 + + bx lr + .size vpx_tm_predictor_8x8_neon, .-vpx_tm_predictor_8x8_neon @ ENDP @ |vpx_tm_predictor_8x8_neon| + +@void vpx_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride, +@ const uint8_t *above, +@ const uint8_t *left) +@ r0 uint8_t *dst +@ r1 ptrdiff_t y_stride +@ r2 const uint8_t *above +@ r3 const uint8_t *left + +_vpx_tm_predictor_16x16_neon: + vpx_tm_predictor_16x16_neon: @ PROC + @ Load ytop_left = above[-1]; + sub r12, r2, #1 + vld1.8 {d0[]}, [r12] + + @ Load above 8 pixels + vld1.8 {q1}, [r2] + + @ preload 8 left into r12 + vld1.8 {d18}, [r3]! + + @ Compute above - ytop_left + vsubl.u8 q2, d2, d0 + vsubl.u8 q3, d3, d0 + + vmovl.u8 q10, d18 + + @ Load left row by row and compute left + (above - ytop_left) + @ Process 8 rows in each single loop and loop 2 times to process 16 rows. + mov r2, #2 + +loop_16x16_neon: + @ Process two rows. + vdup.16 q0, d20[0] + vdup.16 q8, d20[1] + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d20[2] @ proload next 2 rows data + vdup.16 q8, d20[3] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + @ Process two rows. + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d21[0] @ proload next 2 rows data + vdup.16 q8, d21[1] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d21[2] @ proload next 2 rows data + vdup.16 q8, d21[3] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vld1.8 {d18}, [r3]! @ preload 8 left into r12 + vmovl.u8 q10, d18 + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + subs r2, r2, #1 + bgt loop_16x16_neon + + bx lr + .size vpx_tm_predictor_16x16_neon, .-vpx_tm_predictor_16x16_neon @ ENDP @ |vpx_tm_predictor_16x16_neon| + +@void vpx_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride, +@ const uint8_t *above, +@ const uint8_t *left) +@ r0 uint8_t *dst +@ r1 ptrdiff_t y_stride +@ r2 const uint8_t *above +@ r3 const uint8_t *left + +_vpx_tm_predictor_32x32_neon: + vpx_tm_predictor_32x32_neon: @ PROC + @ Load ytop_left = above[-1]; + sub r12, r2, #1 + vld1.8 {d0[]}, [r12] + + @ Load above 32 pixels + vld1.8 {q1}, [r2]! + vld1.8 {q2}, [r2] + + @ preload 8 left pixels + vld1.8 {d26}, [r3]! + + @ Compute above - ytop_left + vsubl.u8 q8, d2, d0 + vsubl.u8 q9, d3, d0 + vsubl.u8 q10, d4, d0 + vsubl.u8 q11, d5, d0 + + vmovl.u8 q3, d26 + + @ Load left row by row and compute left + (above - ytop_left) + @ Process 8 rows in each single loop and loop 4 times to process 32 rows. + mov r2, #4 + +loop_32x32_neon: + @ Process two rows. + vdup.16 q0, d6[0] + vdup.16 q2, d6[1] + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q1, d6[2] + vdup.16 q2, d6[3] + vst1.64 {d24-d27}, [r0], r1 + + @ Process two rows. + vadd.s16 q12, q1, q8 + vadd.s16 q13, q1, q9 + vadd.s16 q14, q1, q10 + vadd.s16 q15, q1, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q0, d7[0] + vdup.16 q2, d7[1] + vst1.64 {d24-d27}, [r0], r1 + + @ Process two rows. + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q0, d7[2] + vdup.16 q2, d7[3] + vst1.64 {d24-d27}, [r0], r1 + + @ Process two rows. + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vld1.8 {d0}, [r3]! @ preload 8 left pixels + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vmovl.u8 q3, d0 + vst1.64 {d24-d27}, [r0], r1 + + subs r2, r2, #1 + bgt loop_32x32_neon + + bx lr + .size vpx_tm_predictor_32x32_neon, .-vpx_tm_predictor_32x32_neon @ ENDP @ |vpx_tm_predictor_32x32_neon| + + .section .note.GNU-stack,"",%progbits diff --git a/thirdparty/libvpx/vpx_dsp/arm/gas/loopfilter_mb_neon.s b/thirdparty/libvpx/vpx_dsp/arm/gas/loopfilter_mb_neon.s new file mode 100644 index 0000000000..f6b05406fb --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/arm/gas/loopfilter_mb_neon.s @@ -0,0 +1,647 @@ +@ This file was created from a .asm file +@ using the ads2gas.pl script. + .equ DO1STROUNDING, 0 +@ +@ Copyright (c) 2013 The WebM project authors. All Rights Reserved. +@ +@ Use of this source code is governed by a BSD-style license +@ that can be found in the LICENSE file in the root of the source +@ tree. An additional intellectual property rights grant can be found +@ in the file PATENTS. All contributing project authors may +@ be found in the AUTHORS file in the root of the source tree. +@ + + .global vpx_lpf_horizontal_edge_8_neon + .type vpx_lpf_horizontal_edge_8_neon, function + .global vpx_lpf_horizontal_edge_16_neon + .type vpx_lpf_horizontal_edge_16_neon, function + .global vpx_lpf_vertical_16_neon + .type vpx_lpf_vertical_16_neon, function + .arm + +.text +.p2align 2 + +@ void mb_lpf_horizontal_edge(uint8_t *s, int p, +@ const uint8_t *blimit, +@ const uint8_t *limit, +@ const uint8_t *thresh, +@ int count) +@ r0 uint8_t *s, +@ r1 int p, /* pitch */ +@ r2 const uint8_t *blimit, +@ r3 const uint8_t *limit, +@ sp const uint8_t *thresh, +@ r12 int count +_mb_lpf_horizontal_edge: + mb_lpf_horizontal_edge: @ PROC + push {r4-r8, lr} + vpush {d8-d15} + ldr r4, [sp, #88] @ load thresh + +h_count: + vld1.8 {d16[]}, [r2] @ load *blimit + vld1.8 {d17[]}, [r3] @ load *limit + vld1.8 {d18[]}, [r4] @ load *thresh + + sub r8, r0, r1, lsl #3 @ move src pointer down by 8 lines + + vld1.u8 {d0}, [r8,:64], r1 @ p7 + vld1.u8 {d1}, [r8,:64], r1 @ p6 + vld1.u8 {d2}, [r8,:64], r1 @ p5 + vld1.u8 {d3}, [r8,:64], r1 @ p4 + vld1.u8 {d4}, [r8,:64], r1 @ p3 + vld1.u8 {d5}, [r8,:64], r1 @ p2 + vld1.u8 {d6}, [r8,:64], r1 @ p1 + vld1.u8 {d7}, [r8,:64], r1 @ p0 + vld1.u8 {d8}, [r8,:64], r1 @ q0 + vld1.u8 {d9}, [r8,:64], r1 @ q1 + vld1.u8 {d10}, [r8,:64], r1 @ q2 + vld1.u8 {d11}, [r8,:64], r1 @ q3 + vld1.u8 {d12}, [r8,:64], r1 @ q4 + vld1.u8 {d13}, [r8,:64], r1 @ q5 + vld1.u8 {d14}, [r8,:64], r1 @ q6 + vld1.u8 {d15}, [r8,:64], r1 @ q7 + + bl vpx_wide_mbfilter_neon + + tst r7, #1 + beq h_mbfilter + + @ flat && mask were not set for any of the channels. Just store the values + @ from filter. + sub r8, r0, r1, lsl #1 + + vst1.u8 {d25}, [r8,:64], r1 @ store op1 + vst1.u8 {d24}, [r8,:64], r1 @ store op0 + vst1.u8 {d23}, [r8,:64], r1 @ store oq0 + vst1.u8 {d26}, [r8,:64], r1 @ store oq1 + + b h_next + +h_mbfilter: + tst r7, #2 + beq h_wide_mbfilter + + @ flat2 was not set for any of the channels. Just store the values from + @ mbfilter. + sub r8, r0, r1, lsl #1 + sub r8, r8, r1 + + vst1.u8 {d18}, [r8,:64], r1 @ store op2 + vst1.u8 {d19}, [r8,:64], r1 @ store op1 + vst1.u8 {d20}, [r8,:64], r1 @ store op0 + vst1.u8 {d21}, [r8,:64], r1 @ store oq0 + vst1.u8 {d22}, [r8,:64], r1 @ store oq1 + vst1.u8 {d23}, [r8,:64], r1 @ store oq2 + + b h_next + +h_wide_mbfilter: + sub r8, r0, r1, lsl #3 + add r8, r8, r1 + + vst1.u8 {d16}, [r8,:64], r1 @ store op6 + vst1.u8 {d24}, [r8,:64], r1 @ store op5 + vst1.u8 {d25}, [r8,:64], r1 @ store op4 + vst1.u8 {d26}, [r8,:64], r1 @ store op3 + vst1.u8 {d27}, [r8,:64], r1 @ store op2 + vst1.u8 {d18}, [r8,:64], r1 @ store op1 + vst1.u8 {d19}, [r8,:64], r1 @ store op0 + vst1.u8 {d20}, [r8,:64], r1 @ store oq0 + vst1.u8 {d21}, [r8,:64], r1 @ store oq1 + vst1.u8 {d22}, [r8,:64], r1 @ store oq2 + vst1.u8 {d23}, [r8,:64], r1 @ store oq3 + vst1.u8 {d1}, [r8,:64], r1 @ store oq4 + vst1.u8 {d2}, [r8,:64], r1 @ store oq5 + vst1.u8 {d3}, [r8,:64], r1 @ store oq6 + +h_next: + add r0, r0, #8 + subs r12, r12, #1 + bne h_count + + vpop {d8-d15} + pop {r4-r8, pc} + + .size mb_lpf_horizontal_edge, .-mb_lpf_horizontal_edge @ ENDP @ |mb_lpf_horizontal_edge| + +@ void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch, +@ const uint8_t *blimit, +@ const uint8_t *limit, +@ const uint8_t *thresh) +@ r0 uint8_t *s, +@ r1 int pitch, +@ r2 const uint8_t *blimit, +@ r3 const uint8_t *limit, +@ sp const uint8_t *thresh +_vpx_lpf_horizontal_edge_8_neon: + vpx_lpf_horizontal_edge_8_neon: @ PROC + mov r12, #1 + b mb_lpf_horizontal_edge + .size vpx_lpf_horizontal_edge_8_neon, .-vpx_lpf_horizontal_edge_8_neon @ ENDP @ |vpx_lpf_horizontal_edge_8_neon| + +@ void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch, +@ const uint8_t *blimit, +@ const uint8_t *limit, +@ const uint8_t *thresh) +@ r0 uint8_t *s, +@ r1 int pitch, +@ r2 const uint8_t *blimit, +@ r3 const uint8_t *limit, +@ sp const uint8_t *thresh +_vpx_lpf_horizontal_edge_16_neon: + vpx_lpf_horizontal_edge_16_neon: @ PROC + mov r12, #2 + b mb_lpf_horizontal_edge + .size vpx_lpf_horizontal_edge_16_neon, .-vpx_lpf_horizontal_edge_16_neon @ ENDP @ |vpx_lpf_horizontal_edge_16_neon| + +@ void vpx_lpf_vertical_16_neon(uint8_t *s, int p, +@ const uint8_t *blimit, +@ const uint8_t *limit, +@ const uint8_t *thresh) +@ r0 uint8_t *s, +@ r1 int p, /* pitch */ +@ r2 const uint8_t *blimit, +@ r3 const uint8_t *limit, +@ sp const uint8_t *thresh, +_vpx_lpf_vertical_16_neon: + vpx_lpf_vertical_16_neon: @ PROC + push {r4-r8, lr} + vpush {d8-d15} + ldr r4, [sp, #88] @ load thresh + + vld1.8 {d16[]}, [r2] @ load *blimit + vld1.8 {d17[]}, [r3] @ load *limit + vld1.8 {d18[]}, [r4] @ load *thresh + + sub r8, r0, #8 + + vld1.8 {d0}, [r8,:64], r1 + vld1.8 {d8}, [r0,:64], r1 + vld1.8 {d1}, [r8,:64], r1 + vld1.8 {d9}, [r0,:64], r1 + vld1.8 {d2}, [r8,:64], r1 + vld1.8 {d10}, [r0,:64], r1 + vld1.8 {d3}, [r8,:64], r1 + vld1.8 {d11}, [r0,:64], r1 + vld1.8 {d4}, [r8,:64], r1 + vld1.8 {d12}, [r0,:64], r1 + vld1.8 {d5}, [r8,:64], r1 + vld1.8 {d13}, [r0,:64], r1 + vld1.8 {d6}, [r8,:64], r1 + vld1.8 {d14}, [r0,:64], r1 + vld1.8 {d7}, [r8,:64], r1 + vld1.8 {d15}, [r0,:64], r1 + + sub r0, r0, r1, lsl #3 + + vtrn.32 q0, q2 + vtrn.32 q1, q3 + vtrn.32 q4, q6 + vtrn.32 q5, q7 + + vtrn.16 q0, q1 + vtrn.16 q2, q3 + vtrn.16 q4, q5 + vtrn.16 q6, q7 + + vtrn.8 d0, d1 + vtrn.8 d2, d3 + vtrn.8 d4, d5 + vtrn.8 d6, d7 + + vtrn.8 d8, d9 + vtrn.8 d10, d11 + vtrn.8 d12, d13 + vtrn.8 d14, d15 + + bl vpx_wide_mbfilter_neon + + tst r7, #1 + beq v_mbfilter + + @ flat && mask were not set for any of the channels. Just store the values + @ from filter. + sub r8, r0, #2 + + vswp d23, d25 + + vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r8], r1 + vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r8], r1 + vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r8], r1 + vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r8], r1 + vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r8], r1 + vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r8], r1 + vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r8], r1 + vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r8], r1 + + b v_end + +v_mbfilter: + tst r7, #2 + beq v_wide_mbfilter + + @ flat2 was not set for any of the channels. Just store the values from + @ mbfilter. + sub r8, r0, #3 + + vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1 + vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1 + vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1 + vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1 + vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1 + vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1 + vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1 + vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1 + vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1 + vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1 + vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1 + vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1 + vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1 + vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1 + vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1 + vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1 + + b v_end + +v_wide_mbfilter: + sub r8, r0, #8 + + vtrn.32 d0, d26 + vtrn.32 d16, d27 + vtrn.32 d24, d18 + vtrn.32 d25, d19 + + vtrn.16 d0, d24 + vtrn.16 d16, d25 + vtrn.16 d26, d18 + vtrn.16 d27, d19 + + vtrn.8 d0, d16 + vtrn.8 d24, d25 + vtrn.8 d26, d27 + vtrn.8 d18, d19 + + vtrn.32 d20, d1 + vtrn.32 d21, d2 + vtrn.32 d22, d3 + vtrn.32 d23, d15 + + vtrn.16 d20, d22 + vtrn.16 d21, d23 + vtrn.16 d1, d3 + vtrn.16 d2, d15 + + vtrn.8 d20, d21 + vtrn.8 d22, d23 + vtrn.8 d1, d2 + vtrn.8 d3, d15 + + vst1.8 {d0}, [r8,:64], r1 + vst1.8 {d20}, [r0,:64], r1 + vst1.8 {d16}, [r8,:64], r1 + vst1.8 {d21}, [r0,:64], r1 + vst1.8 {d24}, [r8,:64], r1 + vst1.8 {d22}, [r0,:64], r1 + vst1.8 {d25}, [r8,:64], r1 + vst1.8 {d23}, [r0,:64], r1 + vst1.8 {d26}, [r8,:64], r1 + vst1.8 {d1}, [r0,:64], r1 + vst1.8 {d27}, [r8,:64], r1 + vst1.8 {d2}, [r0,:64], r1 + vst1.8 {d18}, [r8,:64], r1 + vst1.8 {d3}, [r0,:64], r1 + vst1.8 {d19}, [r8,:64], r1 + vst1.8 {d15}, [r0,:64], r1 + +v_end: + vpop {d8-d15} + pop {r4-r8, pc} + + .size vpx_lpf_vertical_16_neon, .-vpx_lpf_vertical_16_neon @ ENDP @ |vpx_lpf_vertical_16_neon| + +@ void vpx_wide_mbfilter_neon(); +@ This is a helper function for the loopfilters. The invidual functions do the +@ necessary load, transpose (if necessary) and store. +@ +@ r0-r3 PRESERVE +@ d16 blimit +@ d17 limit +@ d18 thresh +@ d0 p7 +@ d1 p6 +@ d2 p5 +@ d3 p4 +@ d4 p3 +@ d5 p2 +@ d6 p1 +@ d7 p0 +@ d8 q0 +@ d9 q1 +@ d10 q2 +@ d11 q3 +@ d12 q4 +@ d13 q5 +@ d14 q6 +@ d15 q7 +_vpx_wide_mbfilter_neon: + vpx_wide_mbfilter_neon: @ PROC + mov r7, #0 + + @ filter_mask + vabd.u8 d19, d4, d5 @ abs(p3 - p2) + vabd.u8 d20, d5, d6 @ abs(p2 - p1) + vabd.u8 d21, d6, d7 @ abs(p1 - p0) + vabd.u8 d22, d9, d8 @ abs(q1 - q0) + vabd.u8 d23, d10, d9 @ abs(q2 - q1) + vabd.u8 d24, d11, d10 @ abs(q3 - q2) + + @ only compare the largest value to limit + vmax.u8 d19, d19, d20 @ max(abs(p3 - p2), abs(p2 - p1)) + vmax.u8 d20, d21, d22 @ max(abs(p1 - p0), abs(q1 - q0)) + vmax.u8 d23, d23, d24 @ max(abs(q2 - q1), abs(q3 - q2)) + vmax.u8 d19, d19, d20 + + vabd.u8 d24, d7, d8 @ abs(p0 - q0) + + vmax.u8 d19, d19, d23 + + vabd.u8 d23, d6, d9 @ a = abs(p1 - q1) + vqadd.u8 d24, d24, d24 @ b = abs(p0 - q0) * 2 + + @ abs () > limit + vcge.u8 d19, d17, d19 + + @ flatmask4 + vabd.u8 d25, d7, d5 @ abs(p0 - p2) + vabd.u8 d26, d8, d10 @ abs(q0 - q2) + vabd.u8 d27, d4, d7 @ abs(p3 - p0) + vabd.u8 d28, d11, d8 @ abs(q3 - q0) + + @ only compare the largest value to thresh + vmax.u8 d25, d25, d26 @ max(abs(p0 - p2), abs(q0 - q2)) + vmax.u8 d26, d27, d28 @ max(abs(p3 - p0), abs(q3 - q0)) + vmax.u8 d25, d25, d26 + vmax.u8 d20, d20, d25 + + vshr.u8 d23, d23, #1 @ a = a / 2 + vqadd.u8 d24, d24, d23 @ a = b + a + + vmov.u8 d30, #1 + vcge.u8 d24, d16, d24 @ (a > blimit * 2 + limit) * -1 + + vcge.u8 d20, d30, d20 @ flat + + vand d19, d19, d24 @ mask + + @ hevmask + vcgt.u8 d21, d21, d18 @ (abs(p1 - p0) > thresh)*-1 + vcgt.u8 d22, d22, d18 @ (abs(q1 - q0) > thresh)*-1 + vorr d21, d21, d22 @ hev + + vand d16, d20, d19 @ flat && mask + vmov r5, r6, d16 + + @ flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7) + vabd.u8 d22, d3, d7 @ abs(p4 - p0) + vabd.u8 d23, d12, d8 @ abs(q4 - q0) + vabd.u8 d24, d7, d2 @ abs(p0 - p5) + vabd.u8 d25, d8, d13 @ abs(q0 - q5) + vabd.u8 d26, d1, d7 @ abs(p6 - p0) + vabd.u8 d27, d14, d8 @ abs(q6 - q0) + vabd.u8 d28, d0, d7 @ abs(p7 - p0) + vabd.u8 d29, d15, d8 @ abs(q7 - q0) + + @ only compare the largest value to thresh + vmax.u8 d22, d22, d23 @ max(abs(p4 - p0), abs(q4 - q0)) + vmax.u8 d23, d24, d25 @ max(abs(p0 - p5), abs(q0 - q5)) + vmax.u8 d24, d26, d27 @ max(abs(p6 - p0), abs(q6 - q0)) + vmax.u8 d25, d28, d29 @ max(abs(p7 - p0), abs(q7 - q0)) + + vmax.u8 d26, d22, d23 + vmax.u8 d27, d24, d25 + vmax.u8 d23, d26, d27 + + vcge.u8 d18, d30, d23 @ flat2 + + vmov.u8 d22, #0x80 + + orrs r5, r5, r6 @ Check for 0 + orreq r7, r7, #1 @ Only do filter branch + + vand d17, d18, d16 @ flat2 && flat && mask + vmov r5, r6, d17 + + @ mbfilter() function + + @ filter() function + @ convert to signed + veor d23, d8, d22 @ qs0 + veor d24, d7, d22 @ ps0 + veor d25, d6, d22 @ ps1 + veor d26, d9, d22 @ qs1 + + vmov.u8 d27, #3 + + vsub.s8 d28, d23, d24 @ ( qs0 - ps0) + vqsub.s8 d29, d25, d26 @ filter = clamp(ps1-qs1) + vmull.s8 q15, d28, d27 @ 3 * ( qs0 - ps0) + vand d29, d29, d21 @ filter &= hev + vaddw.s8 q15, q15, d29 @ filter + 3 * (qs0 - ps0) + vmov.u8 d29, #4 + + @ filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d28, q15 + + vand d28, d28, d19 @ filter &= mask + + vqadd.s8 d30, d28, d27 @ filter2 = clamp(filter+3) + vqadd.s8 d29, d28, d29 @ filter1 = clamp(filter+4) + vshr.s8 d30, d30, #3 @ filter2 >>= 3 + vshr.s8 d29, d29, #3 @ filter1 >>= 3 + + + vqadd.s8 d24, d24, d30 @ op0 = clamp(ps0 + filter2) + vqsub.s8 d23, d23, d29 @ oq0 = clamp(qs0 - filter1) + + @ outer tap adjustments: ++filter1 >> 1 + vrshr.s8 d29, d29, #1 + vbic d29, d29, d21 @ filter &= ~hev + + vqadd.s8 d25, d25, d29 @ op1 = clamp(ps1 + filter) + vqsub.s8 d26, d26, d29 @ oq1 = clamp(qs1 - filter) + + veor d24, d24, d22 @ *f_op0 = u^0x80 + veor d23, d23, d22 @ *f_oq0 = u^0x80 + veor d25, d25, d22 @ *f_op1 = u^0x80 + veor d26, d26, d22 @ *f_oq1 = u^0x80 + + tst r7, #1 + bxne lr + + orrs r5, r5, r6 @ Check for 0 + orreq r7, r7, #2 @ Only do mbfilter branch + + @ mbfilter flat && mask branch + @ TODO(fgalligan): Can I decrease the cycles shifting to consective d's + @ and using vibt on the q's? + vmov.u8 d29, #2 + vaddl.u8 q15, d7, d8 @ op2 = p0 + q0 + vmlal.u8 q15, d4, d27 @ op2 = p0 + q0 + p3 * 3 + vmlal.u8 q15, d5, d29 @ op2 = p0 + q0 + p3 * 3 + p2 * 2 + vaddl.u8 q10, d4, d5 + vaddw.u8 q15, d6 @ op2=p1 + p0 + q0 + p3 * 3 + p2 *2 + vaddl.u8 q14, d6, d9 + vqrshrn.u16 d18, q15, #3 @ r_op2 + + vsub.i16 q15, q10 + vaddl.u8 q10, d4, d6 + vadd.i16 q15, q14 + vaddl.u8 q14, d7, d10 + vqrshrn.u16 d19, q15, #3 @ r_op1 + + vsub.i16 q15, q10 + vadd.i16 q15, q14 + vaddl.u8 q14, d8, d11 + vqrshrn.u16 d20, q15, #3 @ r_op0 + + vsubw.u8 q15, d4 @ oq0 = op0 - p3 + vsubw.u8 q15, d7 @ oq0 -= p0 + vadd.i16 q15, q14 + vaddl.u8 q14, d9, d11 + vqrshrn.u16 d21, q15, #3 @ r_oq0 + + vsubw.u8 q15, d5 @ oq1 = oq0 - p2 + vsubw.u8 q15, d8 @ oq1 -= q0 + vadd.i16 q15, q14 + vaddl.u8 q14, d10, d11 + vqrshrn.u16 d22, q15, #3 @ r_oq1 + + vsubw.u8 q15, d6 @ oq2 = oq0 - p1 + vsubw.u8 q15, d9 @ oq2 -= q1 + vadd.i16 q15, q14 + vqrshrn.u16 d27, q15, #3 @ r_oq2 + + @ Filter does not set op2 or oq2, so use p2 and q2. + vbif d18, d5, d16 @ t_op2 |= p2 & ~(flat & mask) + vbif d19, d25, d16 @ t_op1 |= f_op1 & ~(flat & mask) + vbif d20, d24, d16 @ t_op0 |= f_op0 & ~(flat & mask) + vbif d21, d23, d16 @ t_oq0 |= f_oq0 & ~(flat & mask) + vbif d22, d26, d16 @ t_oq1 |= f_oq1 & ~(flat & mask) + + vbit d23, d27, d16 @ t_oq2 |= r_oq2 & (flat & mask) + vbif d23, d10, d16 @ t_oq2 |= q2 & ~(flat & mask) + + tst r7, #2 + bxne lr + + @ wide_mbfilter flat2 && flat && mask branch + vmov.u8 d16, #7 + vaddl.u8 q15, d7, d8 @ op6 = p0 + q0 + vaddl.u8 q12, d2, d3 + vaddl.u8 q13, d4, d5 + vaddl.u8 q14, d1, d6 + vmlal.u8 q15, d0, d16 @ op6 += p7 * 3 + vadd.i16 q12, q13 + vadd.i16 q15, q14 + vaddl.u8 q14, d2, d9 + vadd.i16 q15, q12 + vaddl.u8 q12, d0, d1 + vaddw.u8 q15, d1 + vaddl.u8 q13, d0, d2 + vadd.i16 q14, q15, q14 + vqrshrn.u16 d16, q15, #4 @ w_op6 + + vsub.i16 q15, q14, q12 + vaddl.u8 q14, d3, d10 + vqrshrn.u16 d24, q15, #4 @ w_op5 + + vsub.i16 q15, q13 + vaddl.u8 q13, d0, d3 + vadd.i16 q15, q14 + vaddl.u8 q14, d4, d11 + vqrshrn.u16 d25, q15, #4 @ w_op4 + + vadd.i16 q15, q14 + vaddl.u8 q14, d0, d4 + vsub.i16 q15, q13 + vsub.i16 q14, q15, q14 + vqrshrn.u16 d26, q15, #4 @ w_op3 + + vaddw.u8 q15, q14, d5 @ op2 += p2 + vaddl.u8 q14, d0, d5 + vaddw.u8 q15, d12 @ op2 += q4 + vbif d26, d4, d17 @ op3 |= p3 & ~(f2 & f & m) + vqrshrn.u16 d27, q15, #4 @ w_op2 + + vsub.i16 q15, q14 + vaddl.u8 q14, d0, d6 + vaddw.u8 q15, d6 @ op1 += p1 + vaddw.u8 q15, d13 @ op1 += q5 + vbif d27, d18, d17 @ op2 |= t_op2 & ~(f2 & f & m) + vqrshrn.u16 d18, q15, #4 @ w_op1 + + vsub.i16 q15, q14 + vaddl.u8 q14, d0, d7 + vaddw.u8 q15, d7 @ op0 += p0 + vaddw.u8 q15, d14 @ op0 += q6 + vbif d18, d19, d17 @ op1 |= t_op1 & ~(f2 & f & m) + vqrshrn.u16 d19, q15, #4 @ w_op0 + + vsub.i16 q15, q14 + vaddl.u8 q14, d1, d8 + vaddw.u8 q15, d8 @ oq0 += q0 + vaddw.u8 q15, d15 @ oq0 += q7 + vbif d19, d20, d17 @ op0 |= t_op0 & ~(f2 & f & m) + vqrshrn.u16 d20, q15, #4 @ w_oq0 + + vsub.i16 q15, q14 + vaddl.u8 q14, d2, d9 + vaddw.u8 q15, d9 @ oq1 += q1 + vaddl.u8 q4, d10, d15 + vaddw.u8 q15, d15 @ oq1 += q7 + vbif d20, d21, d17 @ oq0 |= t_oq0 & ~(f2 & f & m) + vqrshrn.u16 d21, q15, #4 @ w_oq1 + + vsub.i16 q15, q14 + vaddl.u8 q14, d3, d10 + vadd.i16 q15, q4 + vaddl.u8 q4, d11, d15 + vbif d21, d22, d17 @ oq1 |= t_oq1 & ~(f2 & f & m) + vqrshrn.u16 d22, q15, #4 @ w_oq2 + + vsub.i16 q15, q14 + vaddl.u8 q14, d4, d11 + vadd.i16 q15, q4 + vaddl.u8 q4, d12, d15 + vbif d22, d23, d17 @ oq2 |= t_oq2 & ~(f2 & f & m) + vqrshrn.u16 d23, q15, #4 @ w_oq3 + + vsub.i16 q15, q14 + vaddl.u8 q14, d5, d12 + vadd.i16 q15, q4 + vaddl.u8 q4, d13, d15 + vbif d16, d1, d17 @ op6 |= p6 & ~(f2 & f & m) + vqrshrn.u16 d1, q15, #4 @ w_oq4 + + vsub.i16 q15, q14 + vaddl.u8 q14, d6, d13 + vadd.i16 q15, q4 + vaddl.u8 q4, d14, d15 + vbif d24, d2, d17 @ op5 |= p5 & ~(f2 & f & m) + vqrshrn.u16 d2, q15, #4 @ w_oq5 + + vsub.i16 q15, q14 + vbif d25, d3, d17 @ op4 |= p4 & ~(f2 & f & m) + vadd.i16 q15, q4 + vbif d23, d11, d17 @ oq3 |= q3 & ~(f2 & f & m) + vqrshrn.u16 d3, q15, #4 @ w_oq6 + vbif d1, d12, d17 @ oq4 |= q4 & ~(f2 & f & m) + vbif d2, d13, d17 @ oq5 |= q5 & ~(f2 & f & m) + vbif d3, d14, d17 @ oq6 |= q6 & ~(f2 & f & m) + + bx lr + .size vpx_wide_mbfilter_neon, .-vpx_wide_mbfilter_neon @ ENDP @ |vpx_wide_mbfilter_neon| + + .section .note.GNU-stack,"",%progbits diff --git a/thirdparty/libvpx/vpx_dsp/arm/gas/save_reg_neon.s b/thirdparty/libvpx/vpx_dsp/arm/gas/save_reg_neon.s new file mode 100644 index 0000000000..e8852fa0d0 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/arm/gas/save_reg_neon.s @@ -0,0 +1,44 @@ +@ This file was created from a .asm file +@ using the ads2gas.pl script. + .equ DO1STROUNDING, 0 +@ +@ Copyright (c) 2010 The WebM project authors. All Rights Reserved. +@ +@ Use of this source code is governed by a BSD-style license +@ that can be found in the LICENSE file in the root of the source +@ tree. An additional intellectual property rights grant can be found +@ in the file PATENTS. All contributing project authors may +@ be found in the AUTHORS file in the root of the source tree. +@ + + + .global vpx_push_neon + .type vpx_push_neon, function + .global vpx_pop_neon + .type vpx_pop_neon, function + + .arm + .eabi_attribute 24, 1 @Tag_ABI_align_needed + .eabi_attribute 25, 1 @Tag_ABI_align_preserved + +.text +.p2align 2 + +_vpx_push_neon: + vpx_push_neon: @ PROC + vst1.i64 {d8, d9, d10, d11}, [r0]! + vst1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + .size vpx_push_neon, .-vpx_push_neon @ ENDP + +_vpx_pop_neon: + vpx_pop_neon: @ PROC + vld1.i64 {d8, d9, d10, d11}, [r0]! + vld1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + .size vpx_pop_neon, .-vpx_pop_neon @ ENDP + + + .section .note.GNU-stack,"",%progbits diff --git a/thirdparty/libvpx/vpx_dsp/arm/gas_apple/intrapred_neon_asm.s b/thirdparty/libvpx/vpx_dsp/arm/gas_apple/intrapred_neon_asm.s new file mode 100644 index 0000000000..1c527afcff --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/arm/gas_apple/intrapred_neon_asm.s @@ -0,0 +1,660 @@ +@ This file was created from a .asm file +@ using the ads2gas_apple.pl script. + + .set WIDE_REFERENCE, 0 + .set ARCHITECTURE, 5 + .set DO1STROUNDING, 0 + @ + @ Copyright (c) 2014 The WebM project authors. All Rights Reserved. + @ + @ Use of this source code is governed by a BSD-style license + @ that can be found in the LICENSE file in the root of the source + @ tree. An additional intellectual property rights grant can be found + @ in the file PATENTS. All contributing project authors may + @ be found in the AUTHORS file in the root of the source tree. + @ + + .globl _vpx_v_predictor_4x4_neon + .globl vpx_v_predictor_4x4_neon + .globl _vpx_v_predictor_8x8_neon + .globl vpx_v_predictor_8x8_neon + .globl _vpx_v_predictor_16x16_neon + .globl vpx_v_predictor_16x16_neon + .globl _vpx_v_predictor_32x32_neon + .globl vpx_v_predictor_32x32_neon + .globl _vpx_h_predictor_4x4_neon + .globl vpx_h_predictor_4x4_neon + .globl _vpx_h_predictor_8x8_neon + .globl vpx_h_predictor_8x8_neon + .globl _vpx_h_predictor_16x16_neon + .globl vpx_h_predictor_16x16_neon + .globl _vpx_h_predictor_32x32_neon + .globl vpx_h_predictor_32x32_neon + .globl _vpx_tm_predictor_4x4_neon + .globl vpx_tm_predictor_4x4_neon + .globl _vpx_tm_predictor_8x8_neon + .globl vpx_tm_predictor_8x8_neon + .globl _vpx_tm_predictor_16x16_neon + .globl vpx_tm_predictor_16x16_neon + .globl _vpx_tm_predictor_32x32_neon + .globl vpx_tm_predictor_32x32_neon + @ ARM + @ + @ PRESERVE8 + +.text +.p2align 2 + + @void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, + @ const uint8_t *above, + @ const uint8_t *left) + @ r0 uint8_t *dst + @ r1 ptrdiff_t y_stride + @ r2 const uint8_t *above + @ r3 const uint8_t *left + +_vpx_v_predictor_4x4_neon: + vpx_v_predictor_4x4_neon: @ + vld1.32 {d0[0]}, [r2] + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + bx lr + @ @ |vpx_v_predictor_4x4_neon| + + @void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, + @ const uint8_t *above, + @ const uint8_t *left) + @ r0 uint8_t *dst + @ r1 ptrdiff_t y_stride + @ r2 const uint8_t *above + @ r3 const uint8_t *left + +_vpx_v_predictor_8x8_neon: + vpx_v_predictor_8x8_neon: @ + vld1.8 {d0}, [r2] + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + bx lr + @ @ |vpx_v_predictor_8x8_neon| + + @void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, + @ const uint8_t *above, + @ const uint8_t *left) + @ r0 uint8_t *dst + @ r1 ptrdiff_t y_stride + @ r2 const uint8_t *above + @ r3 const uint8_t *left + +_vpx_v_predictor_16x16_neon: + vpx_v_predictor_16x16_neon: @ + vld1.8 {q0}, [r2] + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + bx lr + @ @ |vpx_v_predictor_16x16_neon| + + @void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, + @ const uint8_t *above, + @ const uint8_t *left) + @ r0 uint8_t *dst + @ r1 ptrdiff_t y_stride + @ r2 const uint8_t *above + @ r3 const uint8_t *left + +_vpx_v_predictor_32x32_neon: + vpx_v_predictor_32x32_neon: @ + vld1.8 {q0, q1}, [r2] + mov r2, #2 +loop_v: + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + subs r2, r2, #1 + bgt loop_v + bx lr + @ @ |vpx_v_predictor_32x32_neon| + + @void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, + @ const uint8_t *above, + @ const uint8_t *left) + @ r0 uint8_t *dst + @ r1 ptrdiff_t y_stride + @ r2 const uint8_t *above + @ r3 const uint8_t *left + +_vpx_h_predictor_4x4_neon: + vpx_h_predictor_4x4_neon: @ + vld1.32 {d1[0]}, [r3] + vdup.8 d0, d1[0] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[1] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[2] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[3] + vst1.32 {d0[0]}, [r0], r1 + bx lr + @ @ |vpx_h_predictor_4x4_neon| + + @void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, + @ const uint8_t *above, + @ const uint8_t *left) + @ r0 uint8_t *dst + @ r1 ptrdiff_t y_stride + @ r2 const uint8_t *above + @ r3 const uint8_t *left + +_vpx_h_predictor_8x8_neon: + vpx_h_predictor_8x8_neon: @ + vld1.64 {d1}, [r3] + vdup.8 d0, d1[0] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[1] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[2] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[3] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[4] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[5] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[6] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[7] + vst1.64 {d0}, [r0], r1 + bx lr + @ @ |vpx_h_predictor_8x8_neon| + + @void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, + @ const uint8_t *above, + @ const uint8_t *left) + @ r0 uint8_t *dst + @ r1 ptrdiff_t y_stride + @ r2 const uint8_t *above + @ r3 const uint8_t *left + +_vpx_h_predictor_16x16_neon: + vpx_h_predictor_16x16_neon: @ + vld1.8 {q1}, [r3] + vdup.8 q0, d2[0] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[1] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[2] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[3] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[4] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[5] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[6] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[7] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[0] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[1] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[2] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[3] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[4] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[5] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[6] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[7] + vst1.8 {q0}, [r0], r1 + bx lr + @ @ |vpx_h_predictor_16x16_neon| + + @void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, + @ const uint8_t *above, + @ const uint8_t *left) + @ r0 uint8_t *dst + @ r1 ptrdiff_t y_stride + @ r2 const uint8_t *above + @ r3 const uint8_t *left + +_vpx_h_predictor_32x32_neon: + vpx_h_predictor_32x32_neon: @ + sub r1, r1, #16 + mov r2, #2 +loop_h: + vld1.8 {q1}, [r3]! + vdup.8 q0, d2[0] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[1] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[2] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[3] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[4] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[5] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[6] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[7] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[0] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[1] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[2] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[3] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[4] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[5] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[6] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[7] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + subs r2, r2, #1 + bgt loop_h + bx lr + @ @ |vpx_h_predictor_32x32_neon| + + @void vpx_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride, + @ const uint8_t *above, + @ const uint8_t *left) + @ r0 uint8_t *dst + @ r1 ptrdiff_t y_stride + @ r2 const uint8_t *above + @ r3 const uint8_t *left + +_vpx_tm_predictor_4x4_neon: + vpx_tm_predictor_4x4_neon: @ + @ Load ytop_left = above[-1] @ + sub r12, r2, #1 + vld1.u8 {d0[]}, [r12] + + @ Load above 4 pixels + vld1.32 {d2[0]}, [r2] + + @ Compute above - ytop_left + vsubl.u8 q3, d2, d0 + + @ Load left row by row and compute left + (above - ytop_left) + @ 1st row and 2nd row + vld1.u8 {d2[]}, [r3]! + vld1.u8 {d4[]}, [r3]! + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + + @ 3rd row and 4th row + vld1.u8 {d2[]}, [r3]! + vld1.u8 {d4[]}, [r3] + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + bx lr + @ @ |vpx_tm_predictor_4x4_neon| + + @void vpx_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride, + @ const uint8_t *above, + @ const uint8_t *left) + @ r0 uint8_t *dst + @ r1 ptrdiff_t y_stride + @ r2 const uint8_t *above + @ r3 const uint8_t *left + +_vpx_tm_predictor_8x8_neon: + vpx_tm_predictor_8x8_neon: @ + @ Load ytop_left = above[-1] @ + sub r12, r2, #1 + vld1.8 {d0[]}, [r12] + + @ preload 8 left + vld1.8 {d30}, [r3] + + @ Load above 8 pixels + vld1.64 {d2}, [r2] + + vmovl.u8 q10, d30 + + @ Compute above - ytop_left + vsubl.u8 q3, d2, d0 + + @ Load left row by row and compute left + (above - ytop_left) + @ 1st row and 2nd row + vdup.16 q0, d20[0] + vdup.16 q1, d20[1] + vadd.s16 q0, q3, q0 + vadd.s16 q1, q3, q1 + + @ 3rd row and 4th row + vdup.16 q8, d20[2] + vdup.16 q9, d20[3] + vadd.s16 q8, q3, q8 + vadd.s16 q9, q3, q9 + + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 + + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + vst1.64 {d2}, [r0], r1 + vst1.64 {d3}, [r0], r1 + + @ 5th row and 6th row + vdup.16 q0, d21[0] + vdup.16 q1, d21[1] + vadd.s16 q0, q3, q0 + vadd.s16 q1, q3, q1 + + @ 7th row and 8th row + vdup.16 q8, d21[2] + vdup.16 q9, d21[3] + vadd.s16 q8, q3, q8 + vadd.s16 q9, q3, q9 + + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 + + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + vst1.64 {d2}, [r0], r1 + vst1.64 {d3}, [r0], r1 + + bx lr + @ @ |vpx_tm_predictor_8x8_neon| + + @void vpx_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride, + @ const uint8_t *above, + @ const uint8_t *left) + @ r0 uint8_t *dst + @ r1 ptrdiff_t y_stride + @ r2 const uint8_t *above + @ r3 const uint8_t *left + +_vpx_tm_predictor_16x16_neon: + vpx_tm_predictor_16x16_neon: @ + @ Load ytop_left = above[-1] @ + sub r12, r2, #1 + vld1.8 {d0[]}, [r12] + + @ Load above 8 pixels + vld1.8 {q1}, [r2] + + @ preload 8 left into r12 + vld1.8 {d18}, [r3]! + + @ Compute above - ytop_left + vsubl.u8 q2, d2, d0 + vsubl.u8 q3, d3, d0 + + vmovl.u8 q10, d18 + + @ Load left row by row and compute left + (above - ytop_left) + @ Process 8 rows in each single loop and loop 2 times to process 16 rows. + mov r2, #2 + +loop_16x16_neon: + @ Process two rows. + vdup.16 q0, d20[0] + vdup.16 q8, d20[1] + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d20[2] @ proload next 2 rows data + vdup.16 q8, d20[3] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + @ Process two rows. + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d21[0] @ proload next 2 rows data + vdup.16 q8, d21[1] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d21[2] @ proload next 2 rows data + vdup.16 q8, d21[3] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vld1.8 {d18}, [r3]! @ preload 8 left into r12 + vmovl.u8 q10, d18 + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + subs r2, r2, #1 + bgt loop_16x16_neon + + bx lr + @ @ |vpx_tm_predictor_16x16_neon| + + @void vpx_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride, + @ const uint8_t *above, + @ const uint8_t *left) + @ r0 uint8_t *dst + @ r1 ptrdiff_t y_stride + @ r2 const uint8_t *above + @ r3 const uint8_t *left + +_vpx_tm_predictor_32x32_neon: + vpx_tm_predictor_32x32_neon: @ + @ Load ytop_left = above[-1] @ + sub r12, r2, #1 + vld1.8 {d0[]}, [r12] + + @ Load above 32 pixels + vld1.8 {q1}, [r2]! + vld1.8 {q2}, [r2] + + @ preload 8 left pixels + vld1.8 {d26}, [r3]! + + @ Compute above - ytop_left + vsubl.u8 q8, d2, d0 + vsubl.u8 q9, d3, d0 + vsubl.u8 q10, d4, d0 + vsubl.u8 q11, d5, d0 + + vmovl.u8 q3, d26 + + @ Load left row by row and compute left + (above - ytop_left) + @ Process 8 rows in each single loop and loop 4 times to process 32 rows. + mov r2, #4 + +loop_32x32_neon: + @ Process two rows. + vdup.16 q0, d6[0] + vdup.16 q2, d6[1] + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q1, d6[2] + vdup.16 q2, d6[3] + vst1.64 {d24-d27}, [r0], r1 + + @ Process two rows. + vadd.s16 q12, q1, q8 + vadd.s16 q13, q1, q9 + vadd.s16 q14, q1, q10 + vadd.s16 q15, q1, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q0, d7[0] + vdup.16 q2, d7[1] + vst1.64 {d24-d27}, [r0], r1 + + @ Process two rows. + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q0, d7[2] + vdup.16 q2, d7[3] + vst1.64 {d24-d27}, [r0], r1 + + @ Process two rows. + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vld1.8 {d0}, [r3]! @ preload 8 left pixels + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vmovl.u8 q3, d0 + vst1.64 {d24-d27}, [r0], r1 + + subs r2, r2, #1 + bgt loop_32x32_neon + + bx lr + @ @ |vpx_tm_predictor_32x32_neon| + diff --git a/thirdparty/libvpx/vpx_dsp/arm/gas_apple/loopfilter_mb_neon.s b/thirdparty/libvpx/vpx_dsp/arm/gas_apple/loopfilter_mb_neon.s new file mode 100644 index 0000000000..69f7e5207e --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/arm/gas_apple/loopfilter_mb_neon.s @@ -0,0 +1,649 @@ +@ This file was created from a .asm file +@ using the ads2gas_apple.pl script. + + .set WIDE_REFERENCE, 0 + .set ARCHITECTURE, 5 + .set DO1STROUNDING, 0 + @ + @ Copyright (c) 2013 The WebM project authors. All Rights Reserved. + @ + @ Use of this source code is governed by a BSD-style license + @ that can be found in the LICENSE file in the root of the source + @ tree. An additional intellectual property rights grant can be found + @ in the file PATENTS. All contributing project authors may + @ be found in the AUTHORS file in the root of the source tree. + @ + + .globl _vpx_lpf_horizontal_edge_8_neon + .globl vpx_lpf_horizontal_edge_8_neon + .globl _vpx_lpf_horizontal_edge_16_neon + .globl vpx_lpf_horizontal_edge_16_neon + .globl _vpx_lpf_vertical_16_neon + .globl vpx_lpf_vertical_16_neon + @ ARM + +.text +.p2align 2 + + @ void mb_lpf_horizontal_edge(uint8_t *s, int p, + @ const uint8_t *blimit, + @ const uint8_t *limit, + @ const uint8_t *thresh, + @ int count) + @ r0 uint8_t *s, + @ r1 int p, /* pitch */ + @ r2 const uint8_t *blimit, + @ r3 const uint8_t *limit, + @ sp const uint8_t *thresh, + @ r12 int count +_mb_lpf_horizontal_edge: + mb_lpf_horizontal_edge: @ + push {r4-r8, lr} + vpush {d8-d15} + ldr r4, [sp, #88] @ load thresh + +h_count: + vld1.8 {d16[]}, [r2] @ load *blimit + vld1.8 {d17[]}, [r3] @ load *limit + vld1.8 {d18[]}, [r4] @ load *thresh + + sub r8, r0, r1, lsl #3 @ move src pointer down by 8 lines + + vld1.u8 {d0}, [r8,:64], r1 @ p7 + vld1.u8 {d1}, [r8,:64], r1 @ p6 + vld1.u8 {d2}, [r8,:64], r1 @ p5 + vld1.u8 {d3}, [r8,:64], r1 @ p4 + vld1.u8 {d4}, [r8,:64], r1 @ p3 + vld1.u8 {d5}, [r8,:64], r1 @ p2 + vld1.u8 {d6}, [r8,:64], r1 @ p1 + vld1.u8 {d7}, [r8,:64], r1 @ p0 + vld1.u8 {d8}, [r8,:64], r1 @ q0 + vld1.u8 {d9}, [r8,:64], r1 @ q1 + vld1.u8 {d10}, [r8,:64], r1 @ q2 + vld1.u8 {d11}, [r8,:64], r1 @ q3 + vld1.u8 {d12}, [r8,:64], r1 @ q4 + vld1.u8 {d13}, [r8,:64], r1 @ q5 + vld1.u8 {d14}, [r8,:64], r1 @ q6 + vld1.u8 {d15}, [r8,:64], r1 @ q7 + + bl vpx_wide_mbfilter_neon + + tst r7, #1 + beq h_mbfilter + + @ flat && mask were not set for any of the channels. Just store the values + @ from filter. + sub r8, r0, r1, lsl #1 + + vst1.u8 {d25}, [r8,:64], r1 @ store op1 + vst1.u8 {d24}, [r8,:64], r1 @ store op0 + vst1.u8 {d23}, [r8,:64], r1 @ store oq0 + vst1.u8 {d26}, [r8,:64], r1 @ store oq1 + + b h_next + +h_mbfilter: + tst r7, #2 + beq h_wide_mbfilter + + @ flat2 was not set for any of the channels. Just store the values from + @ mbfilter. + sub r8, r0, r1, lsl #1 + sub r8, r8, r1 + + vst1.u8 {d18}, [r8,:64], r1 @ store op2 + vst1.u8 {d19}, [r8,:64], r1 @ store op1 + vst1.u8 {d20}, [r8,:64], r1 @ store op0 + vst1.u8 {d21}, [r8,:64], r1 @ store oq0 + vst1.u8 {d22}, [r8,:64], r1 @ store oq1 + vst1.u8 {d23}, [r8,:64], r1 @ store oq2 + + b h_next + +h_wide_mbfilter: + sub r8, r0, r1, lsl #3 + add r8, r8, r1 + + vst1.u8 {d16}, [r8,:64], r1 @ store op6 + vst1.u8 {d24}, [r8,:64], r1 @ store op5 + vst1.u8 {d25}, [r8,:64], r1 @ store op4 + vst1.u8 {d26}, [r8,:64], r1 @ store op3 + vst1.u8 {d27}, [r8,:64], r1 @ store op2 + vst1.u8 {d18}, [r8,:64], r1 @ store op1 + vst1.u8 {d19}, [r8,:64], r1 @ store op0 + vst1.u8 {d20}, [r8,:64], r1 @ store oq0 + vst1.u8 {d21}, [r8,:64], r1 @ store oq1 + vst1.u8 {d22}, [r8,:64], r1 @ store oq2 + vst1.u8 {d23}, [r8,:64], r1 @ store oq3 + vst1.u8 {d1}, [r8,:64], r1 @ store oq4 + vst1.u8 {d2}, [r8,:64], r1 @ store oq5 + vst1.u8 {d3}, [r8,:64], r1 @ store oq6 + +h_next: + add r0, r0, #8 + subs r12, r12, #1 + bne h_count + + vpop {d8-d15} + pop {r4-r8, pc} + + @ @ |mb_lpf_horizontal_edge| + + @ void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch, + @ const uint8_t *blimit, + @ const uint8_t *limit, + @ const uint8_t *thresh) + @ r0 uint8_t *s, + @ r1 int pitch, + @ r2 const uint8_t *blimit, + @ r3 const uint8_t *limit, + @ sp const uint8_t *thresh +_vpx_lpf_horizontal_edge_8_neon: + vpx_lpf_horizontal_edge_8_neon: @ + mov r12, #1 + b mb_lpf_horizontal_edge + @ @ |vpx_lpf_horizontal_edge_8_neon| + + @ void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch, + @ const uint8_t *blimit, + @ const uint8_t *limit, + @ const uint8_t *thresh) + @ r0 uint8_t *s, + @ r1 int pitch, + @ r2 const uint8_t *blimit, + @ r3 const uint8_t *limit, + @ sp const uint8_t *thresh +_vpx_lpf_horizontal_edge_16_neon: + vpx_lpf_horizontal_edge_16_neon: @ + mov r12, #2 + b mb_lpf_horizontal_edge + @ @ |vpx_lpf_horizontal_edge_16_neon| + + @ void vpx_lpf_vertical_16_neon(uint8_t *s, int p, + @ const uint8_t *blimit, + @ const uint8_t *limit, + @ const uint8_t *thresh) + @ r0 uint8_t *s, + @ r1 int p, /* pitch */ + @ r2 const uint8_t *blimit, + @ r3 const uint8_t *limit, + @ sp const uint8_t *thresh, +_vpx_lpf_vertical_16_neon: + vpx_lpf_vertical_16_neon: @ + push {r4-r8, lr} + vpush {d8-d15} + ldr r4, [sp, #88] @ load thresh + + vld1.8 {d16[]}, [r2] @ load *blimit + vld1.8 {d17[]}, [r3] @ load *limit + vld1.8 {d18[]}, [r4] @ load *thresh + + sub r8, r0, #8 + + vld1.8 {d0}, [r8,:64], r1 + vld1.8 {d8}, [r0,:64], r1 + vld1.8 {d1}, [r8,:64], r1 + vld1.8 {d9}, [r0,:64], r1 + vld1.8 {d2}, [r8,:64], r1 + vld1.8 {d10}, [r0,:64], r1 + vld1.8 {d3}, [r8,:64], r1 + vld1.8 {d11}, [r0,:64], r1 + vld1.8 {d4}, [r8,:64], r1 + vld1.8 {d12}, [r0,:64], r1 + vld1.8 {d5}, [r8,:64], r1 + vld1.8 {d13}, [r0,:64], r1 + vld1.8 {d6}, [r8,:64], r1 + vld1.8 {d14}, [r0,:64], r1 + vld1.8 {d7}, [r8,:64], r1 + vld1.8 {d15}, [r0,:64], r1 + + sub r0, r0, r1, lsl #3 + + vtrn.32 q0, q2 + vtrn.32 q1, q3 + vtrn.32 q4, q6 + vtrn.32 q5, q7 + + vtrn.16 q0, q1 + vtrn.16 q2, q3 + vtrn.16 q4, q5 + vtrn.16 q6, q7 + + vtrn.8 d0, d1 + vtrn.8 d2, d3 + vtrn.8 d4, d5 + vtrn.8 d6, d7 + + vtrn.8 d8, d9 + vtrn.8 d10, d11 + vtrn.8 d12, d13 + vtrn.8 d14, d15 + + bl vpx_wide_mbfilter_neon + + tst r7, #1 + beq v_mbfilter + + @ flat && mask were not set for any of the channels. Just store the values + @ from filter. + sub r8, r0, #2 + + vswp d23, d25 + + vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r8], r1 + vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r8], r1 + vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r8], r1 + vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r8], r1 + vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r8], r1 + vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r8], r1 + vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r8], r1 + vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r8], r1 + + b v_end + +v_mbfilter: + tst r7, #2 + beq v_wide_mbfilter + + @ flat2 was not set for any of the channels. Just store the values from + @ mbfilter. + sub r8, r0, #3 + + vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1 + vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1 + vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1 + vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1 + vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1 + vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1 + vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1 + vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1 + vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1 + vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1 + vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1 + vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1 + vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1 + vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1 + vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1 + vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1 + + b v_end + +v_wide_mbfilter: + sub r8, r0, #8 + + vtrn.32 d0, d26 + vtrn.32 d16, d27 + vtrn.32 d24, d18 + vtrn.32 d25, d19 + + vtrn.16 d0, d24 + vtrn.16 d16, d25 + vtrn.16 d26, d18 + vtrn.16 d27, d19 + + vtrn.8 d0, d16 + vtrn.8 d24, d25 + vtrn.8 d26, d27 + vtrn.8 d18, d19 + + vtrn.32 d20, d1 + vtrn.32 d21, d2 + vtrn.32 d22, d3 + vtrn.32 d23, d15 + + vtrn.16 d20, d22 + vtrn.16 d21, d23 + vtrn.16 d1, d3 + vtrn.16 d2, d15 + + vtrn.8 d20, d21 + vtrn.8 d22, d23 + vtrn.8 d1, d2 + vtrn.8 d3, d15 + + vst1.8 {d0}, [r8,:64], r1 + vst1.8 {d20}, [r0,:64], r1 + vst1.8 {d16}, [r8,:64], r1 + vst1.8 {d21}, [r0,:64], r1 + vst1.8 {d24}, [r8,:64], r1 + vst1.8 {d22}, [r0,:64], r1 + vst1.8 {d25}, [r8,:64], r1 + vst1.8 {d23}, [r0,:64], r1 + vst1.8 {d26}, [r8,:64], r1 + vst1.8 {d1}, [r0,:64], r1 + vst1.8 {d27}, [r8,:64], r1 + vst1.8 {d2}, [r0,:64], r1 + vst1.8 {d18}, [r8,:64], r1 + vst1.8 {d3}, [r0,:64], r1 + vst1.8 {d19}, [r8,:64], r1 + vst1.8 {d15}, [r0,:64], r1 + +v_end: + vpop {d8-d15} + pop {r4-r8, pc} + + @ @ |vpx_lpf_vertical_16_neon| + + @ void vpx_wide_mbfilter_neon() @ + @ This is a helper function for the loopfilters. The invidual functions do the + @ necessary load, transpose (if necessary) and store. + @ + @ r0-r3 PRESERVE + @ d16 blimit + @ d17 limit + @ d18 thresh + @ d0 p7 + @ d1 p6 + @ d2 p5 + @ d3 p4 + @ d4 p3 + @ d5 p2 + @ d6 p1 + @ d7 p0 + @ d8 q0 + @ d9 q1 + @ d10 q2 + @ d11 q3 + @ d12 q4 + @ d13 q5 + @ d14 q6 + @ d15 q7 +_vpx_wide_mbfilter_neon: + vpx_wide_mbfilter_neon: @ + mov r7, #0 + + @ filter_mask + vabd.u8 d19, d4, d5 @ abs(p3 - p2) + vabd.u8 d20, d5, d6 @ abs(p2 - p1) + vabd.u8 d21, d6, d7 @ abs(p1 - p0) + vabd.u8 d22, d9, d8 @ abs(q1 - q0) + vabd.u8 d23, d10, d9 @ abs(q2 - q1) + vabd.u8 d24, d11, d10 @ abs(q3 - q2) + + @ only compare the largest value to limit + vmax.u8 d19, d19, d20 @ max(abs(p3 - p2), abs(p2 - p1)) + vmax.u8 d20, d21, d22 @ max(abs(p1 - p0), abs(q1 - q0)) + vmax.u8 d23, d23, d24 @ max(abs(q2 - q1), abs(q3 - q2)) + vmax.u8 d19, d19, d20 + + vabd.u8 d24, d7, d8 @ abs(p0 - q0) + + vmax.u8 d19, d19, d23 + + vabd.u8 d23, d6, d9 @ a = abs(p1 - q1) + vqadd.u8 d24, d24, d24 @ b = abs(p0 - q0) * 2 + + @ abs () > limit + vcge.u8 d19, d17, d19 + + @ flatmask4 + vabd.u8 d25, d7, d5 @ abs(p0 - p2) + vabd.u8 d26, d8, d10 @ abs(q0 - q2) + vabd.u8 d27, d4, d7 @ abs(p3 - p0) + vabd.u8 d28, d11, d8 @ abs(q3 - q0) + + @ only compare the largest value to thresh + vmax.u8 d25, d25, d26 @ max(abs(p0 - p2), abs(q0 - q2)) + vmax.u8 d26, d27, d28 @ max(abs(p3 - p0), abs(q3 - q0)) + vmax.u8 d25, d25, d26 + vmax.u8 d20, d20, d25 + + vshr.u8 d23, d23, #1 @ a = a / 2 + vqadd.u8 d24, d24, d23 @ a = b + a + + vmov.u8 d30, #1 + vcge.u8 d24, d16, d24 @ (a > blimit * 2 + limit) * -1 + + vcge.u8 d20, d30, d20 @ flat + + vand d19, d19, d24 @ mask + + @ hevmask + vcgt.u8 d21, d21, d18 @ (abs(p1 - p0) > thresh)*-1 + vcgt.u8 d22, d22, d18 @ (abs(q1 - q0) > thresh)*-1 + vorr d21, d21, d22 @ hev + + vand d16, d20, d19 @ flat && mask + vmov r5, r6, d16 + + @ flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7) + vabd.u8 d22, d3, d7 @ abs(p4 - p0) + vabd.u8 d23, d12, d8 @ abs(q4 - q0) + vabd.u8 d24, d7, d2 @ abs(p0 - p5) + vabd.u8 d25, d8, d13 @ abs(q0 - q5) + vabd.u8 d26, d1, d7 @ abs(p6 - p0) + vabd.u8 d27, d14, d8 @ abs(q6 - q0) + vabd.u8 d28, d0, d7 @ abs(p7 - p0) + vabd.u8 d29, d15, d8 @ abs(q7 - q0) + + @ only compare the largest value to thresh + vmax.u8 d22, d22, d23 @ max(abs(p4 - p0), abs(q4 - q0)) + vmax.u8 d23, d24, d25 @ max(abs(p0 - p5), abs(q0 - q5)) + vmax.u8 d24, d26, d27 @ max(abs(p6 - p0), abs(q6 - q0)) + vmax.u8 d25, d28, d29 @ max(abs(p7 - p0), abs(q7 - q0)) + + vmax.u8 d26, d22, d23 + vmax.u8 d27, d24, d25 + vmax.u8 d23, d26, d27 + + vcge.u8 d18, d30, d23 @ flat2 + + vmov.u8 d22, #0x80 + + orrs r5, r5, r6 @ Check for 0 + orreq r7, r7, #1 @ Only do filter branch + + vand d17, d18, d16 @ flat2 && flat && mask + vmov r5, r6, d17 + + @ mbfilter() function + + @ filter() function + @ convert to signed + veor d23, d8, d22 @ qs0 + veor d24, d7, d22 @ ps0 + veor d25, d6, d22 @ ps1 + veor d26, d9, d22 @ qs1 + + vmov.u8 d27, #3 + + vsub.s8 d28, d23, d24 @ ( qs0 - ps0) + vqsub.s8 d29, d25, d26 @ filter = clamp(ps1-qs1) + vmull.s8 q15, d28, d27 @ 3 * ( qs0 - ps0) + vand d29, d29, d21 @ filter &= hev + vaddw.s8 q15, q15, d29 @ filter + 3 * (qs0 - ps0) + vmov.u8 d29, #4 + + @ filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d28, q15 + + vand d28, d28, d19 @ filter &= mask + + vqadd.s8 d30, d28, d27 @ filter2 = clamp(filter+3) + vqadd.s8 d29, d28, d29 @ filter1 = clamp(filter+4) + vshr.s8 d30, d30, #3 @ filter2 >>= 3 + vshr.s8 d29, d29, #3 @ filter1 >>= 3 + + + vqadd.s8 d24, d24, d30 @ op0 = clamp(ps0 + filter2) + vqsub.s8 d23, d23, d29 @ oq0 = clamp(qs0 - filter1) + + @ outer tap adjustments: ++filter1 >> 1 + vrshr.s8 d29, d29, #1 + vbic d29, d29, d21 @ filter &= ~hev + + vqadd.s8 d25, d25, d29 @ op1 = clamp(ps1 + filter) + vqsub.s8 d26, d26, d29 @ oq1 = clamp(qs1 - filter) + + veor d24, d24, d22 @ *f_op0 = u^0x80 + veor d23, d23, d22 @ *f_oq0 = u^0x80 + veor d25, d25, d22 @ *f_op1 = u^0x80 + veor d26, d26, d22 @ *f_oq1 = u^0x80 + + tst r7, #1 + bxne lr + + orrs r5, r5, r6 @ Check for 0 + orreq r7, r7, #2 @ Only do mbfilter branch + + @ mbfilter flat && mask branch + @ TODO(fgalligan): Can I decrease the cycles shifting to consective d's + @ and using vibt on the q's? + vmov.u8 d29, #2 + vaddl.u8 q15, d7, d8 @ op2 = p0 + q0 + vmlal.u8 q15, d4, d27 @ op2 = p0 + q0 + p3 * 3 + vmlal.u8 q15, d5, d29 @ op2 = p0 + q0 + p3 * 3 + p2 * 2 + vaddl.u8 q10, d4, d5 + vaddw.u8 q15, d6 @ op2=p1 + p0 + q0 + p3 * 3 + p2 *2 + vaddl.u8 q14, d6, d9 + vqrshrn.u16 d18, q15, #3 @ r_op2 + + vsub.i16 q15, q10 + vaddl.u8 q10, d4, d6 + vadd.i16 q15, q14 + vaddl.u8 q14, d7, d10 + vqrshrn.u16 d19, q15, #3 @ r_op1 + + vsub.i16 q15, q10 + vadd.i16 q15, q14 + vaddl.u8 q14, d8, d11 + vqrshrn.u16 d20, q15, #3 @ r_op0 + + vsubw.u8 q15, d4 @ oq0 = op0 - p3 + vsubw.u8 q15, d7 @ oq0 -= p0 + vadd.i16 q15, q14 + vaddl.u8 q14, d9, d11 + vqrshrn.u16 d21, q15, #3 @ r_oq0 + + vsubw.u8 q15, d5 @ oq1 = oq0 - p2 + vsubw.u8 q15, d8 @ oq1 -= q0 + vadd.i16 q15, q14 + vaddl.u8 q14, d10, d11 + vqrshrn.u16 d22, q15, #3 @ r_oq1 + + vsubw.u8 q15, d6 @ oq2 = oq0 - p1 + vsubw.u8 q15, d9 @ oq2 -= q1 + vadd.i16 q15, q14 + vqrshrn.u16 d27, q15, #3 @ r_oq2 + + @ Filter does not set op2 or oq2, so use p2 and q2. + vbif d18, d5, d16 @ t_op2 |= p2 & ~(flat & mask) + vbif d19, d25, d16 @ t_op1 |= f_op1 & ~(flat & mask) + vbif d20, d24, d16 @ t_op0 |= f_op0 & ~(flat & mask) + vbif d21, d23, d16 @ t_oq0 |= f_oq0 & ~(flat & mask) + vbif d22, d26, d16 @ t_oq1 |= f_oq1 & ~(flat & mask) + + vbit d23, d27, d16 @ t_oq2 |= r_oq2 & (flat & mask) + vbif d23, d10, d16 @ t_oq2 |= q2 & ~(flat & mask) + + tst r7, #2 + bxne lr + + @ wide_mbfilter flat2 && flat && mask branch + vmov.u8 d16, #7 + vaddl.u8 q15, d7, d8 @ op6 = p0 + q0 + vaddl.u8 q12, d2, d3 + vaddl.u8 q13, d4, d5 + vaddl.u8 q14, d1, d6 + vmlal.u8 q15, d0, d16 @ op6 += p7 * 3 + vadd.i16 q12, q13 + vadd.i16 q15, q14 + vaddl.u8 q14, d2, d9 + vadd.i16 q15, q12 + vaddl.u8 q12, d0, d1 + vaddw.u8 q15, d1 + vaddl.u8 q13, d0, d2 + vadd.i16 q14, q15, q14 + vqrshrn.u16 d16, q15, #4 @ w_op6 + + vsub.i16 q15, q14, q12 + vaddl.u8 q14, d3, d10 + vqrshrn.u16 d24, q15, #4 @ w_op5 + + vsub.i16 q15, q13 + vaddl.u8 q13, d0, d3 + vadd.i16 q15, q14 + vaddl.u8 q14, d4, d11 + vqrshrn.u16 d25, q15, #4 @ w_op4 + + vadd.i16 q15, q14 + vaddl.u8 q14, d0, d4 + vsub.i16 q15, q13 + vsub.i16 q14, q15, q14 + vqrshrn.u16 d26, q15, #4 @ w_op3 + + vaddw.u8 q15, q14, d5 @ op2 += p2 + vaddl.u8 q14, d0, d5 + vaddw.u8 q15, d12 @ op2 += q4 + vbif d26, d4, d17 @ op3 |= p3 & ~(f2 & f & m) + vqrshrn.u16 d27, q15, #4 @ w_op2 + + vsub.i16 q15, q14 + vaddl.u8 q14, d0, d6 + vaddw.u8 q15, d6 @ op1 += p1 + vaddw.u8 q15, d13 @ op1 += q5 + vbif d27, d18, d17 @ op2 |= t_op2 & ~(f2 & f & m) + vqrshrn.u16 d18, q15, #4 @ w_op1 + + vsub.i16 q15, q14 + vaddl.u8 q14, d0, d7 + vaddw.u8 q15, d7 @ op0 += p0 + vaddw.u8 q15, d14 @ op0 += q6 + vbif d18, d19, d17 @ op1 |= t_op1 & ~(f2 & f & m) + vqrshrn.u16 d19, q15, #4 @ w_op0 + + vsub.i16 q15, q14 + vaddl.u8 q14, d1, d8 + vaddw.u8 q15, d8 @ oq0 += q0 + vaddw.u8 q15, d15 @ oq0 += q7 + vbif d19, d20, d17 @ op0 |= t_op0 & ~(f2 & f & m) + vqrshrn.u16 d20, q15, #4 @ w_oq0 + + vsub.i16 q15, q14 + vaddl.u8 q14, d2, d9 + vaddw.u8 q15, d9 @ oq1 += q1 + vaddl.u8 q4, d10, d15 + vaddw.u8 q15, d15 @ oq1 += q7 + vbif d20, d21, d17 @ oq0 |= t_oq0 & ~(f2 & f & m) + vqrshrn.u16 d21, q15, #4 @ w_oq1 + + vsub.i16 q15, q14 + vaddl.u8 q14, d3, d10 + vadd.i16 q15, q4 + vaddl.u8 q4, d11, d15 + vbif d21, d22, d17 @ oq1 |= t_oq1 & ~(f2 & f & m) + vqrshrn.u16 d22, q15, #4 @ w_oq2 + + vsub.i16 q15, q14 + vaddl.u8 q14, d4, d11 + vadd.i16 q15, q4 + vaddl.u8 q4, d12, d15 + vbif d22, d23, d17 @ oq2 |= t_oq2 & ~(f2 & f & m) + vqrshrn.u16 d23, q15, #4 @ w_oq3 + + vsub.i16 q15, q14 + vaddl.u8 q14, d5, d12 + vadd.i16 q15, q4 + vaddl.u8 q4, d13, d15 + vbif d16, d1, d17 @ op6 |= p6 & ~(f2 & f & m) + vqrshrn.u16 d1, q15, #4 @ w_oq4 + + vsub.i16 q15, q14 + vaddl.u8 q14, d6, d13 + vadd.i16 q15, q4 + vaddl.u8 q4, d14, d15 + vbif d24, d2, d17 @ op5 |= p5 & ~(f2 & f & m) + vqrshrn.u16 d2, q15, #4 @ w_oq5 + + vsub.i16 q15, q14 + vbif d25, d3, d17 @ op4 |= p4 & ~(f2 & f & m) + vadd.i16 q15, q4 + vbif d23, d11, d17 @ oq3 |= q3 & ~(f2 & f & m) + vqrshrn.u16 d3, q15, #4 @ w_oq6 + vbif d1, d12, d17 @ oq4 |= q4 & ~(f2 & f & m) + vbif d2, d13, d17 @ oq5 |= q5 & ~(f2 & f & m) + vbif d3, d14, d17 @ oq6 |= q6 & ~(f2 & f & m) + + bx lr + @ @ |vpx_wide_mbfilter_neon| + diff --git a/thirdparty/libvpx/vpx_dsp/arm/gas_apple/save_reg_neon.s b/thirdparty/libvpx/vpx_dsp/arm/gas_apple/save_reg_neon.s new file mode 100644 index 0000000000..f322b698b4 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/arm/gas_apple/save_reg_neon.s @@ -0,0 +1,46 @@ +@ This file was created from a .asm file +@ using the ads2gas_apple.pl script. + + .set WIDE_REFERENCE, 0 + .set ARCHITECTURE, 5 + .set DO1STROUNDING, 0 + @ + @ Copyright (c) 2010 The WebM project authors. All Rights Reserved. + @ + @ Use of this source code is governed by a BSD-style license + @ that can be found in the LICENSE file in the root of the source + @ tree. An additional intellectual property rights grant can be found + @ in the file PATENTS. All contributing project authors may + @ be found in the AUTHORS file in the root of the source tree. + @ + + + .globl _vpx_push_neon + .globl vpx_push_neon + .globl _vpx_pop_neon + .globl vpx_pop_neon + + @ ARM + @ + @ PRESERVE8 + +.text +.p2align 2 + +_vpx_push_neon: + vpx_push_neon: @ + vst1.i64 {d8, d9, d10, d11}, [r0]! + vst1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + @ + +_vpx_pop_neon: + vpx_pop_neon: @ + vld1.i64 {d8, d9, d10, d11}, [r0]! + vld1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + @ + + diff --git a/thirdparty/libvpx/vpx_dsp_rtcd.h b/thirdparty/libvpx/vpx_dsp_rtcd.h new file mode 100644 index 0000000000..4d5ad89533 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp_rtcd.h @@ -0,0 +1,9 @@ +#include "vpx_config.h" + +#if defined(WEBM_X86ASM) && (ARCH_X86 || ARCH_X86_64) + #include "rtcd/vpx_dsp_rtcd_x86.h" +#elif defined(WEBM_ARMASM) && ARCH_ARM + #include "rtcd/vpx_dsp_rtcd_arm.h" +#else + #include "rtcd/vpx_dsp_rtcd_c.h" +#endif diff --git a/thirdparty/libvpx/vpx_ports/arm_cpudetect.c b/thirdparty/libvpx/vpx_ports/arm_cpudetect.c index 8a4b8af964..7eb74a7dc9 100644 --- a/thirdparty/libvpx/vpx_ports/arm_cpudetect.c +++ b/thirdparty/libvpx/vpx_ports/arm_cpudetect.c @@ -38,26 +38,7 @@ static int arm_cpu_env_mask(void) { } #if !CONFIG_RUNTIME_CPU_DETECT - -int arm_cpu_caps(void) { - /* This function should actually be a no-op. There is no way to adjust any of - * these because the RTCD tables do not exist: the functions are called - * statically */ - int flags; - int mask; - if (!arm_cpu_env_flags(&flags)) { - return flags; - } - mask = arm_cpu_env_mask(); -#if HAVE_MEDIA - flags |= HAS_MEDIA; -#endif /* HAVE_MEDIA */ -#if HAVE_NEON || HAVE_NEON_ASM - flags |= HAS_NEON; -#endif /* HAVE_NEON || HAVE_NEON_ASM */ - return flags & mask; -} - + #error "CONFIG_RUNTIME_CPU_DETECT should be enabled!" #elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT */ /*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/ #define WIN32_LEAN_AND_MEAN @@ -76,28 +57,28 @@ int arm_cpu_caps(void) { * All of these instructions should be essentially nops. */ #if HAVE_MEDIA - if (mask & HAS_MEDIA) + if (mask & HAS_MEDIA) { __try { /*SHADD8 r3,r3,r3*/ __emit(0xE6333F93); flags |= HAS_MEDIA; } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) { /*Ignore exception.*/ + } } -} #endif /* HAVE_MEDIA */ #if HAVE_NEON || HAVE_NEON_ASM -if (mask &HAS_NEON) { - __try { - /*VORR q0,q0,q0*/ - __emit(0xF2200150); - flags |= HAS_NEON; - } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) { - /*Ignore exception.*/ + if (mask &HAS_NEON) { + __try { + /*VORR q0,q0,q0*/ + __emit(0xF2200150); + flags |= HAS_NEON; + } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) { + /*Ignore exception.*/ + } } -} #endif /* HAVE_NEON || HAVE_NEON_ASM */ -return flags & mask; + return flags & mask; } #elif defined(__ANDROID__) /* end _MSC_VER */ @@ -170,6 +151,20 @@ int arm_cpu_caps(void) { return flags & mask; } #else /* end __linux__ */ -#error "--enable-runtime-cpu-detect selected, but no CPU detection method " \ -"available for your platform. Reconfigure with --disable-runtime-cpu-detect." +int arm_cpu_caps(void) { + int flags; + int mask; + if (!arm_cpu_env_flags(&flags)) { + return flags; + } + mask = arm_cpu_env_mask(); +#if HAVE_MEDIA + flags |= HAS_MEDIA; +#endif /* HAVE_MEDIA */ +#if HAVE_NEON || HAVE_NEON_ASM + flags |= HAS_NEON; +#endif /* HAVE_NEON || HAVE_NEON_ASM */ + return flags & mask; +} +#warning "ARM run-time CPU detection is disabled for this platform..." #endif diff --git a/thirdparty/libvpx/vpx_ports/system_state.h b/thirdparty/libvpx/vpx_ports/system_state.h index 086c64681f..01989dcafc 100644 --- a/thirdparty/libvpx/vpx_ports/system_state.h +++ b/thirdparty/libvpx/vpx_ports/system_state.h @@ -13,10 +13,10 @@ #include "./vpx_config.h" -#if ARCH_X86 || ARCH_X86_64 -void vpx_reset_mmx_state(void); -#define vpx_clear_system_state() vpx_reset_mmx_state() +#if defined(WEBM_X86ASM) && (ARCH_X86 || ARCH_X86_64) + void vpx_reset_mmx_state(void); + #define vpx_clear_system_state() vpx_reset_mmx_state() #else -#define vpx_clear_system_state() + #define vpx_clear_system_state() #endif // ARCH_X86 || ARCH_X86_64 #endif // VPX_PORTS_SYSTEM_STATE_H_ diff --git a/thirdparty/libvpx/vpx_scale_rtcd.h b/thirdparty/libvpx/vpx_scale_rtcd.h new file mode 100644 index 0000000000..4d59467fe9 --- /dev/null +++ b/thirdparty/libvpx/vpx_scale_rtcd.h @@ -0,0 +1,44 @@ +#ifndef VPX_SCALE_RTCD_H_ +#define VPX_SCALE_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c + +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c + +void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_borders vpx_extend_frame_borders_c + +void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c + +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + +void vpx_scale_rtcd(void); + +#ifdef RTCD_C +static void setup_rtcd_internal(void) +{ + //Only MIPS has something here, but it is not supported +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif |