From cfcc8a20e862b758c32bd3f152186e6df0591a24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Thu, 13 Oct 2016 19:40:40 +0200 Subject: theora: Move to a module and split thirdparty lib Same rationale as the previous commits. --- thirdparty/libtheora/x86_vc/mmxencfrag.c | 969 +++++++++++++++++++++++++++++++ thirdparty/libtheora/x86_vc/mmxfdct.c | 670 +++++++++++++++++++++ thirdparty/libtheora/x86_vc/mmxfrag.c | 337 +++++++++++ thirdparty/libtheora/x86_vc/mmxfrag.h | 61 ++ thirdparty/libtheora/x86_vc/mmxidct.c | 562 ++++++++++++++++++ thirdparty/libtheora/x86_vc/mmxloop.h | 219 +++++++ thirdparty/libtheora/x86_vc/mmxstate.c | 211 +++++++ thirdparty/libtheora/x86_vc/x86enc.c | 49 ++ thirdparty/libtheora/x86_vc/x86enc.h | 47 ++ thirdparty/libtheora/x86_vc/x86int.h | 42 ++ thirdparty/libtheora/x86_vc/x86state.c | 62 ++ 11 files changed, 3229 insertions(+) create mode 100644 thirdparty/libtheora/x86_vc/mmxencfrag.c create mode 100644 thirdparty/libtheora/x86_vc/mmxfdct.c create mode 100644 thirdparty/libtheora/x86_vc/mmxfrag.c create mode 100644 thirdparty/libtheora/x86_vc/mmxfrag.h create mode 100644 thirdparty/libtheora/x86_vc/mmxidct.c create mode 100644 thirdparty/libtheora/x86_vc/mmxloop.h create mode 100644 thirdparty/libtheora/x86_vc/mmxstate.c create mode 100644 thirdparty/libtheora/x86_vc/x86enc.c create mode 100644 thirdparty/libtheora/x86_vc/x86enc.h create mode 100644 thirdparty/libtheora/x86_vc/x86int.h create mode 100644 thirdparty/libtheora/x86_vc/x86state.c (limited to 'thirdparty/libtheora/x86_vc') diff --git a/thirdparty/libtheora/x86_vc/mmxencfrag.c b/thirdparty/libtheora/x86_vc/mmxencfrag.c new file mode 100644 index 0000000000..ac9dacf377 --- /dev/null +++ b/thirdparty/libtheora/x86_vc/mmxencfrag.c @@ -0,0 +1,969 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $ + + ********************************************************************/ +#include +#include "x86enc.h" + +#if defined(OC_X86_ASM) + +unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src, + const unsigned char *_ref,int _ystride){ + ptrdiff_t ret; + __asm{ +#define SRC esi +#define REF edx +#define YSTRIDE ecx +#define YSTRIDE3 edi + mov YSTRIDE,_ystride + mov SRC,_src + mov REF,_ref + /*Load the first 4 rows of each block.*/ + movq mm0,[SRC] + movq mm1,[REF] + movq mm2,[SRC][YSTRIDE] + movq mm3,[REF][YSTRIDE] + lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] + movq mm4,[SRC+YSTRIDE*2] + movq mm5,[REF+YSTRIDE*2] + movq mm6,[SRC+YSTRIDE3] + movq mm7,[REF+YSTRIDE3] + /*Compute their SADs and add them in mm0*/ + psadbw mm0,mm1 + psadbw mm2,mm3 + lea SRC,[SRC+YSTRIDE*4] + paddw mm0,mm2 + lea REF,[REF+YSTRIDE*4] + /*Load the next 3 rows as registers become available.*/ + movq mm2,[SRC] + movq mm3,[REF] + psadbw mm4,mm5 + psadbw mm6,mm7 + paddw mm0,mm4 + movq mm5,[REF+YSTRIDE] + movq mm4,[SRC+YSTRIDE] + paddw mm0,mm6 + movq mm7,[REF+YSTRIDE*2] + movq mm6,[SRC+YSTRIDE*2] + /*Start adding their SADs to mm0*/ + psadbw mm2,mm3 + psadbw mm4,mm5 + paddw mm0,mm2 + psadbw mm6,mm7 + /*Load last row as registers become available.*/ + movq mm2,[SRC+YSTRIDE3] + movq mm3,[REF+YSTRIDE3] + /*And finish adding up their SADs.*/ + paddw mm0,mm4 + psadbw mm2,mm3 + paddw mm0,mm6 + paddw mm0,mm2 + movd [ret],mm0 +#undef SRC +#undef REF +#undef YSTRIDE +#undef YSTRIDE3 + } + return (unsigned)ret; +} + +unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src, + const unsigned char *_ref,int _ystride,unsigned _thresh){ + /*Early termination is for suckers.*/ + return oc_enc_frag_sad_mmxext(_src,_ref,_ystride); +} + +#define OC_SAD2_LOOP __asm{ \ + /*We want to compute (mm0+mm1>>1) on unsigned bytes without overflow, but \ + pavgb computes (mm0+mm1+1>>1). \ + The latter is exactly 1 too large when the low bit of two corresponding \ + bytes is only set in one of them. 
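A scalar C sketch of the rounding-down average this comment describes; avg_down is a hypothetical helper that mirrors the pavgb/pxor/pand/psubb sequence used below:

/*pavgb computes (a+b+1)>>1, which is 1 too large exactly when a+b is odd,
  i.e. when the low bits of a and b differ.*/
static unsigned char avg_down(unsigned char _a,unsigned char _b){
  unsigned char avg_up;
  avg_up=(unsigned char)((_a+_b+1)>>1);       /*What pavgb produces.*/
  return (unsigned char)(avg_up-((_a^_b)&1)); /*Correct down to (a+b)>>1.*/
}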
\ + Therefore we pxor the operands, pand to mask out the low bits, and psubb to \ + correct the output of pavgb.*/ \ + __asm movq mm6,mm0 \ + __asm lea REF1,[REF1+YSTRIDE*2] \ + __asm pxor mm0,mm1 \ + __asm pavgb mm6,mm1 \ + __asm lea REF2,[REF2+YSTRIDE*2] \ + __asm movq mm1,mm2 \ + __asm pand mm0,mm7 \ + __asm pavgb mm2,mm3 \ + __asm pxor mm1,mm3 \ + __asm movq mm3,[REF2+YSTRIDE] \ + __asm psubb mm6,mm0 \ + __asm movq mm0,[REF1] \ + __asm pand mm1,mm7 \ + __asm psadbw mm4,mm6 \ + __asm movd mm6,RET \ + __asm psubb mm2,mm1 \ + __asm movq mm1,[REF2] \ + __asm lea SRC,[SRC+YSTRIDE*2] \ + __asm psadbw mm5,mm2 \ + __asm movq mm2,[REF1+YSTRIDE] \ + __asm paddw mm5,mm4 \ + __asm movq mm4,[SRC] \ + __asm paddw mm6,mm5 \ + __asm movq mm5,[SRC+YSTRIDE] \ + __asm movd RET,mm6 \ +} + +/*Same as above, but does not pre-load the next two rows.*/ +#define OC_SAD2_TAIL __asm{ \ + __asm movq mm6,mm0 \ + __asm pavgb mm0,mm1 \ + __asm pxor mm6,mm1 \ + __asm movq mm1,mm2 \ + __asm pand mm6,mm7 \ + __asm pavgb mm2,mm3 \ + __asm pxor mm1,mm3 \ + __asm psubb mm0,mm6 \ + __asm pand mm1,mm7 \ + __asm psadbw mm4,mm0 \ + __asm psubb mm2,mm1 \ + __asm movd mm6,RET \ + __asm psadbw mm5,mm2 \ + __asm paddw mm5,mm4 \ + __asm paddw mm6,mm5 \ + __asm movd RET,mm6 \ +} + +unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, + const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, + unsigned _thresh){ + ptrdiff_t ret; + __asm{ +#define REF1 ecx +#define REF2 edi +#define YSTRIDE esi +#define SRC edx +#define RET eax + mov YSTRIDE,_ystride + mov SRC,_src + mov REF1,_ref1 + mov REF2,_ref2 + movq mm0,[REF1] + movq mm1,[REF2] + movq mm2,[REF1+YSTRIDE] + movq mm3,[REF2+YSTRIDE] + xor RET,RET + movq mm4,[SRC] + pxor mm7,mm7 + pcmpeqb mm6,mm6 + movq mm5,[SRC+YSTRIDE] + psubb mm7,mm6 + OC_SAD2_LOOP + OC_SAD2_LOOP + OC_SAD2_LOOP + OC_SAD2_TAIL + mov [ret],RET +#undef REF1 +#undef REF2 +#undef YSTRIDE +#undef SRC +#undef RET + } + return (unsigned)ret; +} + +/*Load an 8x4 array of pixel values from %[src] and %[ref] and compute their + 16-bit difference in mm0...mm7.*/ +#define OC_LOAD_SUB_8x4(_off) __asm{ \ + __asm movd mm0,[_off+SRC] \ + __asm movd mm4,[_off+REF] \ + __asm movd mm1,[_off+SRC+SRC_YSTRIDE] \ + __asm lea SRC,[SRC+SRC_YSTRIDE*2] \ + __asm movd mm5,[_off+REF+REF_YSTRIDE] \ + __asm lea REF,[REF+REF_YSTRIDE*2] \ + __asm movd mm2,[_off+SRC] \ + __asm movd mm7,[_off+REF] \ + __asm movd mm3,[_off+SRC+SRC_YSTRIDE] \ + __asm movd mm6,[_off+REF+REF_YSTRIDE] \ + __asm punpcklbw mm0,mm4 \ + __asm lea SRC,[SRC+SRC_YSTRIDE*2] \ + __asm punpcklbw mm4,mm4 \ + __asm lea REF,[REF+REF_YSTRIDE*2] \ + __asm psubw mm0,mm4 \ + __asm movd mm4,[_off+SRC] \ + __asm movq [_off*2+BUF],mm0 \ + __asm movd mm0,[_off+REF] \ + __asm punpcklbw mm1,mm5 \ + __asm punpcklbw mm5,mm5 \ + __asm psubw mm1,mm5 \ + __asm movd mm5,[_off+SRC+SRC_YSTRIDE] \ + __asm punpcklbw mm2,mm7 \ + __asm punpcklbw mm7,mm7 \ + __asm psubw mm2,mm7 \ + __asm movd mm7,[_off+REF+REF_YSTRIDE] \ + __asm punpcklbw mm3,mm6 \ + __asm lea SRC,[SRC+SRC_YSTRIDE*2] \ + __asm punpcklbw mm6,mm6 \ + __asm psubw mm3,mm6 \ + __asm movd mm6,[_off+SRC] \ + __asm punpcklbw mm4,mm0 \ + __asm lea REF,[REF+REF_YSTRIDE*2] \ + __asm punpcklbw mm0,mm0 \ + __asm lea SRC,[SRC+SRC_YSTRIDE*2] \ + __asm psubw mm4,mm0 \ + __asm movd mm0,[_off+REF] \ + __asm punpcklbw mm5,mm7 \ + __asm neg SRC_YSTRIDE \ + __asm punpcklbw mm7,mm7 \ + __asm psubw mm5,mm7 \ + __asm movd mm7,[_off+SRC+SRC_YSTRIDE] \ + __asm punpcklbw mm6,mm0 \ + __asm lea REF,[REF+REF_YSTRIDE*2] \ + __asm punpcklbw 
mm0,mm0 \ + __asm neg REF_YSTRIDE \ + __asm psubw mm6,mm0 \ + __asm movd mm0,[_off+REF+REF_YSTRIDE] \ + __asm lea SRC,[SRC+SRC_YSTRIDE*8] \ + __asm punpcklbw mm7,mm0 \ + __asm neg SRC_YSTRIDE \ + __asm punpcklbw mm0,mm0 \ + __asm lea REF,[REF+REF_YSTRIDE*8] \ + __asm psubw mm7,mm0 \ + __asm neg REF_YSTRIDE \ + __asm movq mm0,[_off*2+BUF] \ +} + +/*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/ +#define OC_LOAD_8x4(_off) __asm{ \ + __asm movd mm0,[_off+SRC] \ + __asm movd mm1,[_off+SRC+YSTRIDE] \ + __asm movd mm2,[_off+SRC+YSTRIDE*2] \ + __asm pxor mm7,mm7 \ + __asm movd mm3,[_off+SRC+YSTRIDE3] \ + __asm punpcklbw mm0,mm7 \ + __asm movd mm4,[_off+SRC4] \ + __asm punpcklbw mm1,mm7 \ + __asm movd mm5,[_off+SRC4+YSTRIDE] \ + __asm punpcklbw mm2,mm7 \ + __asm movd mm6,[_off+SRC4+YSTRIDE*2] \ + __asm punpcklbw mm3,mm7 \ + __asm movd mm7,[_off+SRC4+YSTRIDE3] \ + __asm punpcklbw mm4,mm4 \ + __asm punpcklbw mm5,mm5 \ + __asm psrlw mm4,8 \ + __asm psrlw mm5,8 \ + __asm punpcklbw mm6,mm6 \ + __asm punpcklbw mm7,mm7 \ + __asm psrlw mm6,8 \ + __asm psrlw mm7,8 \ +} + +/*Performs the first two stages of an 8-point 1-D Hadamard transform. + The transform is performed in place, except that outputs 0-3 are swapped with + outputs 4-7. + Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to + perform this stage in place with no temporary registers).*/ +#define OC_HADAMARD_AB_8x4 __asm{ \ + /*Stage A: \ + Outputs 0-3 are swapped with 4-7 here.*/ \ + __asm paddw mm5,mm1 \ + __asm paddw mm6,mm2 \ + __asm paddw mm1,mm1 \ + __asm paddw mm2,mm2 \ + __asm psubw mm1,mm5 \ + __asm psubw mm2,mm6 \ + __asm paddw mm7,mm3 \ + __asm paddw mm4,mm0 \ + __asm paddw mm3,mm3 \ + __asm paddw mm0,mm0 \ + __asm psubw mm3,mm7 \ + __asm psubw mm0,mm4 \ + /*Stage B:*/ \ + __asm paddw mm0,mm2 \ + __asm paddw mm1,mm3 \ + __asm paddw mm4,mm6 \ + __asm paddw mm5,mm7 \ + __asm paddw mm2,mm2 \ + __asm paddw mm3,mm3 \ + __asm paddw mm6,mm6 \ + __asm paddw mm7,mm7 \ + __asm psubw mm2,mm0 \ + __asm psubw mm3,mm1 \ + __asm psubw mm6,mm4 \ + __asm psubw mm7,mm5 \ +} + +/*Performs the last stage of an 8-point 1-D Hadamard transform in place. + Ouputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in + place with no temporary registers).*/ +#define OC_HADAMARD_C_8x4 __asm{ \ + /*Stage C:*/ \ + __asm paddw mm0,mm1 \ + __asm paddw mm2,mm3 \ + __asm paddw mm4,mm5 \ + __asm paddw mm6,mm7 \ + __asm paddw mm1,mm1 \ + __asm paddw mm3,mm3 \ + __asm paddw mm5,mm5 \ + __asm paddw mm7,mm7 \ + __asm psubw mm1,mm0 \ + __asm psubw mm3,mm2 \ + __asm psubw mm5,mm4 \ + __asm psubw mm7,mm6 \ +} + +/*Performs an 8-point 1-D Hadamard transform. + The transform is performed in place, except that outputs 0-3 are swapped with + outputs 4-7. + Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform + in place with no temporary registers).*/ +#define OC_HADAMARD_8x4 __asm{ \ + OC_HADAMARD_AB_8x4 \ + OC_HADAMARD_C_8x4 \ +} + +/*Performs the first part of the final stage of the Hadamard transform and + summing of absolute values. + At the end of this part, mm1 will contain the DC coefficient of the + transform.*/ +#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) __asm{ \ + /*We use the fact that \ + (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \ + to merge the final butterfly with the abs and the first stage of \ + accumulation. \ + Thus we can avoid using pabsw, which is not available until SSSE3. 
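A small C check of the identity this comment relies on; oc_maxabs_identity_holds is a hypothetical function for illustration only:

#include <stdlib.h>

/*For any integers a and b, abs(a+b)+abs(a-b)==2*max(abs(a),abs(b)), so a
  single pmaxsw can stand in for two absolute values in the final butterfly
  stage.*/
static int oc_maxabs_identity_holds(int _a,int _b){
  int lhs;
  int rhs;
  lhs=abs(_a+_b)+abs(_a-_b);
  rhs=2*(abs(_a)>abs(_b)?abs(_a):abs(_b));
  return lhs==rhs;
}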
\ + Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \ + implementation would be (3+3)*8+7=55 instructions (+4 for spilling \ + registers). \ + Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \ + This implementation is only 26 (+4 for spilling registers).*/ \ + __asm movq [_r7+BUF],mm7 \ + __asm movq [_r6+BUF],mm6 \ + /*mm7={0x7FFF}x4 \ + mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \ + __asm pcmpeqb mm7,mm7 \ + __asm movq mm6,mm0 \ + __asm psrlw mm7,1 \ + __asm paddw mm6,mm1 \ + __asm pmaxsw mm0,mm1 \ + __asm paddsw mm6,mm7 \ + __asm psubw mm0,mm6 \ + /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \ + mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \ + __asm movq mm6,mm2 \ + __asm movq mm1,mm4 \ + __asm pmaxsw mm2,mm3 \ + __asm pmaxsw mm4,mm5 \ + __asm paddw mm6,mm3 \ + __asm paddw mm1,mm5 \ + __asm movq mm3,[_r7+BUF] \ +} + +/*Performs the second part of the final stage of the Hadamard transform and + summing of absolute values.*/ +#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) __asm{ \ + __asm paddsw mm6,mm7 \ + __asm movq mm5,[_r6+BUF] \ + __asm paddsw mm1,mm7 \ + __asm psubw mm2,mm6 \ + __asm psubw mm4,mm1 \ + /*mm7={1}x4 (needed for the horizontal add that follows) \ + mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \ + __asm movq mm6,mm3 \ + __asm pmaxsw mm3,mm5 \ + __asm paddw mm0,mm2 \ + __asm paddw mm6,mm5 \ + __asm paddw mm0,mm4 \ + __asm paddsw mm6,mm7 \ + __asm paddw mm0,mm3 \ + __asm psrlw mm7,14 \ + __asm psubw mm0,mm6 \ +} + +/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the + absolute value of each component, and accumulates everything into mm0. + This is the only portion of SATD which requires MMXEXT (we could use plain + MMX, but it takes 4 instructions and an extra register to work around the + lack of a pmaxsw, which is a pretty serious penalty).*/ +#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) __asm{ \ + OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \ + OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \ +} + +/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each + component, and accumulates everything into mm0. + Note that mm0 will have an extra 4 added to each column, and that after + removing this value, the remainder will be half the conventional value.*/ +#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) __asm{ \ + OC_HADAMARD_AB_8x4 \ + OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \ +} + +/*Performs two 4x4 transposes (mostly) in place. + On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7} + contains rows {a,b,c,d}. 
+ On output, {0x40,0x50,0x60,0x70}+_off+BUF contains {e,f,g,h}^T, and + {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/ +#define OC_TRANSPOSE_4x4x2(_off) __asm{ \ + /*First 4x4 transpose:*/ \ + __asm movq [0x10+_off+BUF],mm5 \ + /*mm0 = e3 e2 e1 e0 \ + mm1 = f3 f2 f1 f0 \ + mm2 = g3 g2 g1 g0 \ + mm3 = h3 h2 h1 h0*/ \ + __asm movq mm5,mm2 \ + __asm punpcklwd mm2,mm3 \ + __asm punpckhwd mm5,mm3 \ + __asm movq mm3,mm0 \ + __asm punpcklwd mm0,mm1 \ + __asm punpckhwd mm3,mm1 \ + /*mm0 = f1 e1 f0 e0 \ + mm3 = f3 e3 f2 e2 \ + mm2 = h1 g1 h0 g0 \ + mm5 = h3 g3 h2 g2*/ \ + __asm movq mm1,mm0 \ + __asm punpckldq mm0,mm2 \ + __asm punpckhdq mm1,mm2 \ + __asm movq mm2,mm3 \ + __asm punpckhdq mm3,mm5 \ + __asm movq [0x40+_off+BUF],mm0 \ + __asm punpckldq mm2,mm5 \ + /*mm0 = h0 g0 f0 e0 \ + mm1 = h1 g1 f1 e1 \ + mm2 = h2 g2 f2 e2 \ + mm3 = h3 g3 f3 e3*/ \ + __asm movq mm5,[0x10+_off+BUF] \ + /*Second 4x4 transpose:*/ \ + /*mm4 = a3 a2 a1 a0 \ + mm5 = b3 b2 b1 b0 \ + mm6 = c3 c2 c1 c0 \ + mm7 = d3 d2 d1 d0*/ \ + __asm movq mm0,mm6 \ + __asm punpcklwd mm6,mm7 \ + __asm movq [0x50+_off+BUF],mm1 \ + __asm punpckhwd mm0,mm7 \ + __asm movq mm7,mm4 \ + __asm punpcklwd mm4,mm5 \ + __asm movq [0x60+_off+BUF],mm2 \ + __asm punpckhwd mm7,mm5 \ + /*mm4 = b1 a1 b0 a0 \ + mm7 = b3 a3 b2 a2 \ + mm6 = d1 c1 d0 c0 \ + mm0 = d3 c3 d2 c2*/ \ + __asm movq mm5,mm4 \ + __asm punpckldq mm4,mm6 \ + __asm movq [0x70+_off+BUF],mm3 \ + __asm punpckhdq mm5,mm6 \ + __asm movq mm6,mm7 \ + __asm punpckhdq mm7,mm0 \ + __asm punpckldq mm6,mm0 \ + /*mm4 = d0 c0 b0 a0 \ + mm5 = d1 c1 b1 a1 \ + mm6 = d2 c2 b2 a2 \ + mm7 = d3 c3 b3 a3*/ \ +} + +static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src, + int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){ + OC_ALIGN8(ogg_int16_t buf[64]); + ogg_int16_t *bufp; + unsigned ret1; + unsigned ret2; + bufp=buf; + __asm{ +#define SRC esi +#define REF eax +#define SRC_YSTRIDE ecx +#define REF_YSTRIDE edx +#define BUF edi +#define RET eax +#define RET2 edx + mov SRC,_src + mov SRC_YSTRIDE,_src_ystride + mov REF,_ref + mov REF_YSTRIDE,_ref_ystride + mov BUF,bufp + OC_LOAD_SUB_8x4(0x00) + OC_HADAMARD_8x4 + OC_TRANSPOSE_4x4x2(0x00) + /*Finish swapping out this 8x4 block to make room for the next one. + mm0...mm3 have been swapped out already.*/ + movq [0x00+BUF],mm4 + movq [0x10+BUF],mm5 + movq [0x20+BUF],mm6 + movq [0x30+BUF],mm7 + OC_LOAD_SUB_8x4(0x04) + OC_HADAMARD_8x4 + OC_TRANSPOSE_4x4x2(0x08) + /*Here the first 4x4 block of output from the last transpose is the second + 4x4 block of input for the next transform. + We have cleverly arranged that it already be in the appropriate place, so + we only have to do half the loads.*/ + movq mm1,[0x10+BUF] + movq mm2,[0x20+BUF] + movq mm3,[0x30+BUF] + movq mm0,[0x00+BUF] + OC_HADAMARD_ABS_ACCUM_8x4(0x28,0x38) + /*Up to this point, everything fit in 16 bits (8 input + 1 for the + difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1 + for the factor of two we dropped + 3 for the vertical accumulation). + Now we finally have to promote things to dwords. 
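For orientation, a plain-C sketch of the kind of quantity the surrounding routine computes: the sum of absolute values of a 2-D Hadamard transform of the source/reference difference. The names oc_hadamard8 and oc_satd8x8_ref are hypothetical, and the normalization here is the straightforward one; the MMX code, per its own comments, drops a factor of two and carries a bias of 4 per column that it corrects below.

#include <stdlib.h>

/*Unnormalized 8-point Walsh-Hadamard transform, in place.*/
static void oc_hadamard8(int _x[8]){
  int len;
  int i;
  int j;
  for(len=1;len<8;len<<=1){
    for(i=0;i<8;i+=len<<1){
      for(j=i;j<i+len;j++){
        int a;
        int b;
        a=_x[j];
        b=_x[j+len];
        _x[j]=a+b;
        _x[j+len]=a-b;
      }
    }
  }
}

static unsigned oc_satd8x8_ref(const unsigned char *_src,int _src_ystride,
 const unsigned char *_ref,int _ref_ystride){
  int d[64];
  int col[8];
  unsigned satd;
  int i;
  int j;
  /*16-bit differences, as in OC_LOAD_SUB_8x4.*/
  for(i=0;i<8;i++)for(j=0;j<8;j++){
    d[i*8+j]=_src[i*_src_ystride+j]-_ref[i*_ref_ystride+j];
  }
  /*1-D Hadamard on each row, then on each column.*/
  for(i=0;i<8;i++)oc_hadamard8(d+i*8);
  for(j=0;j<8;j++){
    for(i=0;i<8;i++)col[i]=d[i*8+j];
    oc_hadamard8(col);
    for(i=0;i<8;i++)d[i*8+j]=col[i];
  }
  satd=0;
  for(i=0;i<64;i++)satd+=(unsigned)abs(d[i]);
  return satd;
}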
+ We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long + latency of pmaddwd by starting the next series of loads now.*/ + mov RET2,_thresh + pmaddwd mm0,mm7 + movq mm1,[0x50+BUF] + movq mm5,[0x58+BUF] + movq mm4,mm0 + movq mm2,[0x60+BUF] + punpckhdq mm0,mm0 + movq mm6,[0x68+BUF] + paddd mm4,mm0 + movq mm3,[0x70+BUF] + movd RET,mm4 + movq mm7,[0x78+BUF] + /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4 + added to them, and a factor of two removed; correct the final sum here.*/ + lea RET,[RET+RET-32] + movq mm0,[0x40+BUF] + cmp RET,RET2 + movq mm4,[0x48+BUF] + jae at_end + OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78) + pmaddwd mm0,mm7 + /*There isn't much to stick in here to hide the latency this time, but the + alternative to pmaddwd is movq->punpcklwd->punpckhwd->paddd, whose + latency is even worse.*/ + sub RET,32 + movq mm4,mm0 + punpckhdq mm0,mm0 + paddd mm4,mm0 + movd RET2,mm4 + lea RET,[RET+RET2*2] + align 16 +at_end: + mov ret1,RET +#undef SRC +#undef REF +#undef SRC_YSTRIDE +#undef REF_YSTRIDE +#undef BUF +#undef RET +#undef RET2 + } + return ret1; +} + +unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src, + const unsigned char *_ref,int _ystride,unsigned _thresh){ + return oc_int_frag_satd_thresh_mmxext(_src,_ystride,_ref,_ystride,_thresh); +} + + +/*Our internal implementation of frag_copy2 takes an extra stride parameter so + we can share code with oc_enc_frag_satd2_thresh_mmxext().*/ +static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride, + const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){ + __asm{ + /*Load the first 3 rows.*/ +#define DST_YSTRIDE edi +#define SRC_YSTRIDE esi +#define DST eax +#define SRC1 edx +#define SRC2 ecx + mov DST_YSTRIDE,_dst_ystride + mov SRC_YSTRIDE,_src_ystride + mov DST,_dst + mov SRC1,_src1 + mov SRC2,_src2 + movq mm0,[SRC1] + movq mm1,[SRC2] + movq mm2,[SRC1+SRC_YSTRIDE] + lea SRC1,[SRC1+SRC_YSTRIDE*2] + movq mm3,[SRC2+SRC_YSTRIDE] + lea SRC2,[SRC2+SRC_YSTRIDE*2] + pxor mm7,mm7 + movq mm4,[SRC1] + pcmpeqb mm6,mm6 + movq mm5,[SRC2] + /*mm7={1}x8.*/ + psubb mm7,mm6 + /*Start averaging mm0 and mm1 into mm6.*/ + movq mm6,mm0 + pxor mm0,mm1 + pavgb mm6,mm1 + /*mm1 is free, start averaging mm3 into mm2 using mm1.*/ + movq mm1,mm2 + pand mm0,mm7 + pavgb mm2,mm3 + pxor mm1,mm3 + /*mm3 is free.*/ + psubb mm6,mm0 + /*mm0 is free, start loading the next row.*/ + movq mm0,[SRC1+SRC_YSTRIDE] + /*Start averaging mm5 and mm4 using mm3.*/ + movq mm3,mm4 + /*mm6 [row 0] is done; write it out.*/ + movq [DST],mm6 + pand mm1,mm7 + pavgb mm4,mm5 + psubb mm2,mm1 + /*mm1 is free, continue loading the next row.*/ + movq mm1,[SRC2+SRC_YSTRIDE] + pxor mm3,mm5 + lea SRC1,[SRC1+SRC_YSTRIDE*2] + /*mm2 [row 1] is done; write it out.*/ + movq [DST+DST_YSTRIDE],mm2 + pand mm3,mm7 + /*Start loading the next row.*/ + movq mm2,[SRC1] + lea DST,[DST+DST_YSTRIDE*2] + psubb mm4,mm3 + lea SRC2,[SRC2+SRC_YSTRIDE*2] + /*mm4 [row 2] is done; write it out.*/ + movq [DST],mm4 + /*Continue loading the next row.*/ + movq mm3,[SRC2] + /*Start averaging mm0 and mm1 into mm6.*/ + movq mm6,mm0 + pxor mm0,mm1 + /*Start loading the next row.*/ + movq mm4,[SRC1+SRC_YSTRIDE] + pavgb mm6,mm1 + /*mm1 is free; start averaging mm3 into mm2 using mm1.*/ + movq mm1,mm2 + pand mm0,mm7 + /*Continue loading the next row.*/ + movq mm5,[SRC2+SRC_YSTRIDE] + pavgb mm2,mm3 + lea SRC1,[SRC1+SRC_YSTRIDE*2] + pxor mm1,mm3 + /*mm3 is free.*/ + psubb mm6,mm0 + /*mm0 is free, start loading the next row.*/ + movq mm0,[SRC1] + /*Start 
averaging mm5 into mm4 using mm3.*/ + movq mm3,mm4 + /*mm6 [row 3] is done; write it out.*/ + movq [DST+DST_YSTRIDE],mm6 + pand mm1,mm7 + lea SRC2,[SRC2+SRC_YSTRIDE*2] + pavgb mm4,mm5 + lea DST,[DST+DST_YSTRIDE*2] + psubb mm2,mm1 + /*mm1 is free; continue loading the next row.*/ + movq mm1,[SRC2] + pxor mm3,mm5 + /*mm2 [row 4] is done; write it out.*/ + movq [DST],mm2 + pand mm3,mm7 + /*Start loading the next row.*/ + movq mm2,[SRC1+SRC_YSTRIDE] + psubb mm4,mm3 + /*Start averaging mm0 and mm1 into mm6.*/ + movq mm6,mm0 + /*Continue loading the next row.*/ + movq mm3,[SRC2+SRC_YSTRIDE] + /*mm4 [row 5] is done; write it out.*/ + movq [DST+DST_YSTRIDE],mm4 + pxor mm0,mm1 + pavgb mm6,mm1 + /*mm4 is free; start averaging mm3 into mm2 using mm4.*/ + movq mm4,mm2 + pand mm0,mm7 + pavgb mm2,mm3 + pxor mm4,mm3 + lea DST,[DST+DST_YSTRIDE*2] + psubb mm6,mm0 + pand mm4,mm7 + /*mm6 [row 6] is done, write it out.*/ + movq [DST],mm6 + psubb mm2,mm4 + /*mm2 [row 7] is done, write it out.*/ + movq [DST+DST_YSTRIDE],mm2 +#undef SRC1 +#undef SRC2 +#undef SRC_YSTRIDE +#undef DST_YSTRIDE +#undef DST + } +} + +unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src, + const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, + unsigned _thresh){ + OC_ALIGN8(unsigned char ref[64]); + oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride); + return oc_int_frag_satd_thresh_mmxext(_src,_ystride,ref,8,_thresh); +} + +unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src, + int _ystride){ + OC_ALIGN8(ogg_int16_t buf[64]); + ogg_int16_t *bufp; + unsigned ret1; + unsigned ret2; + bufp=buf; + __asm{ +#define SRC eax +#define SRC4 esi +#define BUF edi +#define RET eax +#define RET_WORD ax +#define RET2 ecx +#define YSTRIDE edx +#define YSTRIDE3 ecx + mov SRC,_src + mov BUF,bufp + mov YSTRIDE,_ystride + /* src4 = src+4*ystride */ + lea SRC4,[SRC+YSTRIDE*4] + /* ystride3 = 3*ystride */ + lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] + OC_LOAD_8x4(0x00) + OC_HADAMARD_8x4 + OC_TRANSPOSE_4x4x2(0x00) + /*Finish swapping out this 8x4 block to make room for the next one. + mm0...mm3 have been swapped out already.*/ + movq [0x00+BUF],mm4 + movq [0x10+BUF],mm5 + movq [0x20+BUF],mm6 + movq [0x30+BUF],mm7 + OC_LOAD_8x4(0x04) + OC_HADAMARD_8x4 + OC_TRANSPOSE_4x4x2(0x08) + /*Here the first 4x4 block of output from the last transpose is the second + 4x4 block of input for the next transform. + We have cleverly arranged that it already be in the appropriate place, so + we only have to do half the loads.*/ + movq mm1,[0x10+BUF] + movq mm2,[0x20+BUF] + movq mm3,[0x30+BUF] + movq mm0,[0x00+BUF] + /*We split out the stages here so we can save the DC coefficient in the + middle.*/ + OC_HADAMARD_AB_8x4 + OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38) + movd RET,mm1 + OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38) + /*Up to this point, everything fit in 16 bits (8 input + 1 for the + difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1 + for the factor of two we dropped + 3 for the vertical accumulation). + Now we finally have to promote things to dwords. 
+ We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long + latency of pmaddwd by starting the next series of loads now.*/ + pmaddwd mm0,mm7 + movq mm1,[0x50+BUF] + movq mm5,[0x58+BUF] + movq mm2,[0x60+BUF] + movq mm4,mm0 + movq mm6,[0x68+BUF] + punpckhdq mm0,mm0 + movq mm3,[0x70+BUF] + paddd mm4,mm0 + movq mm7,[0x78+BUF] + movd RET2,mm4 + movq mm0,[0x40+BUF] + movq mm4,[0x48+BUF] + OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78) + pmaddwd mm0,mm7 + /*We assume that the DC coefficient is always positive (which is true, + because the input to the INTRA transform was not a difference).*/ + movzx RET,RET_WORD + add RET2,RET2 + sub RET2,RET + movq mm4,mm0 + punpckhdq mm0,mm0 + paddd mm4,mm0 + movd RET,mm4 + lea RET,[-64+RET2+RET*2] + mov [ret1],RET +#undef SRC +#undef SRC4 +#undef BUF +#undef RET +#undef RET_WORD +#undef RET2 +#undef YSTRIDE +#undef YSTRIDE3 + } + return ret1; +} + +void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64], + const unsigned char *_src, const unsigned char *_ref,int _ystride){ + int i; + __asm pxor mm7,mm7 + for(i=4;i-->0;){ + __asm{ +#define SRC edx +#define YSTRIDE esi +#define RESIDUE eax +#define REF ecx + mov YSTRIDE,_ystride + mov RESIDUE,_residue + mov SRC,_src + mov REF,_ref + /*mm0=[src]*/ + movq mm0,[SRC] + /*mm1=[ref]*/ + movq mm1,[REF] + /*mm4=[src+ystride]*/ + movq mm4,[SRC+YSTRIDE] + /*mm5=[ref+ystride]*/ + movq mm5,[REF+YSTRIDE] + /*Compute [src]-[ref].*/ + movq mm2,mm0 + punpcklbw mm0,mm7 + movq mm3,mm1 + punpckhbw mm2,mm7 + punpcklbw mm1,mm7 + punpckhbw mm3,mm7 + psubw mm0,mm1 + psubw mm2,mm3 + /*Compute [src+ystride]-[ref+ystride].*/ + movq mm1,mm4 + punpcklbw mm4,mm7 + movq mm3,mm5 + punpckhbw mm1,mm7 + lea SRC,[SRC+YSTRIDE*2] + punpcklbw mm5,mm7 + lea REF,[REF+YSTRIDE*2] + punpckhbw mm3,mm7 + psubw mm4,mm5 + psubw mm1,mm3 + /*Write the answer out.*/ + movq [RESIDUE+0x00],mm0 + movq [RESIDUE+0x08],mm2 + movq [RESIDUE+0x10],mm4 + movq [RESIDUE+0x18],mm1 + lea RESIDUE,[RESIDUE+0x20] + mov _residue,RESIDUE + mov _src,SRC + mov _ref,REF +#undef SRC +#undef YSTRIDE +#undef RESIDUE +#undef REF + } + } +} + +void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64], + const unsigned char *_src,int _ystride){ + __asm{ +#define YSTRIDE edx +#define YSTRIDE3 edi +#define RESIDUE ecx +#define SRC eax + mov YSTRIDE,_ystride + mov RESIDUE,_residue + mov SRC,_src + /*mm0=[src]*/ + movq mm0,[SRC] + /*mm1=[src+ystride]*/ + movq mm1,[SRC+YSTRIDE] + /*mm6={-1}x4*/ + pcmpeqw mm6,mm6 + /*mm2=[src+2*ystride]*/ + movq mm2,[SRC+YSTRIDE*2] + /*[ystride3]=3*[ystride]*/ + lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] + /*mm6={1}x4*/ + psllw mm6,15 + /*mm3=[src+3*ystride]*/ + movq mm3,[SRC+YSTRIDE3] + /*mm6={128}x4*/ + psrlw mm6,8 + /*mm7=0*/ + pxor mm7,mm7 + /*[src]=[src]+4*[ystride]*/ + lea SRC,[SRC+YSTRIDE*4] + /*Compute [src]-128 and [src+ystride]-128*/ + movq mm4,mm0 + punpcklbw mm0,mm7 + movq mm5,mm1 + punpckhbw mm4,mm7 + psubw mm0,mm6 + punpcklbw mm1,mm7 + psubw mm4,mm6 + punpckhbw mm5,mm7 + psubw mm1,mm6 + psubw mm5,mm6 + /*Write the answer out.*/ + movq [RESIDUE+0x00],mm0 + movq [RESIDUE+0x08],mm4 + movq [RESIDUE+0x10],mm1 + movq [RESIDUE+0x18],mm5 + /*mm0=[src+4*ystride]*/ + movq mm0,[SRC] + /*mm1=[src+5*ystride]*/ + movq mm1,[SRC+YSTRIDE] + /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/ + movq mm4,mm2 + punpcklbw mm2,mm7 + movq mm5,mm3 + punpckhbw mm4,mm7 + psubw mm2,mm6 + punpcklbw mm3,mm7 + psubw mm4,mm6 + punpckhbw mm5,mm7 + psubw mm3,mm6 + psubw mm5,mm6 + /*Write the answer out.*/ + movq [RESIDUE+0x20],mm2 + movq [RESIDUE+0x28],mm4 + movq [RESIDUE+0x30],mm3 + movq 
[RESIDUE+0x38],mm5 + /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/ + movq mm2,[SRC+YSTRIDE*2] + movq mm3,[SRC+YSTRIDE3] + movq mm4,mm0 + punpcklbw mm0,mm7 + movq mm5,mm1 + punpckhbw mm4,mm7 + psubw mm0,mm6 + punpcklbw mm1,mm7 + psubw mm4,mm6 + punpckhbw mm5,mm7 + psubw mm1,mm6 + psubw mm5,mm6 + /*Write the answer out.*/ + movq [RESIDUE+0x40],mm0 + movq [RESIDUE+0x48],mm4 + movq [RESIDUE+0x50],mm1 + movq [RESIDUE+0x58],mm5 + /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/ + movq mm4,mm2 + punpcklbw mm2,mm7 + movq mm5,mm3 + punpckhbw mm4,mm7 + psubw mm2,mm6 + punpcklbw mm3,mm7 + psubw mm4,mm6 + punpckhbw mm5,mm7 + psubw mm3,mm6 + psubw mm5,mm6 + /*Write the answer out.*/ + movq [RESIDUE+0x60],mm2 + movq [RESIDUE+0x68],mm4 + movq [RESIDUE+0x70],mm3 + movq [RESIDUE+0x78],mm5 +#undef YSTRIDE +#undef YSTRIDE3 +#undef RESIDUE +#undef SRC + } +} + +void oc_enc_frag_copy2_mmxext(unsigned char *_dst, + const unsigned char *_src1,const unsigned char *_src2,int _ystride){ + oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride); +} + +#endif diff --git a/thirdparty/libtheora/x86_vc/mmxfdct.c b/thirdparty/libtheora/x86_vc/mmxfdct.c new file mode 100644 index 0000000000..dcf17c9fa7 --- /dev/null +++ b/thirdparty/libtheora/x86_vc/mmxfdct.c @@ -0,0 +1,670 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006 * + * by the Xiph.Org Foundation http://www.xiph.org/ * + * * + ********************************************************************/ + /*MMX fDCT implementation for x86_32*/ +/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/ +#include "x86enc.h" + +#if defined(OC_X86_ASM) + +#define OC_FDCT_STAGE1_8x4 __asm{ \ + /*Stage 1:*/ \ + /*mm0=t7'=t0-t7*/ \ + __asm psubw mm0,mm7 \ + __asm paddw mm7,mm7 \ + /*mm1=t6'=t1-t6*/ \ + __asm psubw mm1, mm6 \ + __asm paddw mm6,mm6 \ + /*mm2=t5'=t2-t5*/ \ + __asm psubw mm2,mm5 \ + __asm paddw mm5,mm5 \ + /*mm3=t4'=t3-t4*/ \ + __asm psubw mm3,mm4 \ + __asm paddw mm4,mm4 \ + /*mm7=t0'=t0+t7*/ \ + __asm paddw mm7,mm0 \ + /*mm6=t1'=t1+t6*/ \ + __asm paddw mm6,mm1 \ + /*mm5=t2'=t2+t5*/ \ + __asm paddw mm5,mm2 \ + /*mm4=t3'=t3+t4*/ \ + __asm paddw mm4,mm3\ +} + +#define OC_FDCT8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) __asm{ \ + /*Stage 2:*/ \ + /*mm7=t3''=t0'-t3'*/ \ + __asm psubw mm7,mm4 \ + __asm paddw mm4,mm4 \ + /*mm6=t2''=t1'-t2'*/ \ + __asm psubw mm6,mm5 \ + __asm movq [Y+_r6],mm7 \ + __asm paddw mm5,mm5 \ + /*mm1=t5''=t6'-t5'*/ \ + __asm psubw mm1,mm2 \ + __asm movq [Y+_r2],mm6 \ + /*mm4=t0''=t0'+t3'*/ \ + __asm paddw mm4,mm7 \ + __asm paddw mm2,mm2 \ + /*mm5=t1''=t1'+t2'*/ \ + __asm movq [Y+_r0],mm4 \ + __asm paddw mm5,mm6 \ + /*mm2=t6''=t6'+t5'*/ \ + __asm paddw mm2,mm1 \ + __asm movq [Y+_r4],mm5 \ + /*mm0=t7', mm1=t5'', mm2=t6'', mm3=t4'.*/ \ + /*mm4, mm5, mm6, mm7 are free.*/ \ + /*Stage 3:*/ \ + /*mm6={2}x4, mm7={27146,0xB500>>1}x2*/ \ + __asm mov A,0x5A806A0A \ + __asm pcmpeqb mm6,mm6 \ + __asm movd mm7,A \ + __asm psrlw mm6,15 \ + __asm punpckldq mm7,mm7 \ + __asm paddw mm6,mm6 \ + /*mm0=0, m2={-1}x4 \ + mm5:mm4=t5''*27146+0xB500*/ \ + __asm movq mm4,mm1 \ + __asm movq mm5,mm1 \ + __asm punpcklwd mm4,mm6 \ + __asm movq [Y+_r3],mm2 \ + __asm pmaddwd mm4,mm7 \ + __asm movq 
[Y+_r7],mm0 \ + __asm punpckhwd mm5,mm6 \ + __asm pxor mm0,mm0 \ + __asm pmaddwd mm5,mm7 \ + __asm pcmpeqb mm2,mm2 \ + /*mm2=t6'', mm1=t5''+(t5''!=0) \ + mm4=(t5''*27146+0xB500>>16)*/ \ + __asm pcmpeqw mm0,mm1 \ + __asm psrad mm4,16 \ + __asm psubw mm0,mm2 \ + __asm movq mm2, [Y+_r3] \ + __asm psrad mm5,16 \ + __asm paddw mm1,mm0 \ + __asm packssdw mm4,mm5 \ + /*mm4=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \ + __asm paddw mm4,mm1 \ + __asm movq mm0, [Y+_r7] \ + __asm psraw mm4,1 \ + __asm movq mm1,mm3 \ + /*mm3=t4''=t4'+s*/ \ + __asm paddw mm3,mm4 \ + /*mm1=t5'''=t4'-s*/ \ + __asm psubw mm1,mm4 \ + /*mm1=0, mm3={-1}x4 \ + mm5:mm4=t6''*27146+0xB500*/ \ + __asm movq mm4,mm2 \ + __asm movq mm5,mm2 \ + __asm punpcklwd mm4,mm6 \ + __asm movq [Y+_r5],mm1 \ + __asm pmaddwd mm4,mm7 \ + __asm movq [Y+_r1],mm3 \ + __asm punpckhwd mm5,mm6 \ + __asm pxor mm1,mm1 \ + __asm pmaddwd mm5,mm7 \ + __asm pcmpeqb mm3,mm3 \ + /*mm2=t6''+(t6''!=0), mm4=(t6''*27146+0xB500>>16)*/ \ + __asm psrad mm4,16 \ + __asm pcmpeqw mm1,mm2 \ + __asm psrad mm5,16 \ + __asm psubw mm1,mm3 \ + __asm packssdw mm4,mm5 \ + __asm paddw mm2,mm1 \ + /*mm1=t1'' \ + mm4=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \ + __asm paddw mm4,mm2 \ + __asm movq mm1,[Y+_r4] \ + __asm psraw mm4,1 \ + __asm movq mm2,mm0 \ + /*mm7={54491-0x7FFF,0x7FFF}x2 \ + mm0=t7''=t7'+s*/ \ + __asm paddw mm0,mm4 \ + /*mm2=t6'''=t7'-s*/ \ + __asm psubw mm2,mm4 \ + /*Stage 4:*/ \ + /*mm0=0, mm2=t0'' \ + mm5:mm4=t1''*27146+0xB500*/ \ + __asm movq mm4,mm1 \ + __asm movq mm5,mm1 \ + __asm punpcklwd mm4,mm6 \ + __asm movq [Y+_r3],mm2 \ + __asm pmaddwd mm4,mm7 \ + __asm movq mm2,[Y+_r0] \ + __asm punpckhwd mm5,mm6 \ + __asm movq [Y+_r7],mm0 \ + __asm pmaddwd mm5,mm7 \ + __asm pxor mm0,mm0 \ + /*mm7={27146,0x4000>>1}x2 \ + mm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \ + __asm psrad mm4,16 \ + __asm mov A,0x20006A0A \ + __asm pcmpeqw mm0,mm1 \ + __asm movd mm7,A \ + __asm psrad mm5,16 \ + __asm psubw mm0,mm3 \ + __asm packssdw mm4,mm5 \ + __asm paddw mm0,mm1 \ + __asm punpckldq mm7,mm7 \ + __asm paddw mm0,mm4 \ + /*mm6={0x00000E3D}x2 \ + mm1=-(t0''==0), mm5:mm4=t0''*27146+0x4000*/ \ + __asm movq mm4,mm2 \ + __asm movq mm5,mm2 \ + __asm punpcklwd mm4,mm6 \ + __asm mov A,0x0E3D \ + __asm pmaddwd mm4,mm7 \ + __asm punpckhwd mm5,mm6 \ + __asm movd mm6,A \ + __asm pmaddwd mm5,mm7 \ + __asm pxor mm1,mm1 \ + __asm punpckldq mm6,mm6 \ + __asm pcmpeqw mm1,mm2 \ + /*mm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \ + __asm psrad mm4,16 \ + __asm psubw mm1,mm3 \ + __asm psrad mm5,16 \ + __asm paddw mm2,mm1 \ + __asm packssdw mm4,mm5 \ + __asm movq mm1,[Y+_r5] \ + __asm paddw mm4,mm2 \ + /*mm2=t6'', mm0=_y[0]=u=r+s>>1 \ + The naive implementation could cause overflow, so we use \ + u=(r&s)+((r^s)>>1).*/ \ + __asm movq mm2,[Y+_r3] \ + __asm movq mm7,mm0 \ + __asm pxor mm0,mm4 \ + __asm pand mm7,mm4 \ + __asm psraw mm0,1 \ + __asm mov A,0x7FFF54DC \ + __asm paddw mm0,mm7 \ + __asm movd mm7,A \ + /*mm7={54491-0x7FFF,0x7FFF}x2 \ + mm4=_y[4]=v=r-u*/ \ + __asm psubw mm4,mm0 \ + __asm punpckldq mm7,mm7 \ + __asm movq [Y+_r4],mm4 \ + /*mm0=0, mm7={36410}x4 \ + mm1=(t5'''!=0), mm5:mm4=54491*t5'''+0x0E3D*/ \ + __asm movq mm4,mm1 \ + __asm movq mm5,mm1 \ + __asm punpcklwd mm4,mm1 \ + __asm mov A,0x8E3A8E3A \ + __asm pmaddwd mm4,mm7 \ + __asm movq [Y+_r0],mm0 \ + __asm punpckhwd mm5,mm1 \ + __asm pxor mm0,mm0 \ + __asm pmaddwd mm5,mm7 \ + __asm pcmpeqw mm1,mm0 \ + __asm movd mm7,A \ + __asm psubw mm1,mm3 \ + __asm punpckldq mm7,mm7 \ + __asm paddd mm4,mm6 \ + __asm paddd mm5,mm6 \ + 
/*mm0=0 \ + mm3:mm1=36410*t6'''+((t5'''!=0)<<16)*/ \ + __asm movq mm6,mm2 \ + __asm movq mm3,mm2 \ + __asm pmulhw mm6,mm7 \ + __asm paddw mm1,mm2 \ + __asm pmullw mm3,mm7 \ + __asm pxor mm0,mm0 \ + __asm paddw mm6,mm1 \ + __asm movq mm1,mm3 \ + __asm punpckhwd mm3,mm6 \ + __asm punpcklwd mm1,mm6 \ + /*mm3={-1}x4, mm6={1}x4 \ + mm4=_y[5]=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \ + __asm paddd mm5,mm3 \ + __asm paddd mm4,mm1 \ + __asm psrad mm5,16 \ + __asm pxor mm6,mm6 \ + __asm psrad mm4,16 \ + __asm pcmpeqb mm3,mm3 \ + __asm packssdw mm4,mm5 \ + __asm psubw mm6,mm3 \ + /*mm1=t7'', mm7={26568,0x3400}x2 \ + mm2=s=t6'''-(36410*u>>16)*/ \ + __asm movq mm1,mm4 \ + __asm mov A,0x340067C8 \ + __asm pmulhw mm4,mm7 \ + __asm movd mm7,A \ + __asm movq [Y+_r5],mm1 \ + __asm punpckldq mm7,mm7 \ + __asm paddw mm4,mm1 \ + __asm movq mm1,[Y+_r7] \ + __asm psubw mm2,mm4 \ + /*mm6={0x00007B1B}x2 \ + mm0=(s!=0), mm5:mm4=s*26568+0x3400*/ \ + __asm movq mm4,mm2 \ + __asm movq mm5,mm2 \ + __asm punpcklwd mm4,mm6 \ + __asm pcmpeqw mm0,mm2 \ + __asm pmaddwd mm4,mm7 \ + __asm mov A,0x7B1B \ + __asm punpckhwd mm5,mm6 \ + __asm movd mm6,A \ + __asm pmaddwd mm5,mm7 \ + __asm psubw mm0,mm3 \ + __asm punpckldq mm6,mm6 \ + /*mm7={64277-0x7FFF,0x7FFF}x2 \ + mm2=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \ + __asm psrad mm4,17 \ + __asm paddw mm2,mm0 \ + __asm psrad mm5,17 \ + __asm mov A,0x7FFF7B16 \ + __asm packssdw mm4,mm5 \ + __asm movd mm7,A \ + __asm paddw mm2,mm4 \ + __asm punpckldq mm7,mm7 \ + /*mm0=0, mm7={12785}x4 \ + mm1=(t7''!=0), mm2=t4'', mm5:mm4=64277*t7''+0x7B1B*/ \ + __asm movq mm4,mm1 \ + __asm movq mm5,mm1 \ + __asm movq [Y+_r3],mm2 \ + __asm punpcklwd mm4,mm1 \ + __asm movq mm2,[Y+_r1] \ + __asm pmaddwd mm4,mm7 \ + __asm mov A,0x31F131F1 \ + __asm punpckhwd mm5,mm1 \ + __asm pxor mm0,mm0 \ + __asm pmaddwd mm5,mm7 \ + __asm pcmpeqw mm1,mm0 \ + __asm movd mm7,A \ + __asm psubw mm1,mm3 \ + __asm punpckldq mm7,mm7 \ + __asm paddd mm4,mm6 \ + __asm paddd mm5,mm6 \ + /*mm3:mm1=12785*t4'''+((t7''!=0)<<16)*/ \ + __asm movq mm6,mm2 \ + __asm movq mm3,mm2 \ + __asm pmulhw mm6,mm7 \ + __asm pmullw mm3,mm7 \ + __asm paddw mm6,mm1 \ + __asm movq mm1,mm3 \ + __asm punpckhwd mm3,mm6 \ + __asm punpcklwd mm1,mm6 \ + /*mm3={-1}x4, mm6={1}x4 \ + mm4=_y[1]=u=(12785*t4'''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \ + __asm paddd mm5,mm3 \ + __asm paddd mm4,mm1 \ + __asm psrad mm5,16 \ + __asm pxor mm6,mm6 \ + __asm psrad mm4,16 \ + __asm pcmpeqb mm3,mm3 \ + __asm packssdw mm4,mm5 \ + __asm psubw mm6,mm3 \ + /*mm1=t3'', mm7={20539,0x3000}x2 \ + mm4=s=(12785*u>>16)-t4''*/ \ + __asm movq [Y+_r1],mm4 \ + __asm pmulhw mm4,mm7 \ + __asm mov A,0x3000503B \ + __asm movq mm1,[Y+_r6] \ + __asm movd mm7,A \ + __asm psubw mm4,mm2 \ + __asm punpckldq mm7,mm7 \ + /*mm6={0x00006CB7}x2 \ + mm0=(s!=0), mm5:mm4=s*20539+0x3000*/ \ + __asm movq mm5,mm4 \ + __asm movq mm2,mm4 \ + __asm punpcklwd mm4,mm6 \ + __asm pcmpeqw mm0,mm2 \ + __asm pmaddwd mm4,mm7 \ + __asm mov A,0x6CB7 \ + __asm punpckhwd mm5,mm6 \ + __asm movd mm6,A \ + __asm pmaddwd mm5,mm7 \ + __asm psubw mm0,mm3 \ + __asm punpckldq mm6,mm6 \ + /*mm7={60547-0x7FFF,0x7FFF}x2 \ + mm2=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \ + __asm psrad mm4,20 \ + __asm paddw mm2,mm0 \ + __asm psrad mm5,20 \ + __asm mov A,0x7FFF6C84 \ + __asm packssdw mm4,mm5 \ + __asm movd mm7,A \ + __asm paddw mm2,mm4 \ + __asm punpckldq mm7,mm7 \ + /*mm0=0, mm7={25080}x4 \ + mm2=t2'', mm5:mm4=60547*t3''+0x6CB7*/ \ + __asm movq mm4,mm1 \ + __asm movq mm5,mm1 \ + __asm movq [Y+_r7],mm2 \ + __asm punpcklwd 
mm4,mm1 \ + __asm movq mm2,[Y+_r2] \ + __asm pmaddwd mm4,mm7 \ + __asm mov A,0x61F861F8 \ + __asm punpckhwd mm5,mm1 \ + __asm pxor mm0,mm0 \ + __asm pmaddwd mm5,mm7 \ + __asm movd mm7,A \ + __asm pcmpeqw mm1,mm0 \ + __asm psubw mm1,mm3 \ + __asm punpckldq mm7,mm7 \ + __asm paddd mm4,mm6 \ + __asm paddd mm5,mm6 \ + /*mm3:mm1=25080*t2''+((t3''!=0)<<16)*/ \ + __asm movq mm6,mm2 \ + __asm movq mm3,mm2 \ + __asm pmulhw mm6,mm7 \ + __asm pmullw mm3,mm7 \ + __asm paddw mm6,mm1 \ + __asm movq mm1,mm3 \ + __asm punpckhwd mm3,mm6 \ + __asm punpcklwd mm1,mm6 \ + /*mm1={-1}x4 \ + mm4=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \ + __asm paddd mm5,mm3 \ + __asm paddd mm4,mm1 \ + __asm psrad mm5,16 \ + __asm mov A,0x28005460 \ + __asm psrad mm4,16 \ + __asm pcmpeqb mm1,mm1 \ + __asm packssdw mm4,mm5 \ + /*mm5={1}x4, mm6=_y[2]=u, mm7={21600,0x2800}x2 \ + mm4=s=(25080*u>>16)-t2''*/ \ + __asm movq mm6,mm4 \ + __asm pmulhw mm4,mm7 \ + __asm pxor mm5,mm5 \ + __asm movd mm7,A \ + __asm psubw mm5,mm1 \ + __asm punpckldq mm7,mm7 \ + __asm psubw mm4,mm2 \ + /*mm2=s+(s!=0) \ + mm4:mm3=s*21600+0x2800*/ \ + __asm movq mm3,mm4 \ + __asm movq mm2,mm4 \ + __asm punpckhwd mm4,mm5 \ + __asm pcmpeqw mm0,mm2 \ + __asm pmaddwd mm4,mm7 \ + __asm psubw mm0,mm1 \ + __asm punpcklwd mm3,mm5 \ + __asm paddw mm2,mm0 \ + __asm pmaddwd mm3,mm7 \ + /*mm0=_y[4], mm1=_y[7], mm4=_y[0], mm5=_y[5] \ + mm3=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \ + __asm movq mm0,[Y+_r4] \ + __asm psrad mm4,18 \ + __asm movq mm5,[Y+_r5] \ + __asm psrad mm3,18 \ + __asm movq mm1,[Y+_r7] \ + __asm packssdw mm3,mm4 \ + __asm movq mm4,[Y+_r0] \ + __asm paddw mm3,mm2 \ +} + +/*On input, mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], mm1=_y[7]. + On output, {_y[4],mm1,mm2,mm3} contains the transpose of _y[4...7] and + {mm4,mm5,mm6,mm7} contains the transpose of _y[0...3].*/ +#define OC_TRANSPOSE8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) __asm{ \ + /*First 4x4 transpose:*/ \ + /*mm0 = e3 e2 e1 e0 \ + mm5 = f3 f2 f1 f0 \ + mm3 = g3 g2 g1 g0 \ + mm1 = h3 h2 h1 h0*/ \ + __asm movq mm2,mm0 \ + __asm punpcklwd mm0,mm5 \ + __asm punpckhwd mm2,mm5 \ + __asm movq mm5,mm3 \ + __asm punpcklwd mm3,mm1 \ + __asm punpckhwd mm5,mm1 \ + /*mm0 = f1 e1 f0 e0 \ + mm2 = f3 e3 f2 e2 \ + mm3 = h1 g1 h0 g0 \ + mm5 = h3 g3 h2 g2*/ \ + __asm movq mm1,mm0 \ + __asm punpckldq mm0,mm3 \ + __asm movq [Y+_r4],mm0 \ + __asm punpckhdq mm1,mm3 \ + __asm movq mm0,[Y+_r1] \ + __asm movq mm3,mm2 \ + __asm punpckldq mm2,mm5 \ + __asm punpckhdq mm3,mm5 \ + __asm movq mm5,[Y+_r3] \ + /*_y[4] = h0 g0 f0 e0 \ + mm1 = h1 g1 f1 e1 \ + mm2 = h2 g2 f2 e2 \ + mm3 = h3 g3 f3 e3*/ \ + /*Second 4x4 transpose:*/ \ + /*mm4 = a3 a2 a1 a0 \ + mm0 = b3 b2 b1 b0 \ + mm6 = c3 c2 c1 c0 \ + mm5 = d3 d2 d1 d0*/ \ + __asm movq mm7,mm4 \ + __asm punpcklwd mm4,mm0 \ + __asm punpckhwd mm7,mm0 \ + __asm movq mm0,mm6 \ + __asm punpcklwd mm6,mm5 \ + __asm punpckhwd mm0,mm5 \ + /*mm4 = b1 a1 b0 a0 \ + mm7 = b3 a3 b2 a2 \ + mm6 = d1 c1 d0 c0 \ + mm0 = d3 c3 d2 c2*/ \ + __asm movq mm5,mm4 \ + __asm punpckldq mm4,mm6 \ + __asm punpckhdq mm5,mm6 \ + __asm movq mm6,mm7 \ + __asm punpckhdq mm7,mm0 \ + __asm punpckldq mm6,mm0 \ + /*mm4 = d0 c0 b0 a0 \ + mm5 = d1 c1 b1 a1 \ + mm6 = d2 c2 b2 a2 \ + mm7 = d3 c3 b3 a3*/ \ +} + +/*MMX implementation of the fDCT.*/ +void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){ + ptrdiff_t a; + __asm{ +#define Y eax +#define A ecx +#define X edx + /*Add two extra bits of working precision to improve accuracy; any more and + we could overflow.*/ + /*We also add biases to 
correct for some systematic error that remains in + the full fDCT->iDCT round trip.*/ + mov X, _x + mov Y, _y + movq mm0,[0x00+X] + movq mm1,[0x10+X] + movq mm2,[0x20+X] + movq mm3,[0x30+X] + pcmpeqb mm4,mm4 + pxor mm7,mm7 + movq mm5,mm0 + psllw mm0,2 + pcmpeqw mm5,mm7 + movq mm7,[0x70+X] + psllw mm1,2 + psubw mm5,mm4 + psllw mm2,2 + mov A,1 + pslld mm5,16 + movd mm6,A + psllq mm5,16 + mov A,0x10001 + psllw mm3,2 + movd mm4,A + punpckhwd mm5,mm6 + psubw mm1,mm6 + movq mm6,[0x60+X] + paddw mm0,mm5 + movq mm5,[0x50+X] + paddw mm0,mm4 + movq mm4,[0x40+X] + /*We inline stage1 of the transform here so we can get better instruction + scheduling with the shifts.*/ + /*mm0=t7'=t0-t7*/ + psllw mm7,2 + psubw mm0,mm7 + psllw mm6,2 + paddw mm7,mm7 + /*mm1=t6'=t1-t6*/ + psllw mm5,2 + psubw mm1,mm6 + psllw mm4,2 + paddw mm6,mm6 + /*mm2=t5'=t2-t5*/ + psubw mm2,mm5 + paddw mm5,mm5 + /*mm3=t4'=t3-t4*/ + psubw mm3,mm4 + paddw mm4,mm4 + /*mm7=t0'=t0+t7*/ + paddw mm7,mm0 + /*mm6=t1'=t1+t6*/ + paddw mm6,mm1 + /*mm5=t2'=t2+t5*/ + paddw mm5,mm2 + /*mm4=t3'=t3+t4*/ + paddw mm4,mm3 + OC_FDCT8x4(0x00,0x10,0x20,0x30,0x40,0x50,0x60,0x70) + OC_TRANSPOSE8x4(0x00,0x10,0x20,0x30,0x40,0x50,0x60,0x70) + /*Swap out this 8x4 block for the next one.*/ + movq mm0,[0x08+X] + movq [0x30+Y],mm7 + movq mm7,[0x78+X] + movq [0x50+Y],mm1 + movq mm1,[0x18+X] + movq [0x20+Y],mm6 + movq mm6,[0x68+X] + movq [0x60+Y],mm2 + movq mm2,[0x28+X] + movq [0x10+Y],mm5 + movq mm5,[0x58+X] + movq [0x70+Y],mm3 + movq mm3,[0x38+X] + /*And increase its working precision, too.*/ + psllw mm0,2 + movq [0x00+Y],mm4 + psllw mm7,2 + movq mm4,[0x48+X] + /*We inline stage1 of the transform here so we can get better instruction + scheduling with the shifts.*/ + /*mm0=t7'=t0-t7*/ + psubw mm0,mm7 + psllw mm1,2 + paddw mm7,mm7 + psllw mm6,2 + /*mm1=t6'=t1-t6*/ + psubw mm1,mm6 + psllw mm2,2 + paddw mm6,mm6 + psllw mm5,2 + /*mm2=t5'=t2-t5*/ + psubw mm2,mm5 + psllw mm3,2 + paddw mm5,mm5 + psllw mm4,2 + /*mm3=t4'=t3-t4*/ + psubw mm3,mm4 + paddw mm4,mm4 + /*mm7=t0'=t0+t7*/ + paddw mm7,mm0 + /*mm6=t1'=t1+t6*/ + paddw mm6,mm1 + /*mm5=t2'=t2+t5*/ + paddw mm5,mm2 + /*mm4=t3'=t3+t4*/ + paddw mm4,mm3 + OC_FDCT8x4(0x08,0x18,0x28,0x38,0x48,0x58,0x68,0x78) + OC_TRANSPOSE8x4(0x08,0x18,0x28,0x38,0x48,0x58,0x68,0x78) + /*Here the first 4x4 block of output from the last transpose is the second + 4x4 block of input for the next transform. 
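A scalar sketch of the overflow-safe average used in OC_FDCT8x4 above; oc_avg_no_overflow is a hypothetical helper, and it assumes an arithmetic right shift, which is what psraw provides:

/*r+s==2*(r&s)+(r^s), so (r&s)+((r^s)>>1) equals (r+s)>>1 without ever
  forming the possibly overflowing sum r+s.*/
static ogg_int16_t oc_avg_no_overflow(ogg_int16_t _r,ogg_int16_t _s){
  return (ogg_int16_t)((_r&_s)+((_r^_s)>>1));
}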
+ We have cleverly arranged that it already be in the appropriate place, + so we only have to do half the stores and loads.*/ + movq mm0,[0x00+Y] + movq [0x58+Y],mm1 + movq mm1,[0x10+Y] + movq [0x68+Y],mm2 + movq mm2,[0x20+Y] + movq [0x78+Y],mm3 + movq mm3,[0x30+Y] + OC_FDCT_STAGE1_8x4 + OC_FDCT8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38) + OC_TRANSPOSE8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38) + /*mm0={-2}x4*/ + pcmpeqw mm0,mm0 + paddw mm0,mm0 + /*Round the results.*/ + psubw mm1,mm0 + psubw mm2,mm0 + psraw mm1,2 + psubw mm3,mm0 + movq [0x18+Y],mm1 + psraw mm2,2 + psubw mm4,mm0 + movq mm1,[0x08+Y] + psraw mm3,2 + psubw mm5,mm0 + psraw mm4,2 + psubw mm6,mm0 + psraw mm5,2 + psubw mm7,mm0 + psraw mm6,2 + psubw mm1,mm0 + psraw mm7,2 + movq mm0,[0x40+Y] + psraw mm1,2 + movq [0x30+Y],mm7 + movq mm7,[0x78+Y] + movq [0x08+Y],mm1 + movq mm1,[0x50+Y] + movq [0x20+Y],mm6 + movq mm6,[0x68+Y] + movq [0x28+Y],mm2 + movq mm2,[0x60+Y] + movq [0x10+Y],mm5 + movq mm5,[0x58+Y] + movq [0x38+Y],mm3 + movq mm3,[0x70+Y] + movq [0x00+Y],mm4 + movq mm4,[0x48+Y] + OC_FDCT_STAGE1_8x4 + OC_FDCT8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78) + OC_TRANSPOSE8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78) + /*mm0={-2}x4*/ + pcmpeqw mm0,mm0 + paddw mm0,mm0 + /*Round the results.*/ + psubw mm1,mm0 + psubw mm2,mm0 + psraw mm1,2 + psubw mm3,mm0 + movq [0x58+Y],mm1 + psraw mm2,2 + psubw mm4,mm0 + movq mm1,[0x48+Y] + psraw mm3,2 + psubw mm5,mm0 + movq [0x68+Y],mm2 + psraw mm4,2 + psubw mm6,mm0 + movq [0x78+Y],mm3 + psraw mm5,2 + psubw mm7,mm0 + movq [0x40+Y],mm4 + psraw mm6,2 + psubw mm1,mm0 + movq [0x50+Y],mm5 + psraw mm7,2 + movq [0x60+Y],mm6 + psraw mm1,2 + movq [0x70+Y],mm7 + movq [0x48+Y],mm1 +#undef Y +#undef A +#undef X + } +} + +#endif diff --git a/thirdparty/libtheora/x86_vc/mmxfrag.c b/thirdparty/libtheora/x86_vc/mmxfrag.c new file mode 100644 index 0000000000..4eb2084dc6 --- /dev/null +++ b/thirdparty/libtheora/x86_vc/mmxfrag.c @@ -0,0 +1,337 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: mmxfrag.c 16578 2009-09-25 19:50:48Z cristianadam $ + + ********************************************************************/ + +/*MMX acceleration of fragment reconstruction for motion compensation. + Originally written by Rudolf Marek. + Additional optimization by Nils Pipenbrinck. + Note: Loops are unrolled for best performance. 
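In scalar terms, the reconstruction routines below compute the following; oc_clamp255 and the *_c_sketch functions are hypothetical names, and the MMX versions obtain the same clamping from paddsw saturation followed by packuswb:

static unsigned char oc_clamp255(int _x){
  return (unsigned char)(_x<0?0:_x>255?255:_x);
}

/*Intra reconstruction: bias the residue back up by 128 and clamp.*/
static void oc_frag_recon_intra_c_sketch(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)_dst[j]=oc_clamp255(_residue[i*8+j]+128);
    _dst+=_ystride;
  }
}

/*Inter reconstruction: add the residue to the predictor and clamp.
  The inter2 variant first averages two predictors pixel by pixel.*/
static void oc_frag_recon_inter_c_sketch(unsigned char *_dst,
 const unsigned char *_src,int _ystride,const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)_dst[j]=oc_clamp255(_src[j]+_residue[i*8+j]);
    _dst+=_ystride;
    _src+=_ystride;
  }
}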
+ The iteration each instruction belongs to is marked in the comments as #i.*/ +#include +#include "x86int.h" +#include "mmxfrag.h" + +#if defined(OC_X86_ASM) + +/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes + between rows.*/ +void oc_frag_copy_mmx(unsigned char *_dst, + const unsigned char *_src,int _ystride){ +#define SRC edx +#define DST eax +#define YSTRIDE ecx +#define YSTRIDE3 esi + OC_FRAG_COPY_MMX(_dst,_src,_ystride); +#undef SRC +#undef DST +#undef YSTRIDE +#undef YSTRIDE3 +} + +void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride, + const ogg_int16_t *_residue){ + __asm{ +#define DST edx +#define DST4 esi +#define YSTRIDE eax +#define YSTRIDE3 edi +#define RESIDUE ecx + mov DST,_dst + mov YSTRIDE,_ystride + mov RESIDUE,_residue + lea DST4,[DST+YSTRIDE*4] + lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] + /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/ + pcmpeqw mm0,mm0 + /*#0 Load low residue.*/ + movq mm1,[0*8+RESIDUE] + /*#0 Load high residue.*/ + movq mm2,[1*8+RESIDUE] + /*Set mm0 to 0x8000800080008000.*/ + psllw mm0,15 + /*#1 Load low residue.*/ + movq mm3,[2*8+RESIDUE] + /*#1 Load high residue.*/ + movq mm4,[3*8+RESIDUE] + /*Set mm0 to 0x0080008000800080.*/ + psrlw mm0,8 + /*#2 Load low residue.*/ + movq mm5,[4*8+RESIDUE] + /*#2 Load high residue.*/ + movq mm6,[5*8+RESIDUE] + /*#0 Bias low residue.*/ + paddsw mm1,mm0 + /*#0 Bias high residue.*/ + paddsw mm2,mm0 + /*#0 Pack to byte.*/ + packuswb mm1,mm2 + /*#1 Bias low residue.*/ + paddsw mm3,mm0 + /*#1 Bias high residue.*/ + paddsw mm4,mm0 + /*#1 Pack to byte.*/ + packuswb mm3,mm4 + /*#2 Bias low residue.*/ + paddsw mm5,mm0 + /*#2 Bias high residue.*/ + paddsw mm6,mm0 + /*#2 Pack to byte.*/ + packuswb mm5,mm6 + /*#0 Write row.*/ + movq [DST],mm1 + /*#1 Write row.*/ + movq [DST+YSTRIDE],mm3 + /*#2 Write row.*/ + movq [DST+YSTRIDE*2],mm5 + /*#3 Load low residue.*/ + movq mm1,[6*8+RESIDUE] + /*#3 Load high residue.*/ + movq mm2,[7*8+RESIDUE] + /*#4 Load high residue.*/ + movq mm3,[8*8+RESIDUE] + /*#4 Load high residue.*/ + movq mm4,[9*8+RESIDUE] + /*#5 Load high residue.*/ + movq mm5,[10*8+RESIDUE] + /*#5 Load high residue.*/ + movq mm6,[11*8+RESIDUE] + /*#3 Bias low residue.*/ + paddsw mm1,mm0 + /*#3 Bias high residue.*/ + paddsw mm2,mm0 + /*#3 Pack to byte.*/ + packuswb mm1,mm2 + /*#4 Bias low residue.*/ + paddsw mm3,mm0 + /*#4 Bias high residue.*/ + paddsw mm4,mm0 + /*#4 Pack to byte.*/ + packuswb mm3,mm4 + /*#5 Bias low residue.*/ + paddsw mm5,mm0 + /*#5 Bias high residue.*/ + paddsw mm6,mm0 + /*#5 Pack to byte.*/ + packuswb mm5,mm6 + /*#3 Write row.*/ + movq [DST+YSTRIDE3],mm1 + /*#4 Write row.*/ + movq [DST4],mm3 + /*#5 Write row.*/ + movq [DST4+YSTRIDE],mm5 + /*#6 Load low residue.*/ + movq mm1,[12*8+RESIDUE] + /*#6 Load high residue.*/ + movq mm2,[13*8+RESIDUE] + /*#7 Load low residue.*/ + movq mm3,[14*8+RESIDUE] + /*#7 Load high residue.*/ + movq mm4,[15*8+RESIDUE] + /*#6 Bias low residue.*/ + paddsw mm1,mm0 + /*#6 Bias high residue.*/ + paddsw mm2,mm0 + /*#6 Pack to byte.*/ + packuswb mm1,mm2 + /*#7 Bias low residue.*/ + paddsw mm3,mm0 + /*#7 Bias high residue.*/ + paddsw mm4,mm0 + /*#7 Pack to byte.*/ + packuswb mm3,mm4 + /*#6 Write row.*/ + movq [DST4+YSTRIDE*2],mm1 + /*#7 Write row.*/ + movq [DST4+YSTRIDE3],mm3 +#undef DST +#undef DST4 +#undef YSTRIDE +#undef YSTRIDE3 +#undef RESIDUE + } +} + +void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src, + int _ystride,const ogg_int16_t *_residue){ + int i; + /*Zero mm0.*/ + __asm pxor mm0,mm0; + for(i=4;i-->0;){ + __asm{ +#define DST edx 
+#define SRC ecx +#define YSTRIDE edi +#define RESIDUE eax + mov DST,_dst + mov SRC,_src + mov YSTRIDE,_ystride + mov RESIDUE,_residue + /*#0 Load source.*/ + movq mm3,[SRC] + /*#1 Load source.*/ + movq mm7,[SRC+YSTRIDE] + /*#0 Get copy of src.*/ + movq mm4,mm3 + /*#0 Expand high source.*/ + punpckhbw mm4,mm0 + /*#0 Expand low source.*/ + punpcklbw mm3,mm0 + /*#0 Add residue high.*/ + paddsw mm4,[8+RESIDUE] + /*#1 Get copy of src.*/ + movq mm2,mm7 + /*#0 Add residue low.*/ + paddsw mm3,[RESIDUE] + /*#1 Expand high source.*/ + punpckhbw mm2,mm0 + /*#0 Pack final row pixels.*/ + packuswb mm3,mm4 + /*#1 Expand low source.*/ + punpcklbw mm7,mm0 + /*#1 Add residue low.*/ + paddsw mm7,[16+RESIDUE] + /*#1 Add residue high.*/ + paddsw mm2,[24+RESIDUE] + /*Advance residue.*/ + lea RESIDUE,[32+RESIDUE] + /*#1 Pack final row pixels.*/ + packuswb mm7,mm2 + /*Advance src.*/ + lea SRC,[SRC+YSTRIDE*2] + /*#0 Write row.*/ + movq [DST],mm3 + /*#1 Write row.*/ + movq [DST+YSTRIDE],mm7 + /*Advance dst.*/ + lea DST,[DST+YSTRIDE*2] + mov _residue,RESIDUE + mov _dst,DST + mov _src,SRC +#undef DST +#undef SRC +#undef YSTRIDE +#undef RESIDUE + } + } +} + +void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1, + const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){ + int i; + /*Zero mm7.*/ + __asm pxor mm7,mm7; + for(i=4;i-->0;){ + __asm{ +#define SRC1 ecx +#define SRC2 edi +#define YSTRIDE esi +#define RESIDUE edx +#define DST eax + mov YSTRIDE,_ystride + mov DST,_dst + mov RESIDUE,_residue + mov SRC1,_src1 + mov SRC2,_src2 + /*#0 Load src1.*/ + movq mm0,[SRC1] + /*#0 Load src2.*/ + movq mm2,[SRC2] + /*#0 Copy src1.*/ + movq mm1,mm0 + /*#0 Copy src2.*/ + movq mm3,mm2 + /*#1 Load src1.*/ + movq mm4,[SRC1+YSTRIDE] + /*#0 Unpack lower src1.*/ + punpcklbw mm0,mm7 + /*#1 Load src2.*/ + movq mm5,[SRC2+YSTRIDE] + /*#0 Unpack higher src1.*/ + punpckhbw mm1,mm7 + /*#0 Unpack lower src2.*/ + punpcklbw mm2,mm7 + /*#0 Unpack higher src2.*/ + punpckhbw mm3,mm7 + /*Advance src1 ptr.*/ + lea SRC1,[SRC1+YSTRIDE*2] + /*Advance src2 ptr.*/ + lea SRC2,[SRC2+YSTRIDE*2] + /*#0 Lower src1+src2.*/ + paddsw mm0,mm2 + /*#0 Higher src1+src2.*/ + paddsw mm1,mm3 + /*#1 Copy src1.*/ + movq mm2,mm4 + /*#0 Build lo average.*/ + psraw mm0,1 + /*#1 Copy src2.*/ + movq mm3,mm5 + /*#1 Unpack lower src1.*/ + punpcklbw mm4,mm7 + /*#0 Build hi average.*/ + psraw mm1,1 + /*#1 Unpack higher src1.*/ + punpckhbw mm2,mm7 + /*#0 low+=residue.*/ + paddsw mm0,[RESIDUE] + /*#1 Unpack lower src2.*/ + punpcklbw mm5,mm7 + /*#0 high+=residue.*/ + paddsw mm1,[8+RESIDUE] + /*#1 Unpack higher src2.*/ + punpckhbw mm3,mm7 + /*#1 Lower src1+src2.*/ + paddsw mm5,mm4 + /*#0 Pack and saturate.*/ + packuswb mm0,mm1 + /*#1 Higher src1+src2.*/ + paddsw mm3,mm2 + /*#0 Write row.*/ + movq [DST],mm0 + /*#1 Build lo average.*/ + psraw mm5,1 + /*#1 Build hi average.*/ + psraw mm3,1 + /*#1 low+=residue.*/ + paddsw mm5,[16+RESIDUE] + /*#1 high+=residue.*/ + paddsw mm3,[24+RESIDUE] + /*#1 Pack and saturate.*/ + packuswb mm5,mm3 + /*#1 Write row ptr.*/ + movq [DST+YSTRIDE],mm5 + /*Advance residue ptr.*/ + add RESIDUE,32 + /*Advance dest ptr.*/ + lea DST,[DST+YSTRIDE*2] + mov _dst,DST + mov _residue,RESIDUE + mov _src1,SRC1 + mov _src2,SRC2 +#undef SRC1 +#undef SRC2 +#undef YSTRIDE +#undef RESIDUE +#undef DST + } + } +} + +void oc_restore_fpu_mmx(void){ + __asm emms; +} + +#endif diff --git a/thirdparty/libtheora/x86_vc/mmxfrag.h b/thirdparty/libtheora/x86_vc/mmxfrag.h new file mode 100644 index 0000000000..45ee93e777 --- /dev/null +++ 
b/thirdparty/libtheora/x86_vc/mmxfrag.h @@ -0,0 +1,61 @@ +#if !defined(_x86_vc_mmxfrag_H) +# define _x86_vc_mmxfrag_H (1) +# include +# include "x86int.h" + +#if defined(OC_X86_ASM) + +/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes + between rows.*/ +#define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \ + do{ \ + const unsigned char *src; \ + unsigned char *dst; \ + src=(_src); \ + dst=(_dst); \ + __asm mov SRC,src \ + __asm mov DST,dst \ + __asm mov YSTRIDE,_ystride \ + /*src+0*ystride*/ \ + __asm movq mm0,[SRC] \ + /*src+1*ystride*/ \ + __asm movq mm1,[SRC+YSTRIDE] \ + /*ystride3=ystride*3*/ \ + __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \ + /*src+2*ystride*/ \ + __asm movq mm2,[SRC+YSTRIDE*2] \ + /*src+3*ystride*/ \ + __asm movq mm3,[SRC+YSTRIDE3] \ + /*dst+0*ystride*/ \ + __asm movq [DST],mm0 \ + /*dst+1*ystride*/ \ + __asm movq [DST+YSTRIDE],mm1 \ + /*Pointer to next 4.*/ \ + __asm lea SRC,[SRC+YSTRIDE*4] \ + /*dst+2*ystride*/ \ + __asm movq [DST+YSTRIDE*2],mm2 \ + /*dst+3*ystride*/ \ + __asm movq [DST+YSTRIDE3],mm3 \ + /*Pointer to next 4.*/ \ + __asm lea DST,[DST+YSTRIDE*4] \ + /*src+0*ystride*/ \ + __asm movq mm0,[SRC] \ + /*src+1*ystride*/ \ + __asm movq mm1,[SRC+YSTRIDE] \ + /*src+2*ystride*/ \ + __asm movq mm2,[SRC+YSTRIDE*2] \ + /*src+3*ystride*/ \ + __asm movq mm3,[SRC+YSTRIDE3] \ + /*dst+0*ystride*/ \ + __asm movq [DST],mm0 \ + /*dst+1*ystride*/ \ + __asm movq [DST+YSTRIDE],mm1 \ + /*dst+2*ystride*/ \ + __asm movq [DST+YSTRIDE*2],mm2 \ + /*dst+3*ystride*/ \ + __asm movq [DST+YSTRIDE3],mm3 \ + } \ + while(0) + +# endif +#endif diff --git a/thirdparty/libtheora/x86_vc/mmxidct.c b/thirdparty/libtheora/x86_vc/mmxidct.c new file mode 100644 index 0000000000..8f5ff6803c --- /dev/null +++ b/thirdparty/libtheora/x86_vc/mmxidct.c @@ -0,0 +1,562 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $ + + ********************************************************************/ + +/*MMX acceleration of Theora's iDCT. + Originally written by Rudolf Marek, based on code from On2's VP3.*/ +#include "x86int.h" +#include "../dct.h" + +#if defined(OC_X86_ASM) + +/*These are offsets into the table of constants below.*/ +/*7 rows of cosines, in order: pi/16 * (1 ... 
7).*/ +#define OC_COSINE_OFFSET (0) +/*A row of 8's.*/ +#define OC_EIGHT_OFFSET (56) + + + +/*A table of constants used by the MMX routines.*/ +static const __declspec(align(16))ogg_uint16_t + OC_IDCT_CONSTS[(7+1)*4]={ + (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7, + (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7, + (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6, + (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6, + (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5, + (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5, + (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4, + (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4, + (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3, + (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3, + (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2, + (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2, + (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1, + (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1, + 8, 8, 8, 8 +}; + +/*38 cycles*/ +#define OC_IDCT_BEGIN __asm{ \ + __asm movq mm2,OC_I(3) \ + __asm movq mm6,OC_C(3) \ + __asm movq mm4,mm2 \ + __asm movq mm7,OC_J(5) \ + __asm pmulhw mm4,mm6 \ + __asm movq mm1,OC_C(5) \ + __asm pmulhw mm6,mm7 \ + __asm movq mm5,mm1 \ + __asm pmulhw mm1,mm2 \ + __asm movq mm3,OC_I(1) \ + __asm pmulhw mm5,mm7 \ + __asm movq mm0,OC_C(1) \ + __asm paddw mm4,mm2 \ + __asm paddw mm6,mm7 \ + __asm paddw mm2,mm1 \ + __asm movq mm1,OC_J(7) \ + __asm paddw mm7,mm5 \ + __asm movq mm5,mm0 \ + __asm pmulhw mm0,mm3 \ + __asm paddw mm4,mm7 \ + __asm pmulhw mm5,mm1 \ + __asm movq mm7,OC_C(7) \ + __asm psubw mm6,mm2 \ + __asm paddw mm0,mm3 \ + __asm pmulhw mm3,mm7 \ + __asm movq mm2,OC_I(2) \ + __asm pmulhw mm7,mm1 \ + __asm paddw mm5,mm1 \ + __asm movq mm1,mm2 \ + __asm pmulhw mm2,OC_C(2) \ + __asm psubw mm3,mm5 \ + __asm movq mm5,OC_J(6) \ + __asm paddw mm0,mm7 \ + __asm movq mm7,mm5 \ + __asm psubw mm0,mm4 \ + __asm pmulhw mm5,OC_C(2) \ + __asm paddw mm2,mm1 \ + __asm pmulhw mm1,OC_C(6) \ + __asm paddw mm4,mm4 \ + __asm paddw mm4,mm0 \ + __asm psubw mm3,mm6 \ + __asm paddw mm5,mm7 \ + __asm paddw mm6,mm6 \ + __asm pmulhw mm7,OC_C(6) \ + __asm paddw mm6,mm3 \ + __asm movq OC_I(1),mm4 \ + __asm psubw mm1,mm5 \ + __asm movq mm4,OC_C(4) \ + __asm movq mm5,mm3 \ + __asm pmulhw mm3,mm4 \ + __asm paddw mm7,mm2 \ + __asm movq OC_I(2),mm6 \ + __asm movq mm2,mm0 \ + __asm movq mm6,OC_I(0) \ + __asm pmulhw mm0,mm4 \ + __asm paddw mm5,mm3 \ + __asm movq mm3,OC_J(4) \ + __asm psubw mm5,mm1 \ + __asm paddw mm2,mm0 \ + __asm psubw mm6,mm3 \ + __asm movq mm0,mm6 \ + __asm pmulhw mm6,mm4 \ + __asm paddw mm3,mm3 \ + __asm paddw mm1,mm1 \ + __asm paddw mm3,mm0 \ + __asm paddw mm1,mm5 \ + __asm pmulhw mm4,mm3 \ + __asm paddw mm6,mm0 \ + __asm psubw mm6,mm2 \ + __asm paddw mm2,mm2 \ + __asm movq mm0,OC_I(1) \ + __asm paddw mm2,mm6 \ + __asm paddw mm4,mm3 \ + __asm psubw mm2,mm1 \ +} + +/*38+8=46 cycles.*/ +#define OC_ROW_IDCT __asm{ \ + OC_IDCT_BEGIN \ + /*r3=D'*/ \ + __asm movq mm3,OC_I(2) \ + /*r4=E'=E-G*/ \ + __asm psubw mm4,mm7 \ + /*r1=H'+H'*/ \ + __asm paddw mm1,mm1 \ + /*r7=G+G*/ \ + __asm paddw mm7,mm7 \ + /*r1=R1=A''+H'*/ \ + __asm paddw mm1,mm2 \ + /*r7=G'=E+G*/ \ + __asm paddw mm7,mm4 \ + /*r4=R4=E'-D'*/ \ + __asm psubw mm4,mm3 \ + __asm paddw mm3,mm3 \ + /*r6=R6=F'-B''*/ \ + __asm psubw mm6,mm5 \ + __asm paddw mm5,mm5 \ + /*r3=R3=E'+D'*/ \ + __asm paddw mm3,mm4 \ + /*r5=R5=F'+B''*/ \ + __asm paddw mm5,mm6 \ + /*r7=R7=G'-C'*/ \ + __asm psubw mm7,mm0 \ + __asm paddw mm0,mm0 \ + /*Save R1.*/ \ + __asm movq OC_I(1),mm1 \ + /*r0=R0=G.+C.*/ \ + __asm paddw mm0,mm7 \ +} + +/*The following macro does two 4x4 transposes in 
place. + At entry, we assume: + r0 = a3 a2 a1 a0 + I(1) = b3 b2 b1 b0 + r2 = c3 c2 c1 c0 + r3 = d3 d2 d1 d0 + + r4 = e3 e2 e1 e0 + r5 = f3 f2 f1 f0 + r6 = g3 g2 g1 g0 + r7 = h3 h2 h1 h0 + + At exit, we have: + I(0) = d0 c0 b0 a0 + I(1) = d1 c1 b1 a1 + I(2) = d2 c2 b2 a2 + I(3) = d3 c3 b3 a3 + + J(4) = h0 g0 f0 e0 + J(5) = h1 g1 f1 e1 + J(6) = h2 g2 f2 e2 + J(7) = h3 g3 f3 e3 + + I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. + J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. + + Since r1 is free at entry, we calculate the Js first.*/ +/*19 cycles.*/ +#define OC_TRANSPOSE __asm{ \ + __asm movq mm1,mm4 \ + __asm punpcklwd mm4,mm5 \ + __asm movq OC_I(0),mm0 \ + __asm punpckhwd mm1,mm5 \ + __asm movq mm0,mm6 \ + __asm punpcklwd mm6,mm7 \ + __asm movq mm5,mm4 \ + __asm punpckldq mm4,mm6 \ + __asm punpckhdq mm5,mm6 \ + __asm movq mm6,mm1 \ + __asm movq OC_J(4),mm4 \ + __asm punpckhwd mm0,mm7 \ + __asm movq OC_J(5),mm5 \ + __asm punpckhdq mm6,mm0 \ + __asm movq mm4,OC_I(0) \ + __asm punpckldq mm1,mm0 \ + __asm movq mm5,OC_I(1) \ + __asm movq mm0,mm4 \ + __asm movq OC_J(7),mm6 \ + __asm punpcklwd mm0,mm5 \ + __asm movq OC_J(6),mm1 \ + __asm punpckhwd mm4,mm5 \ + __asm movq mm5,mm2 \ + __asm punpcklwd mm2,mm3 \ + __asm movq mm1,mm0 \ + __asm punpckldq mm0,mm2 \ + __asm punpckhdq mm1,mm2 \ + __asm movq mm2,mm4 \ + __asm movq OC_I(0),mm0 \ + __asm punpckhwd mm5,mm3 \ + __asm movq OC_I(1),mm1 \ + __asm punpckhdq mm4,mm5 \ + __asm punpckldq mm2,mm5 \ + __asm movq OC_I(3),mm4 \ + __asm movq OC_I(2),mm2 \ +} + +/*38+19=57 cycles.*/ +#define OC_COLUMN_IDCT __asm{ \ + OC_IDCT_BEGIN \ + __asm paddw mm2,OC_8 \ + /*r1=H'+H'*/ \ + __asm paddw mm1,mm1 \ + /*r1=R1=A''+H'*/ \ + __asm paddw mm1,mm2 \ + /*r2=NR2*/ \ + __asm psraw mm2,4 \ + /*r4=E'=E-G*/ \ + __asm psubw mm4,mm7 \ + /*r1=NR1*/ \ + __asm psraw mm1,4 \ + /*r3=D'*/ \ + __asm movq mm3,OC_I(2) \ + /*r7=G+G*/ \ + __asm paddw mm7,mm7 \ + /*Store NR2 at I(2).*/ \ + __asm movq OC_I(2),mm2 \ + /*r7=G'=E+G*/ \ + __asm paddw mm7,mm4 \ + /*Store NR1 at I(1).*/ \ + __asm movq OC_I(1),mm1 \ + /*r4=R4=E'-D'*/ \ + __asm psubw mm4,mm3 \ + __asm paddw mm4,OC_8 \ + /*r3=D'+D'*/ \ + __asm paddw mm3,mm3 \ + /*r3=R3=E'+D'*/ \ + __asm paddw mm3,mm4 \ + /*r4=NR4*/ \ + __asm psraw mm4,4 \ + /*r6=R6=F'-B''*/ \ + __asm psubw mm6,mm5 \ + /*r3=NR3*/ \ + __asm psraw mm3,4 \ + __asm paddw mm6,OC_8 \ + /*r5=B''+B''*/ \ + __asm paddw mm5,mm5 \ + /*r5=R5=F'+B''*/ \ + __asm paddw mm5,mm6 \ + /*r6=NR6*/ \ + __asm psraw mm6,4 \ + /*Store NR4 at J(4).*/ \ + __asm movq OC_J(4),mm4 \ + /*r5=NR5*/ \ + __asm psraw mm5,4 \ + /*Store NR3 at I(3).*/ \ + __asm movq OC_I(3),mm3 \ + /*r7=R7=G'-C'*/ \ + __asm psubw mm7,mm0 \ + __asm paddw mm7,OC_8 \ + /*r0=C'+C'*/ \ + __asm paddw mm0,mm0 \ + /*r0=R0=G'+C'*/ \ + __asm paddw mm0,mm7 \ + /*r7=NR7*/ \ + __asm psraw mm7,4 \ + /*Store NR6 at J(6).*/ \ + __asm movq OC_J(6),mm6 \ + /*r0=NR0*/ \ + __asm psraw mm0,4 \ + /*Store NR5 at J(5).*/ \ + __asm movq OC_J(5),mm5 \ + /*Store NR7 at J(7).*/ \ + __asm movq OC_J(7),mm7 \ + /*Store NR0 at I(0).*/ \ + __asm movq OC_I(0),mm0 \ +} + +#define OC_MID(_m,_i) [CONSTS+_m+(_i)*8] +#define OC_C(_i) OC_MID(OC_COSINE_OFFSET,_i-1) +#define OC_8 OC_MID(OC_EIGHT_OFFSET,0) + +static void oc_idct8x8_slow(ogg_int16_t _y[64]){ + /*This routine accepts an 8x8 matrix, but in partially transposed form. 
+ Every 4x4 block is transposed.*/ + __asm{ +#define CONSTS eax +#define Y edx + mov CONSTS,offset OC_IDCT_CONSTS + mov Y,_y +#define OC_I(_k) [Y+_k*16] +#define OC_J(_k) [Y+(_k-4)*16+8] + OC_ROW_IDCT + OC_TRANSPOSE +#undef OC_I +#undef OC_J +#define OC_I(_k) [Y+(_k*16)+64] +#define OC_J(_k) [Y+(_k-4)*16+72] + OC_ROW_IDCT + OC_TRANSPOSE +#undef OC_I +#undef OC_J +#define OC_I(_k) [Y+_k*16] +#define OC_J(_k) OC_I(_k) + OC_COLUMN_IDCT +#undef OC_I +#undef OC_J +#define OC_I(_k) [Y+_k*16+8] +#define OC_J(_k) OC_I(_k) + OC_COLUMN_IDCT +#undef OC_I +#undef OC_J +#undef CONSTS +#undef Y + } +} + +/*25 cycles.*/ +#define OC_IDCT_BEGIN_10 __asm{ \ + __asm movq mm2,OC_I(3) \ + __asm nop \ + __asm movq mm6,OC_C(3) \ + __asm movq mm4,mm2 \ + __asm movq mm1,OC_C(5) \ + __asm pmulhw mm4,mm6 \ + __asm movq mm3,OC_I(1) \ + __asm pmulhw mm1,mm2 \ + __asm movq mm0,OC_C(1) \ + __asm paddw mm4,mm2 \ + __asm pxor mm6,mm6 \ + __asm paddw mm2,mm1 \ + __asm movq mm5,OC_I(2) \ + __asm pmulhw mm0,mm3 \ + __asm movq mm1,mm5 \ + __asm paddw mm0,mm3 \ + __asm pmulhw mm3,OC_C(7) \ + __asm psubw mm6,mm2 \ + __asm pmulhw mm5,OC_C(2) \ + __asm psubw mm0,mm4 \ + __asm movq mm7,OC_I(2) \ + __asm paddw mm4,mm4 \ + __asm paddw mm7,mm5 \ + __asm paddw mm4,mm0 \ + __asm pmulhw mm1,OC_C(6) \ + __asm psubw mm3,mm6 \ + __asm movq OC_I(1),mm4 \ + __asm paddw mm6,mm6 \ + __asm movq mm4,OC_C(4) \ + __asm paddw mm6,mm3 \ + __asm movq mm5,mm3 \ + __asm pmulhw mm3,mm4 \ + __asm movq OC_I(2),mm6 \ + __asm movq mm2,mm0 \ + __asm movq mm6,OC_I(0) \ + __asm pmulhw mm0,mm4 \ + __asm paddw mm5,mm3 \ + __asm paddw mm2,mm0 \ + __asm psubw mm5,mm1 \ + __asm pmulhw mm6,mm4 \ + __asm paddw mm6,OC_I(0) \ + __asm paddw mm1,mm1 \ + __asm movq mm4,mm6 \ + __asm paddw mm1,mm5 \ + __asm psubw mm6,mm2 \ + __asm paddw mm2,mm2 \ + __asm movq mm0,OC_I(1) \ + __asm paddw mm2,mm6 \ + __asm psubw mm2,mm1 \ + __asm nop \ +} + +/*25+8=33 cycles.*/ +#define OC_ROW_IDCT_10 __asm{ \ + OC_IDCT_BEGIN_10 \ + /*r3=D'*/ \ + __asm movq mm3,OC_I(2) \ + /*r4=E'=E-G*/ \ + __asm psubw mm4,mm7 \ + /*r1=H'+H'*/ \ + __asm paddw mm1,mm1 \ + /*r7=G+G*/ \ + __asm paddw mm7,mm7 \ + /*r1=R1=A''+H'*/ \ + __asm paddw mm1,mm2 \ + /*r7=G'=E+G*/ \ + __asm paddw mm7,mm4 \ + /*r4=R4=E'-D'*/ \ + __asm psubw mm4,mm3 \ + __asm paddw mm3,mm3 \ + /*r6=R6=F'-B''*/ \ + __asm psubw mm6,mm5 \ + __asm paddw mm5,mm5 \ + /*r3=R3=E'+D'*/ \ + __asm paddw mm3,mm4 \ + /*r5=R5=F'+B''*/ \ + __asm paddw mm5,mm6 \ + /*r7=R7=G'-C'*/ \ + __asm psubw mm7,mm0 \ + __asm paddw mm0,mm0 \ + /*Save R1.*/ \ + __asm movq OC_I(1),mm1 \ + /*r0=R0=G'+C'*/ \ + __asm paddw mm0,mm7 \ +} + +/*25+19=44 cycles'*/ +#define OC_COLUMN_IDCT_10 __asm{ \ + OC_IDCT_BEGIN_10 \ + __asm paddw mm2,OC_8 \ + /*r1=H'+H'*/ \ + __asm paddw mm1,mm1 \ + /*r1=R1=A''+H'*/ \ + __asm paddw mm1,mm2 \ + /*r2=NR2*/ \ + __asm psraw mm2,4 \ + /*r4=E'=E-G*/ \ + __asm psubw mm4,mm7 \ + /*r1=NR1*/ \ + __asm psraw mm1,4 \ + /*r3=D'*/ \ + __asm movq mm3,OC_I(2) \ + /*r7=G+G*/ \ + __asm paddw mm7,mm7 \ + /*Store NR2 at I(2).*/ \ + __asm movq OC_I(2),mm2 \ + /*r7=G'=E+G*/ \ + __asm paddw mm7,mm4 \ + /*Store NR1 at I(1).*/ \ + __asm movq OC_I(1),mm1 \ + /*r4=R4=E'-D'*/ \ + __asm psubw mm4,mm3 \ + __asm paddw mm4,OC_8 \ + /*r3=D'+D'*/ \ + __asm paddw mm3,mm3 \ + /*r3=R3=E'+D'*/ \ + __asm paddw mm3,mm4 \ + /*r4=NR4*/ \ + __asm psraw mm4,4 \ + /*r6=R6=F'-B''*/ \ + __asm psubw mm6,mm5 \ + /*r3=NR3*/ \ + __asm psraw mm3,4 \ + __asm paddw mm6,OC_8 \ + /*r5=B''+B''*/ \ + __asm paddw mm5,mm5 \ + /*r5=R5=F'+B''*/ \ + __asm paddw mm5,mm6 \ + /*r6=NR6*/ \ + __asm psraw 
mm6,4 \ + /*Store NR4 at J(4).*/ \ + __asm movq OC_J(4),mm4 \ + /*r5=NR5*/ \ + __asm psraw mm5,4 \ + /*Store NR3 at I(3).*/ \ + __asm movq OC_I(3),mm3 \ + /*r7=R7=G'-C'*/ \ + __asm psubw mm7,mm0 \ + __asm paddw mm7,OC_8 \ + /*r0=C'+C'*/ \ + __asm paddw mm0,mm0 \ + /*r0=R0=G'+C'*/ \ + __asm paddw mm0,mm7 \ + /*r7=NR7*/ \ + __asm psraw mm7,4 \ + /*Store NR6 at J(6).*/ \ + __asm movq OC_J(6),mm6 \ + /*r0=NR0*/ \ + __asm psraw mm0,4 \ + /*Store NR5 at J(5).*/ \ + __asm movq OC_J(5),mm5 \ + /*Store NR7 at J(7).*/ \ + __asm movq OC_J(7),mm7 \ + /*Store NR0 at I(0).*/ \ + __asm movq OC_I(0),mm0 \ +} + +static void oc_idct8x8_10(ogg_int16_t _y[64]){ + __asm{ +#define CONSTS eax +#define Y edx + mov CONSTS,offset OC_IDCT_CONSTS + mov Y,_y +#define OC_I(_k) [Y+_k*16] +#define OC_J(_k) [Y+(_k-4)*16+8] + /*Done with dequant, descramble, and partial transpose. + Now do the iDCT itself.*/ + OC_ROW_IDCT_10 + OC_TRANSPOSE +#undef OC_I +#undef OC_J +#define OC_I(_k) [Y+_k*16] +#define OC_J(_k) OC_I(_k) + OC_COLUMN_IDCT_10 +#undef OC_I +#undef OC_J +#define OC_I(_k) [Y+_k*16+8] +#define OC_J(_k) OC_I(_k) + OC_COLUMN_IDCT_10 +#undef OC_I +#undef OC_J +#undef CONSTS +#undef Y + } +} + +/*Performs an inverse 8x8 Type-II DCT transform. + The input is assumed to be scaled by a factor of 4 relative to orthonormal + version of the transform.*/ +void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){ + /*_last_zzi is subtly different from an actual count of the number of + coefficients we decoded for this block. + It contains the value of zzi BEFORE the final token in the block was + decoded. + In most cases this is an EOB token (the continuation of an EOB run from a + previous block counts), and so this is the same as the coefficient count. + However, in the case that the last token was NOT an EOB token, but filled + the block up with exactly 64 coefficients, _last_zzi will be less than 64. + Provided the last token was not a pure zero run, the minimum value it can + be is 46, and so that doesn't affect any of the cases in this routine. + However, if the last token WAS a pure zero run of length 63, then _last_zzi + will be 1 while the number of coefficients decoded is 64. + Thus, we will trigger the following special case, where the real + coefficient count would not. + Note also that a zero run of length 64 will give _last_zzi a value of 0, + but we still process the DC coefficient, which might have a non-zero value + due to DC prediction. + Although convoluted, this is arguably the correct behavior: it allows us to + use a smaller transform when the block ends with a long zero run instead + of a normal EOB token. + It could be smarter... multiple separate zero runs at the end of a block + will fool it, but an encoder that generates these really deserves what it + gets. + Needless to say we inherited this approach from VP3.*/ + /*Perform the iDCT.*/ + if(_last_zzi<10)oc_idct8x8_10(_y); + else oc_idct8x8_slow(_y); +} + +#endif diff --git a/thirdparty/libtheora/x86_vc/mmxloop.h b/thirdparty/libtheora/x86_vc/mmxloop.h new file mode 100644 index 0000000000..2561fca2ae --- /dev/null +++ b/thirdparty/libtheora/x86_vc/mmxloop.h @@ -0,0 +1,219 @@ +#if !defined(_x86_vc_mmxloop_H) +# define _x86_vc_mmxloop_H (1) +# include +# include "x86int.h" + +#if defined(OC_X86_ASM) + +/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}. 
+ On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and + mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/ +#define OC_LOOP_FILTER8_MMX __asm{ \ + /*mm7=0*/ \ + __asm pxor mm7,mm7 \ + /*mm6:mm0={a0,...,a7}*/ \ + __asm movq mm6,mm0 \ + __asm punpcklbw mm0,mm7 \ + __asm punpckhbw mm6,mm7 \ + /*mm3:mm5={d0,...,d7}*/ \ + __asm movq mm5,mm3 \ + __asm punpcklbw mm3,mm7 \ + __asm punpckhbw mm5,mm7 \ + /*mm6:mm0={a0-d0,...,a7-d7}*/ \ + __asm psubw mm0,mm3 \ + __asm psubw mm6,mm5 \ + /*mm3:mm1={b0,...,b7}*/ \ + __asm movq mm3,mm1 \ + __asm punpcklbw mm1,mm7 \ + __asm movq mm4,mm2 \ + __asm punpckhbw mm3,mm7 \ + /*mm5:mm4={c0,...,c7}*/ \ + __asm movq mm5,mm2 \ + __asm punpcklbw mm4,mm7 \ + __asm punpckhbw mm5,mm7 \ + /*mm7={3}x4 \ + mm5:mm4={c0-b0,...,c7-b7}*/ \ + __asm pcmpeqw mm7,mm7 \ + __asm psubw mm4,mm1 \ + __asm psrlw mm7,14 \ + __asm psubw mm5,mm3 \ + /*Scale by 3.*/ \ + __asm pmullw mm4,mm7 \ + __asm pmullw mm5,mm7 \ + /*mm7={4}x4 \ + mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \ + __asm psrlw mm7,1 \ + __asm paddw mm4,mm0 \ + __asm psllw mm7,2 \ + __asm movq mm0,[LL] \ + __asm paddw mm5,mm6 \ + /*R_i has the range [-127,128], so we compute -R_i instead. \ + mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \ + __asm psubw mm4,mm7 \ + __asm psubw mm5,mm7 \ + __asm psraw mm4,3 \ + __asm psraw mm5,3 \ + __asm pcmpeqb mm7,mm7 \ + __asm packsswb mm4,mm5 \ + __asm pxor mm6,mm6 \ + __asm pxor mm4,mm7 \ + __asm packuswb mm1,mm3 \ + /*Now compute lflim of -mm4 cf. Section 7.10 of the sepc.*/ \ + /*There's no unsigned byte+signed byte with unsigned saturation op code, so \ + we have to split things by sign (the other option is to work in 16 bits, \ + but working in 8 bits gives much better parallelism). \ + We compute abs(R_i), but save a mask of which terms were negative in mm6. \ + Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). 
\ + Finally, we split mm4 into positive and negative pieces using the mask in \ + mm6, and add and subtract them as appropriate.*/ \ + /*mm4=abs(-R_i)*/ \ + /*mm7=255-2*L*/ \ + __asm pcmpgtb mm6,mm4 \ + __asm psubb mm7,mm0 \ + __asm pxor mm4,mm6 \ + __asm psubb mm7,mm0 \ + __asm psubb mm4,mm6 \ + /*mm7=255-max(2*L-abs(R_i),0)*/ \ + __asm paddusb mm7,mm4 \ + /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \ + __asm paddusb mm4,mm7 \ + __asm psubusb mm4,mm7 \ + /*Now split mm4 by the original sign of -R_i.*/ \ + __asm movq mm5,mm4 \ + __asm pand mm4,mm6 \ + __asm pandn mm6,mm5 \ + /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \ + /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \ + __asm paddusb mm1,mm4 \ + __asm psubusb mm2,mm4 \ + __asm psubusb mm1,mm6 \ + __asm paddusb mm2,mm6 \ +} + +#define OC_LOOP_FILTER_V_MMX(_pix,_ystride,_ll) \ + do{ \ + /*Used local variable pix__ in order to fix compilation errors like: \ + "error C2425: 'SHL' : non-constant expression in 'second operand'".*/ \ + unsigned char *pix__; \ + unsigned char *ll__; \ + ll__=(_ll); \ + pix__=(_pix); \ + __asm mov YSTRIDE,_ystride \ + __asm mov LL,ll__ \ + __asm mov PIX,pix__ \ + __asm sub PIX,YSTRIDE \ + __asm sub PIX,YSTRIDE \ + /*mm0={a0,...,a7}*/ \ + __asm movq mm0,[PIX] \ + /*ystride3=_ystride*3*/ \ + __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \ + /*mm3={d0,...,d7}*/ \ + __asm movq mm3,[PIX+YSTRIDE3] \ + /*mm1={b0,...,b7}*/ \ + __asm movq mm1,[PIX+YSTRIDE] \ + /*mm2={c0,...,c7}*/ \ + __asm movq mm2,[PIX+YSTRIDE*2] \ + OC_LOOP_FILTER8_MMX \ + /*Write it back out.*/ \ + __asm movq [PIX+YSTRIDE],mm1 \ + __asm movq [PIX+YSTRIDE*2],mm2 \ + } \ + while(0) + +#define OC_LOOP_FILTER_H_MMX(_pix,_ystride,_ll) \ + do{ \ + /*Used local variable ll__ in order to fix compilation errors like: \ + "error C2443: operand size conflict".*/ \ + unsigned char *ll__; \ + unsigned char *pix__; \ + ll__=(_ll); \ + pix__=(_pix)-2; \ + __asm mov PIX,pix__ \ + __asm mov YSTRIDE,_ystride \ + __asm mov LL,ll__ \ + /*x x x x d0 c0 b0 a0*/ \ + __asm movd mm0,[PIX] \ + /*x x x x d1 c1 b1 a1*/ \ + __asm movd mm1,[PIX+YSTRIDE] \ + /*ystride3=_ystride*3*/ \ + __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \ + /*x x x x d2 c2 b2 a2*/ \ + __asm movd mm2,[PIX+YSTRIDE*2] \ + /*x x x x d3 c3 b3 a3*/ \ + __asm lea D,[PIX+YSTRIDE*4] \ + __asm movd mm3,[PIX+YSTRIDE3] \ + /*x x x x d4 c4 b4 a4*/ \ + __asm movd mm4,[D] \ + /*x x x x d5 c5 b5 a5*/ \ + __asm movd mm5,[D+YSTRIDE] \ + /*x x x x d6 c6 b6 a6*/ \ + __asm movd mm6,[D+YSTRIDE*2] \ + /*x x x x d7 c7 b7 a7*/ \ + __asm movd mm7,[D+YSTRIDE3] \ + /*mm0=d1 d0 c1 c0 b1 b0 a1 a0*/ \ + __asm punpcklbw mm0,mm1 \ + /*mm2=d3 d2 c3 c2 b3 b2 a3 a2*/ \ + __asm punpcklbw mm2,mm3 \ + /*mm3=d1 d0 c1 c0 b1 b0 a1 a0*/ \ + __asm movq mm3,mm0 \ + /*mm0=b3 b2 b1 b0 a3 a2 a1 a0*/ \ + __asm punpcklwd mm0,mm2 \ + /*mm3=d3 d2 d1 d0 c3 c2 c1 c0*/ \ + __asm punpckhwd mm3,mm2 \ + /*mm1=b3 b2 b1 b0 a3 a2 a1 a0*/ \ + __asm movq mm1,mm0 \ + /*mm4=d5 d4 c5 c4 b5 b4 a5 a4*/ \ + __asm punpcklbw mm4,mm5 \ + /*mm6=d7 d6 c7 c6 b7 b6 a7 a6*/ \ + __asm punpcklbw mm6,mm7 \ + /*mm5=d5 d4 c5 c4 b5 b4 a5 a4*/ \ + __asm movq mm5,mm4 \ + /*mm4=b7 b6 b5 b4 a7 a6 a5 a4*/ \ + __asm punpcklwd mm4,mm6 \ + /*mm5=d7 d6 d5 d4 c7 c6 c5 c4*/ \ + __asm punpckhwd mm5,mm6 \ + /*mm2=d3 d2 d1 d0 c3 c2 c1 c0*/ \ + __asm movq mm2,mm3 \ + /*mm0=a7 a6 a5 a4 a3 a2 a1 a0*/ \ + __asm punpckldq mm0,mm4 \ + /*mm1=b7 b6 b5 b4 b3 b2 b1 b0*/ \ + __asm punpckhdq mm1,mm4 \ + /*mm2=c7 c6 c5 c4 c3 c2 c1 c0*/ \ + __asm punpckldq mm2,mm5 \ + /*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \ + __asm punpckhdq mm3,mm5 \ 
+ OC_LOOP_FILTER8_MMX \ + /*mm2={b0+R_0'',...,b7+R_7''}*/ \ + __asm movq mm0,mm1 \ + /*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \ + __asm punpcklbw mm1,mm2 \ + /*mm2={b4+R_4'',c4-R_4'',...,b7+R_7'',c7-R_7''}*/ \ + __asm punpckhbw mm0,mm2 \ + /*[d]=c1 b1 c0 b0*/ \ + __asm movd D,mm1 \ + __asm mov [PIX+1],D_WORD \ + __asm psrlq mm1,32 \ + __asm shr D,16 \ + __asm mov [PIX+YSTRIDE+1],D_WORD \ + /*[d]=c3 b3 c2 b2*/ \ + __asm movd D,mm1 \ + __asm mov [PIX+YSTRIDE*2+1],D_WORD \ + __asm shr D,16 \ + __asm mov [PIX+YSTRIDE3+1],D_WORD \ + __asm lea PIX,[PIX+YSTRIDE*4] \ + /*[d]=c5 b5 c4 b4*/ \ + __asm movd D,mm0 \ + __asm mov [PIX+1],D_WORD \ + __asm psrlq mm0,32 \ + __asm shr D,16 \ + __asm mov [PIX+YSTRIDE+1],D_WORD \ + /*[d]=c7 b7 c6 b6*/ \ + __asm movd D,mm0 \ + __asm mov [PIX+YSTRIDE*2+1],D_WORD \ + __asm shr D,16 \ + __asm mov [PIX+YSTRIDE3+1],D_WORD \ + } \ + while(0) + +# endif +#endif diff --git a/thirdparty/libtheora/x86_vc/mmxstate.c b/thirdparty/libtheora/x86_vc/mmxstate.c new file mode 100644 index 0000000000..73bd1981cf --- /dev/null +++ b/thirdparty/libtheora/x86_vc/mmxstate.c @@ -0,0 +1,211 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: mmxstate.c 16584 2009-09-26 19:35:55Z tterribe $ + + ********************************************************************/ + +/*MMX acceleration of complete fragment reconstruction algorithm. 
+ Originally written by Rudolf Marek.*/ +#include +#include "x86int.h" +#include "mmxfrag.h" +#include "mmxloop.h" + +#if defined(OC_X86_ASM) + +void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi, + int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){ + unsigned char *dst; + ptrdiff_t frag_buf_off; + int ystride; + int mb_mode; + /*Apply the inverse transform.*/ + /*Special case only having a DC component.*/ + if(_last_zzi<2){ + /*Note that this value must be unsigned, to keep the __asm__ block from + sign-extending it when it puts it in a register.*/ + ogg_uint16_t p; + /*We round this dequant product (and not any of the others) because there's + no iDCT rounding.*/ + p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5); + /*Fill _dct_coeffs with p.*/ + __asm{ +#define Y eax +#define P ecx + mov Y,_dct_coeffs + movzx P,p + /*mm0=0000 0000 0000 AAAA*/ + movd mm0,P + /*mm0=0000 0000 AAAA AAAA*/ + punpcklwd mm0,mm0 + /*mm0=AAAA AAAA AAAA AAAA*/ + punpckldq mm0,mm0 + movq [Y],mm0 + movq [8+Y],mm0 + movq [16+Y],mm0 + movq [24+Y],mm0 + movq [32+Y],mm0 + movq [40+Y],mm0 + movq [48+Y],mm0 + movq [56+Y],mm0 + movq [64+Y],mm0 + movq [72+Y],mm0 + movq [80+Y],mm0 + movq [88+Y],mm0 + movq [96+Y],mm0 + movq [104+Y],mm0 + movq [112+Y],mm0 + movq [120+Y],mm0 +#undef Y +#undef P + } + } + else{ + /*Dequantize the DC coefficient.*/ + _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant); + oc_idct8x8_mmx(_dct_coeffs,_last_zzi); + } + /*Fill in the target buffer.*/ + frag_buf_off=_state->frag_buf_offs[_fragi]; + mb_mode=_state->frags[_fragi].mb_mode; + ystride=_state->ref_ystride[_pli]; + dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off; + if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs); + else{ + const unsigned char *ref; + int mvoffsets[2]; + ref= + _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]] + +frag_buf_off; + if(oc_state_get_mv_offsets(_state,mvoffsets,_pli, + _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){ + oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride, + _dct_coeffs); + } + else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs); + } +} + +/*We copy these entire function to inline the actual MMX routines so that we + use only a single indirect call.*/ + +/*Copies the fragments specified by the lists of fragment indices from one + frame to another. + _fragis: A pointer to a list of fragment indices. + _nfragis: The number of fragment indices to copy. + _dst_frame: The reference frame to copy to. + _src_frame: The reference frame to copy from. 
+ _pli: The color plane the fragments lie in.*/ +void oc_state_frag_copy_list_mmx(const oc_theora_state *_state, + const ptrdiff_t *_fragis,ptrdiff_t _nfragis, + int _dst_frame,int _src_frame,int _pli){ + const ptrdiff_t *frag_buf_offs; + const unsigned char *src_frame_data; + unsigned char *dst_frame_data; + ptrdiff_t fragii; + int ystride; + dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]]; + src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]]; + ystride=_state->ref_ystride[_pli]; + frag_buf_offs=_state->frag_buf_offs; + for(fragii=0;fragii<_nfragis;fragii++){ + ptrdiff_t frag_buf_off; + frag_buf_off=frag_buf_offs[_fragis[fragii]]; +#define SRC edx +#define DST eax +#define YSTRIDE ecx +#define YSTRIDE3 edi + OC_FRAG_COPY_MMX(dst_frame_data+frag_buf_off, + src_frame_data+frag_buf_off,ystride); +#undef SRC +#undef DST +#undef YSTRIDE +#undef YSTRIDE3 + } +} + +/*Apply the loop filter to a given set of fragment rows in the given plane. + The filter may be run on the bottom edge, affecting pixels in the next row of + fragments, so this row also needs to be available. + _bv: The bounding values array. + _refi: The index of the frame buffer to filter. + _pli: The color plane to filter. + _fragy0: The Y coordinate of the first fragment row to filter. + _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/ +void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state, + int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){ + OC_ALIGN8(unsigned char ll[8]); + const oc_fragment_plane *fplane; + const oc_fragment *frags; + const ptrdiff_t *frag_buf_offs; + unsigned char *ref_frame_data; + ptrdiff_t fragi_top; + ptrdiff_t fragi_bot; + ptrdiff_t fragi0; + ptrdiff_t fragi0_end; + int ystride; + int nhfrags; + memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll)); + fplane=_state->fplanes+_pli; + nhfrags=fplane->nhfrags; + fragi_top=fplane->froffset; + fragi_bot=fragi_top+fplane->nfrags; + fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags; + fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags; + ystride=_state->ref_ystride[_pli]; + frags=_state->frags; + frag_buf_offs=_state->frag_buf_offs; + ref_frame_data=_state->ref_frame_data[_refi]; + /*The following loops are constructed somewhat non-intuitively on purpose. + The main idea is: if a block boundary has at least one coded fragment on + it, the filter is applied to it. 
+ However, the order that the filters are applied in matters, and VP3 chose + the somewhat strange ordering used below.*/ + while(fragi0fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,ll); + if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,ll); + if(fragi+1opt_vtable.frag_sub=oc_enc_frag_sub_mmx; + _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx; + _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx; + _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx; + _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmx; + } + if(cpu_flags&OC_CPU_X86_MMXEXT){ + _enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext; + _enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_mmxext; + _enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_mmxext; + _enc->opt_vtable.frag_satd_thresh=oc_enc_frag_satd_thresh_mmxext; + _enc->opt_vtable.frag_satd2_thresh=oc_enc_frag_satd2_thresh_mmxext; + _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext; + _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext; + } + if(cpu_flags&OC_CPU_X86_SSE2){ +# if defined(OC_X86_64_ASM) + _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2; +# endif + } +} +#endif diff --git a/thirdparty/libtheora/x86_vc/x86enc.h b/thirdparty/libtheora/x86_vc/x86enc.h new file mode 100644 index 0000000000..581484641f --- /dev/null +++ b/thirdparty/libtheora/x86_vc/x86enc.h @@ -0,0 +1,47 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: x86int.h 15675 2009-02-06 09:43:27Z tterribe $ + + ********************************************************************/ + +#if !defined(_x86_vc_x86enc_H) +# define _x86_vc_x86enc_H (1) +# include "../encint.h" +# include "x86int.h" + +void oc_enc_vtable_init_x86(oc_enc_ctx *_enc); + +unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src, + const unsigned char *_ref,int _ystride); +unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src, + const unsigned char *_ref,int _ystride,unsigned _thresh); +unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, + const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, + unsigned _thresh); +unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src, + const unsigned char *_ref,int _ystride,unsigned _thresh); +unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src, + const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, + unsigned _thresh); +unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,int _ystride); +void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64], + const unsigned char *_x,const unsigned char *_y,int _stride); +void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64], + const unsigned char *_x,int _stride); +void oc_enc_frag_copy2_mmxext(unsigned char *_dst, + const unsigned char *_src1,const unsigned char *_src2,int _ystride); +void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]); +void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]); + +#endif diff --git a/thirdparty/libtheora/x86_vc/x86int.h 
b/thirdparty/libtheora/x86_vc/x86int.h new file mode 100644 index 0000000000..4cca485311 --- /dev/null +++ b/thirdparty/libtheora/x86_vc/x86int.h @@ -0,0 +1,42 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: x86int.h 16503 2009-08-22 18:14:02Z giles $ + + ********************************************************************/ + +#if !defined(_x86_vc_x86int_H) +# define _x86_vc_x86int_H (1) +# include "../internal.h" + +void oc_state_vtable_init_x86(oc_theora_state *_state); + +void oc_frag_copy_mmx(unsigned char *_dst, + const unsigned char *_src,int _ystride); +void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride, + const ogg_int16_t *_residue); +void oc_frag_recon_inter_mmx(unsigned char *_dst, + const unsigned char *_src,int _ystride,const ogg_int16_t *_residue); +void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1, + const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue); +void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi); +void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi, + int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant); +void oc_state_frag_copy_list_mmx(const oc_theora_state *_state, + const ptrdiff_t *_fragis,ptrdiff_t _nfragis, + int _dst_frame,int _src_frame,int _pli); +void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state, + int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end); +void oc_restore_fpu_mmx(void); + +#endif diff --git a/thirdparty/libtheora/x86_vc/x86state.c b/thirdparty/libtheora/x86_vc/x86state.c new file mode 100644 index 0000000000..a786bec284 --- /dev/null +++ b/thirdparty/libtheora/x86_vc/x86state.c @@ -0,0 +1,62 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: x86state.c 16503 2009-08-22 18:14:02Z giles $ + + ********************************************************************/ + +#include "x86int.h" + +#if defined(OC_X86_ASM) + +#include "../cpu.c" + +/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into + each quadrant of the destination.*/ +static const unsigned char OC_FZIG_ZAG_MMX[128]={ + 0, 8, 1, 2, 9,16,24,17, + 10, 3,32,11,18,25, 4,12, + 5,26,19,40,33,34,41,48, + 27, 6,13,20,28,21,14, 7, + 56,49,42,35,43,50,57,36, + 15,22,29,30,23,44,37,58, + 51,59,38,45,52,31,60,53, + 46,39,47,54,61,62,55,63, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, +}; + +void oc_state_vtable_init_x86(oc_theora_state *_state){ + _state->cpu_flags=oc_cpu_flags_get(); + if(_state->cpu_flags&OC_CPU_X86_MMX){ + _state->opt_vtable.frag_copy=oc_frag_copy_mmx; + _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx; + _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx; + _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx; + _state->opt_vtable.idct8x8=oc_idct8x8_mmx; + _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx; + _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx; + _state->opt_vtable.state_loop_filter_frag_rows= + oc_state_loop_filter_frag_rows_mmx; + _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx; + _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX; + } + else oc_state_vtable_init_c(_state); +} +#endif -- cgit v1.2.3
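Reference note (not part of the patch): the OC_FRAG_COPY_MMX macro in mmxfrag.h copies an 8x8 pixel block in two batches of four rows, one movq load/store pair per 8-byte row. The following plain-C sketch shows the same operation for readers unfamiliar with the MMX idiom; the function name is mine, not libtheora's, and the real macro avoids the per-row loop by keeping four rows in registers at a time.

#include <string.h>

/* Scalar reference for the 8x8 fragment copy: eight rows of eight bytes,
   with ystride bytes between the start of consecutive rows. */
static void frag_copy_ref(unsigned char *dst, const unsigned char *src,
                          int ystride) {
  int i;
  for (i = 0; i < 8; i++) {
    memcpy(dst, src, 8);  /* one row, the counterpart of a movq load/store */
    dst += ystride;
    src += ystride;
  }
}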
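Reference note (not part of the patch): OC_TRANSPOSE in mmxidct.c performs two independent 4x4 transposes in place, as described by the register diagram in its comment, so that the row iDCT can be followed by a column iDCT without a full 8x8 transpose. A scalar sketch of one such 4x4 transpose, using a stand-in coeff_t typedef in place of libtheora's ogg_int16_t:

typedef short coeff_t;  /* stands in for ogg_int16_t */

/* In-place transpose of one 4x4 block of 16-bit coefficients; stride is the
   row pitch in coefficients (8 for a quadrant of an 8x8 block). */
static void transpose4x4(coeff_t *blk, int stride) {
  int i, j;
  for (i = 0; i < 4; i++) {
    for (j = i + 1; j < 4; j++) {
      coeff_t t = blk[i*stride + j];
      blk[i*stride + j] = blk[j*stride + i];
      blk[j*stride + i] = t;
    }
  }
}

The MMX version achieves the same shuffle with punpcklwd/punpckhwd and punpckldq/punpckhdq pairs rather than element swaps.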
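Reference note (not part of the patch): OC_LOOP_FILTER8_MMX in mmxloop.h computes the filter response R_i = (a_i - d_i + 3*(c_i - b_i) + 4) >> 3 for eight columns at once, then limits it as min(|R|, max(2*L - |R|, 0)) with the original sign restored, exactly as its comments state (cf. Section 7.10 of the Theora specification). A minimal scalar sketch of that per-column computation follows; the function and helper names are mine, and the MMX code works on -R and saturating byte arithmetic to avoid the missing signed-saturation instructions.

#include <stdlib.h>

static int clamp255(int x) { return x < 0 ? 0 : x > 255 ? 255 : x; }

/* One column of the edge filter: a, b, c, d are four adjacent pixels
   straddling a block boundary (b and c are the two that get modified),
   ll is the limit value L taken from loop_filter_limits[qi]. */
static void loop_filter_column(int a, unsigned char *b, unsigned char *c,
                               int d, int ll) {
  int r, mag;
  r = (a - d + 3*(*c - *b) + 4) >> 3;         /* filter response R            */
  mag = abs(r);
  mag = mag < 2*ll - mag ? mag : 2*ll - mag;  /* min(|R|, 2*L - |R|)          */
  if (mag < 0) mag = 0;                       /* ...clamped at zero           */
  r = r < 0 ? -mag : mag;                     /* lflim(R, L): restore the sign */
  *b = (unsigned char)clamp255(*b + r);
  *c = (unsigned char)clamp255(*c - r);
}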
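Reference note (not part of the patch): oc_state_frag_recon_mmx in mmxstate.c short-circuits blocks where only the DC coefficient was decoded (_last_zzi < 2): it dequantizes and rounds the DC term once and broadcasts it to all 64 coefficients instead of running an iDCT. A C sketch of that special case, with a hypothetical function name; the MMX code broadcasts the value via punpcklwd/punpckldq and sixteen movq stores.

#include <ogg/ogg.h>  /* ogg_int16_t, ogg_int32_t, ogg_uint16_t */

/* DC-only reconstruction: dequantize and round the DC term once, then
   fill all 64 coefficients with it. */
static void dc_only_fill(ogg_int16_t dct_coeffs[64], ogg_uint16_t dc_quant) {
  ogg_int16_t p;
  int i;
  /* Rounded here because no iDCT rounding will be applied afterwards. */
  p = (ogg_int16_t)((dct_coeffs[0]*(ogg_int32_t)dc_quant + 15) >> 5);
  for (i = 0; i < 64; i++) dct_coeffs[i] = p;
}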