1 files changed, 969 insertions, 969 deletions
diff --git a/thirdparty/libtheora/x86_vc/mmxencfrag.c b/thirdparty/libtheora/x86_vc/mmxencfrag.c
index ac9dacf377..94f1d06513 100644
--- a/thirdparty/libtheora/x86_vc/mmxencfrag.c
+++ b/thirdparty/libtheora/x86_vc/mmxencfrag.c
@@ -1,969 +1,969 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
-
- ********************************************************************/
-#include <stddef.h>
-#include "x86enc.h"
-
-#if defined(OC_X86_ASM)
-
-unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
- const unsigned char *_ref,int _ystride){
-  ptrdiff_t ret;
-  __asm{
-#define SRC esi
-#define REF edx
-#define YSTRIDE ecx
-#define YSTRIDE3 edi
-    mov YSTRIDE,_ystride
-    mov SRC,_src
-    mov REF,_ref
-    /*Load the first 4 rows of each block.*/
-    movq mm0,[SRC]
-    movq mm1,[REF]
-    movq mm2,[SRC][YSTRIDE]
-    movq mm3,[REF][YSTRIDE]
-    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
-    movq mm4,[SRC+YSTRIDE*2]
-    movq mm5,[REF+YSTRIDE*2]
-    movq mm6,[SRC+YSTRIDE3]
-    movq mm7,[REF+YSTRIDE3]
-    /*Compute their SADs and add them in mm0*/
-    psadbw mm0,mm1
-    psadbw mm2,mm3
-    lea SRC,[SRC+YSTRIDE*4]
-    paddw mm0,mm2
-    lea REF,[REF+YSTRIDE*4]
-    /*Load the next 3 rows as registers become available.*/
-    movq mm2,[SRC]
-    movq mm3,[REF]
-    psadbw mm4,mm5
-    psadbw mm6,mm7
-    paddw mm0,mm4
-    movq mm5,[REF+YSTRIDE]
-    movq mm4,[SRC+YSTRIDE]
-    paddw mm0,mm6
-    movq mm7,[REF+YSTRIDE*2]
-    movq mm6,[SRC+YSTRIDE*2]
-    /*Start adding their SADs to mm0*/
-    psadbw mm2,mm3
-    psadbw mm4,mm5
-    paddw mm0,mm2
-    psadbw mm6,mm7
-    /*Load last row as registers become available.*/
-    movq mm2,[SRC+YSTRIDE3]
-    movq mm3,[REF+YSTRIDE3]
-    /*And finish adding up their SADs.*/
-    paddw mm0,mm4
-    psadbw mm2,mm3
-    paddw mm0,mm6
-    paddw mm0,mm2
-    movd [ret],mm0
-#undef SRC
-#undef REF
-#undef YSTRIDE
-#undef YSTRIDE3
-  }
-  return (unsigned)ret;
-}
-
-unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref,int _ystride,unsigned _thresh){
-  /*Early termination is for suckers.*/
-  return oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
-}
-
-#define OC_SAD2_LOOP __asm{ \
-  /*We want to compute (mm0+mm1>>1) on unsigned bytes without overflow, but \
-     pavgb computes (mm0+mm1+1>>1). \
-   The latter is exactly 1 too large when the low bit of two corresponding \
-    bytes is only set in one of them. \
-   Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
-    correct the output of pavgb.*/ \
-  __asm  movq mm6,mm0 \
-  __asm  lea REF1,[REF1+YSTRIDE*2] \
-  __asm  pxor mm0,mm1 \
-  __asm  pavgb mm6,mm1 \
-  __asm  lea REF2,[REF2+YSTRIDE*2] \
-  __asm  movq mm1,mm2 \
-  __asm  pand mm0,mm7 \
-  __asm  pavgb mm2,mm3 \
-  __asm  pxor mm1,mm3 \
-  __asm  movq mm3,[REF2+YSTRIDE] \
-  __asm  psubb mm6,mm0 \
-  __asm  movq mm0,[REF1] \
-  __asm  pand mm1,mm7 \
-  __asm  psadbw mm4,mm6 \
-  __asm  movd mm6,RET \
-  __asm  psubb mm2,mm1 \
-  __asm  movq mm1,[REF2] \
-  __asm  lea SRC,[SRC+YSTRIDE*2] \
-  __asm  psadbw mm5,mm2 \
-  __asm  movq mm2,[REF1+YSTRIDE] \
-  __asm  paddw mm5,mm4 \
-  __asm  movq mm4,[SRC] \
-  __asm  paddw mm6,mm5 \
-  __asm  movq mm5,[SRC+YSTRIDE] \
-  __asm  movd RET,mm6 \
-}
-
-/*Same as above, but does not pre-load the next two rows.*/
-#define OC_SAD2_TAIL __asm{ \
-  __asm  movq mm6,mm0 \
-  __asm  pavgb mm0,mm1 \
-  __asm  pxor mm6,mm1 \
-  __asm  movq mm1,mm2 \
-  __asm  pand mm6,mm7 \
-  __asm  pavgb mm2,mm3 \
-  __asm  pxor mm1,mm3 \
-  __asm  psubb mm0,mm6 \
-  __asm  pand mm1,mm7 \
-  __asm  psadbw mm4,mm0 \
-  __asm  psubb mm2,mm1 \
-  __asm  movd mm6,RET \
-  __asm  psadbw mm5,mm2 \
-  __asm  paddw mm5,mm4 \
-  __asm  paddw mm6,mm5 \
-  __asm  movd RET,mm6 \
-}
-
-unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
- unsigned _thresh){
-  ptrdiff_t ret;
-  __asm{
-#define REF1 ecx
-#define REF2 edi
-#define YSTRIDE esi
-#define SRC edx
-#define RET eax
-    mov YSTRIDE,_ystride
-    mov SRC,_src
-    mov REF1,_ref1
-    mov REF2,_ref2
-    movq mm0,[REF1]
-    movq mm1,[REF2]
-    movq mm2,[REF1+YSTRIDE]
-    movq mm3,[REF2+YSTRIDE]
-    xor RET,RET
-    movq mm4,[SRC]
-    pxor mm7,mm7
-    pcmpeqb mm6,mm6
-    movq mm5,[SRC+YSTRIDE]
-    psubb mm7,mm6
-    OC_SAD2_LOOP
-    OC_SAD2_LOOP
-    OC_SAD2_LOOP
-    OC_SAD2_TAIL
-    mov [ret],RET
-#undef REF1
-#undef REF2
-#undef YSTRIDE
-#undef SRC
-#undef RET
-  }
-  return (unsigned)ret;
-}
-
-/*Load an 8x4 array of pixel values from %[src] and %[ref] and compute their
-  16-bit difference in mm0...mm7.*/
-#define OC_LOAD_SUB_8x4(_off) __asm{ \
-  __asm  movd mm0,[_off+SRC] \
-  __asm  movd mm4,[_off+REF] \
-  __asm  movd mm1,[_off+SRC+SRC_YSTRIDE] \
-  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
-  __asm  movd mm5,[_off+REF+REF_YSTRIDE] \
-  __asm  lea REF,[REF+REF_YSTRIDE*2] \
-  __asm  movd mm2,[_off+SRC] \
-  __asm  movd mm7,[_off+REF] \
-  __asm  movd mm3,[_off+SRC+SRC_YSTRIDE] \
-  __asm  movd mm6,[_off+REF+REF_YSTRIDE] \
-  __asm  punpcklbw mm0,mm4 \
-  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
-  __asm  punpcklbw mm4,mm4 \
-  __asm  lea REF,[REF+REF_YSTRIDE*2] \
-  __asm  psubw mm0,mm4 \
-  __asm  movd mm4,[_off+SRC] \
-  __asm  movq [_off*2+BUF],mm0 \
-  __asm  movd mm0,[_off+REF] \
-  __asm  punpcklbw mm1,mm5 \
-  __asm  punpcklbw mm5,mm5 \
-  __asm  psubw mm1,mm5 \
-  __asm  movd mm5,[_off+SRC+SRC_YSTRIDE] \
-  __asm  punpcklbw mm2,mm7 \
-  __asm  punpcklbw mm7,mm7 \
-  __asm  psubw mm2,mm7 \
-  __asm  movd mm7,[_off+REF+REF_YSTRIDE] \
-  __asm  punpcklbw mm3,mm6 \
-  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
-  __asm  punpcklbw mm6,mm6 \
-  __asm  psubw mm3,mm6 \
-  __asm  movd mm6,[_off+SRC] \
-  __asm  punpcklbw mm4,mm0 \
-  __asm  lea REF,[REF+REF_YSTRIDE*2] \
-  __asm  punpcklbw mm0,mm0 \
-  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
-  __asm  psubw mm4,mm0 \
-  __asm  movd mm0,[_off+REF] \
-  __asm  punpcklbw mm5,mm7 \
-  __asm  neg SRC_YSTRIDE \
-  __asm  punpcklbw mm7,mm7 \
-  __asm  psubw mm5,mm7 \
-  __asm  movd mm7,[_off+SRC+SRC_YSTRIDE] \
-  __asm  punpcklbw mm6,mm0 \
-  __asm  lea REF,[REF+REF_YSTRIDE*2] \
-  __asm  punpcklbw mm0,mm0 \
-  __asm  neg REF_YSTRIDE \
-  __asm  psubw mm6,mm0 \
-  __asm  movd mm0,[_off+REF+REF_YSTRIDE] \
-  __asm  lea SRC,[SRC+SRC_YSTRIDE*8] \
-  __asm  punpcklbw mm7,mm0 \
-  __asm  neg SRC_YSTRIDE \
-  __asm  punpcklbw mm0,mm0 \
-  __asm  lea REF,[REF+REF_YSTRIDE*8] \
-  __asm  psubw mm7,mm0 \
-  __asm  neg REF_YSTRIDE \
-  __asm  movq mm0,[_off*2+BUF] \
-}
-
-/*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/
-#define OC_LOAD_8x4(_off) __asm{ \
-  __asm  movd mm0,[_off+SRC] \
-  __asm  movd mm1,[_off+SRC+YSTRIDE] \
-  __asm  movd mm2,[_off+SRC+YSTRIDE*2] \
-  __asm  pxor mm7,mm7 \
-  __asm  movd mm3,[_off+SRC+YSTRIDE3] \
-  __asm  punpcklbw mm0,mm7 \
-  __asm  movd mm4,[_off+SRC4] \
-  __asm  punpcklbw mm1,mm7 \
-  __asm  movd mm5,[_off+SRC4+YSTRIDE] \
-  __asm  punpcklbw mm2,mm7 \
-  __asm  movd mm6,[_off+SRC4+YSTRIDE*2] \
-  __asm  punpcklbw mm3,mm7 \
-  __asm  movd mm7,[_off+SRC4+YSTRIDE3] \
-  __asm  punpcklbw mm4,mm4 \
-  __asm  punpcklbw mm5,mm5 \
-  __asm  psrlw mm4,8 \
-  __asm  psrlw mm5,8 \
-  __asm  punpcklbw mm6,mm6 \
-  __asm  punpcklbw mm7,mm7 \
-  __asm  psrlw mm6,8 \
-  __asm  psrlw mm7,8 \
-}
-
-/*Performs the first two stages of an 8-point 1-D Hadamard transform.
-  The transform is performed in place, except that outputs 0-3 are swapped with
-   outputs 4-7.
-  Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to
-   perform this stage in place with no temporary registers).*/
-#define OC_HADAMARD_AB_8x4 __asm{ \
-  /*Stage A: \
-    Outputs 0-3 are swapped with 4-7 here.*/ \
-  __asm  paddw mm5,mm1 \
-  __asm  paddw mm6,mm2 \
-  __asm  paddw mm1,mm1 \
-  __asm  paddw mm2,mm2 \
-  __asm  psubw mm1,mm5 \
-  __asm  psubw mm2,mm6 \
-  __asm  paddw mm7,mm3 \
-  __asm  paddw mm4,mm0 \
-  __asm  paddw mm3,mm3 \
-  __asm  paddw mm0,mm0 \
-  __asm  psubw mm3,mm7 \
-  __asm  psubw mm0,mm4 \
-   /*Stage B:*/ \
-  __asm  paddw mm0,mm2 \
-  __asm  paddw mm1,mm3 \
-  __asm  paddw mm4,mm6 \
-  __asm  paddw mm5,mm7 \
-  __asm  paddw mm2,mm2 \
-  __asm  paddw mm3,mm3 \
-  __asm  paddw mm6,mm6 \
-  __asm  paddw mm7,mm7 \
-  __asm  psubw mm2,mm0 \
-  __asm  psubw mm3,mm1 \
-  __asm  psubw mm6,mm4 \
-  __asm  psubw mm7,mm5 \
-}
-
-/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
-  Ouputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
-   place with no temporary registers).*/
-#define OC_HADAMARD_C_8x4 __asm{ \
-  /*Stage C:*/ \
-  __asm  paddw mm0,mm1 \
-  __asm  paddw mm2,mm3 \
-  __asm  paddw mm4,mm5 \
-  __asm  paddw mm6,mm7 \
-  __asm  paddw mm1,mm1 \
-  __asm  paddw mm3,mm3 \
-  __asm  paddw mm5,mm5 \
-  __asm  paddw mm7,mm7 \
-  __asm  psubw mm1,mm0 \
-  __asm  psubw mm3,mm2 \
-  __asm  psubw mm5,mm4 \
-  __asm  psubw mm7,mm6 \
-}
-
-/*Performs an 8-point 1-D Hadamard transform.
-  The transform is performed in place, except that outputs 0-3 are swapped with
-   outputs 4-7.
-  Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform
-   in place with no temporary registers).*/
-#define OC_HADAMARD_8x4 __asm{ \
-  OC_HADAMARD_AB_8x4 \
-  OC_HADAMARD_C_8x4 \
-}
-
-/*Performs the first part of the final stage of the Hadamard transform and
-   summing of absolute values.
-  At the end of this part, mm1 will contain the DC coefficient of the
-   transform.*/
-#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) __asm{ \
-  /*We use the fact that \
-      (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
-     to merge the final butterfly with the abs and the first stage of \
-     accumulation. \
-    Thus we can avoid using pabsw, which is not available until SSSE3. \
-    Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
-     implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
-     registers). \
-    Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
-    This implementation is only 26 (+4 for spilling registers).*/ \
-  __asm  movq [_r7+BUF],mm7 \
-  __asm  movq [_r6+BUF],mm6 \
-  /*mm7={0x7FFF}x4 \
-    mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
-  __asm  pcmpeqb mm7,mm7 \
-  __asm  movq mm6,mm0 \
-  __asm  psrlw mm7,1 \
-  __asm  paddw mm6,mm1 \
-  __asm  pmaxsw mm0,mm1 \
-  __asm  paddsw mm6,mm7 \
-  __asm  psubw mm0,mm6 \
-  /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
-    mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \
-  __asm  movq mm6,mm2 \
-  __asm  movq mm1,mm4 \
-  __asm  pmaxsw mm2,mm3 \
-  __asm  pmaxsw mm4,mm5 \
-  __asm  paddw mm6,mm3 \
-  __asm  paddw mm1,mm5 \
-  __asm  movq mm3,[_r7+BUF] \
-}
-
-/*Performs the second part of the final stage of the Hadamard transform and
-   summing of absolute values.*/
-#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) __asm{ \
-  __asm  paddsw mm6,mm7 \
-  __asm  movq mm5,[_r6+BUF] \
-  __asm  paddsw mm1,mm7 \
-  __asm  psubw mm2,mm6 \
-  __asm  psubw mm4,mm1 \
-  /*mm7={1}x4 (needed for the horizontal add that follows) \
-    mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
-  __asm  movq mm6,mm3 \
-  __asm  pmaxsw mm3,mm5 \
-  __asm  paddw mm0,mm2 \
-  __asm  paddw mm6,mm5 \
-  __asm  paddw mm0,mm4 \
-  __asm  paddsw mm6,mm7 \
-  __asm  paddw mm0,mm3 \
-  __asm  psrlw mm7,14 \
-  __asm  psubw mm0,mm6 \
-}
-
-/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
-   absolute value of each component, and accumulates everything into mm0.
-  This is the only portion of SATD which requires MMXEXT (we could use plain
-   MMX, but it takes 4 instructions and an extra register to work around the
-   lack of a pmaxsw, which is a pretty serious penalty).*/
-#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
-  OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
-  OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
-}
-
-/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
-   component, and accumulates everything into mm0.
-  Note that mm0 will have an extra 4 added to each column, and that after
-   removing this value, the remainder will be half the conventional value.*/
-#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
-  OC_HADAMARD_AB_8x4 \
-  OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \
-}
-
-/*Performs two 4x4 transposes (mostly) in place.
-  On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7}
-   contains rows {a,b,c,d}.
-  On output, {0x40,0x50,0x60,0x70}+_off+BUF contains {e,f,g,h}^T, and
-   {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/
-#define OC_TRANSPOSE_4x4x2(_off) __asm{ \
-  /*First 4x4 transpose:*/ \
-  __asm  movq [0x10+_off+BUF],mm5 \
-  /*mm0 = e3 e2 e1 e0 \
-    mm1 = f3 f2 f1 f0 \
-    mm2 = g3 g2 g1 g0 \
-    mm3 = h3 h2 h1 h0*/ \
-  __asm  movq mm5,mm2 \
-  __asm  punpcklwd mm2,mm3 \
-  __asm  punpckhwd mm5,mm3 \
-  __asm  movq mm3,mm0 \
-  __asm  punpcklwd mm0,mm1 \
-  __asm  punpckhwd mm3,mm1 \
-  /*mm0 = f1 e1 f0 e0 \
-    mm3 = f3 e3 f2 e2 \
-    mm2 = h1 g1 h0 g0 \
-    mm5 = h3 g3 h2 g2*/ \
-  __asm  movq mm1,mm0 \
-  __asm  punpckldq mm0,mm2 \
-  __asm  punpckhdq mm1,mm2 \
-  __asm  movq mm2,mm3 \
-  __asm  punpckhdq mm3,mm5 \
-  __asm  movq [0x40+_off+BUF],mm0 \
-  __asm  punpckldq mm2,mm5 \
-  /*mm0 = h0 g0 f0 e0 \
-    mm1 = h1 g1 f1 e1 \
-    mm2 = h2 g2 f2 e2 \
-    mm3 = h3 g3 f3 e3*/ \
-  __asm  movq mm5,[0x10+_off+BUF] \
-  /*Second 4x4 transpose:*/ \
-  /*mm4 = a3 a2 a1 a0 \
-    mm5 = b3 b2 b1 b0 \
-    mm6 = c3 c2 c1 c0 \
-    mm7 = d3 d2 d1 d0*/ \
-  __asm  movq mm0,mm6 \
-  __asm  punpcklwd mm6,mm7 \
-  __asm  movq [0x50+_off+BUF],mm1 \
-  __asm  punpckhwd mm0,mm7 \
-  __asm  movq mm7,mm4 \
-  __asm  punpcklwd mm4,mm5 \
-  __asm  movq [0x60+_off+BUF],mm2 \
-  __asm  punpckhwd mm7,mm5 \
-  /*mm4 = b1 a1 b0 a0 \
-    mm7 = b3 a3 b2 a2 \
-    mm6 = d1 c1 d0 c0 \
-    mm0 = d3 c3 d2 c2*/ \
-  __asm  movq mm5,mm4 \
-  __asm  punpckldq mm4,mm6 \
-  __asm  movq [0x70+_off+BUF],mm3 \
-  __asm  punpckhdq mm5,mm6 \
-  __asm  movq mm6,mm7 \
-  __asm  punpckhdq mm7,mm0 \
-  __asm  punpckldq mm6,mm0 \
-  /*mm4 = d0 c0 b0 a0 \
-    mm5 = d1 c1 b1 a1 \
-    mm6 = d2 c2 b2 a2 \
-    mm7 = d3 c3 b3 a3*/ \
-}
-
-static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
- int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){
-  OC_ALIGN8(ogg_int16_t  buf[64]);
-  ogg_int16_t           *bufp;
-  unsigned               ret1;
-  unsigned               ret2;
-  bufp=buf;
-  __asm{
-#define SRC esi
-#define REF eax
-#define SRC_YSTRIDE ecx
-#define REF_YSTRIDE edx
-#define BUF edi
-#define RET eax
-#define RET2 edx
-    mov SRC,_src
-    mov SRC_YSTRIDE,_src_ystride
-    mov REF,_ref
-    mov REF_YSTRIDE,_ref_ystride
-    mov BUF,bufp
-    OC_LOAD_SUB_8x4(0x00)
-    OC_HADAMARD_8x4
-    OC_TRANSPOSE_4x4x2(0x00)
-    /*Finish swapping out this 8x4 block to make room for the next one.
-      mm0...mm3 have been swapped out already.*/
-    movq [0x00+BUF],mm4
-    movq [0x10+BUF],mm5
-    movq [0x20+BUF],mm6
-    movq [0x30+BUF],mm7
-    OC_LOAD_SUB_8x4(0x04)
-    OC_HADAMARD_8x4
-    OC_TRANSPOSE_4x4x2(0x08)
-    /*Here the first 4x4 block of output from the last transpose is the second
-       4x4 block of input for the next transform.
-      We have cleverly arranged that it already be in the appropriate place, so
-       we only have to do half the loads.*/
-    movq mm1,[0x10+BUF]
-    movq mm2,[0x20+BUF]
-    movq mm3,[0x30+BUF]
-    movq mm0,[0x00+BUF]
-    OC_HADAMARD_ABS_ACCUM_8x4(0x28,0x38)
-    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
-       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
-       for the factor of two we dropped + 3 for the vertical accumulation).
-      Now we finally have to promote things to dwords.
-      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
-       latency of pmaddwd by starting the next series of loads now.*/
-    mov RET2,_thresh
-    pmaddwd mm0,mm7
-    movq mm1,[0x50+BUF]
-    movq mm5,[0x58+BUF]
-    movq mm4,mm0
-    movq mm2,[0x60+BUF]
-    punpckhdq mm0,mm0
-    movq mm6,[0x68+BUF]
-    paddd mm4,mm0
-    movq mm3,[0x70+BUF]
-    movd RET,mm4
-    movq mm7,[0x78+BUF]
-    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
-       added to them, and a factor of two removed; correct the final sum here.*/
-    lea RET,[RET+RET-32]
-    movq mm0,[0x40+BUF]
-    cmp RET,RET2
-    movq mm4,[0x48+BUF]
-    jae at_end
-    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
-    pmaddwd mm0,mm7
-    /*There isn't much to stick in here to hide the latency this time, but the
-       alternative to pmaddwd is movq->punpcklwd->punpckhwd->paddd, whose
-       latency is even worse.*/
-    sub RET,32
-    movq mm4,mm0
-    punpckhdq mm0,mm0
-    paddd mm4,mm0
-    movd RET2,mm4
-    lea RET,[RET+RET2*2]
-    align 16
-at_end:
-    mov ret1,RET
-#undef SRC
-#undef REF
-#undef SRC_YSTRIDE
-#undef REF_YSTRIDE
-#undef BUF
-#undef RET
-#undef RET2
-  }
-  return ret1;
-}
-
-unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref,int _ystride,unsigned _thresh){
-  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,_ref,_ystride,_thresh);
-}
-
-
-/*Our internal implementation of frag_copy2 takes an extra stride parameter so
-   we can share code with oc_enc_frag_satd2_thresh_mmxext().*/
-static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
- const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
-  __asm{
-    /*Load the first 3 rows.*/
-#define DST_YSTRIDE edi
-#define SRC_YSTRIDE esi
-#define DST eax
-#define SRC1 edx
-#define SRC2 ecx
-    mov DST_YSTRIDE,_dst_ystride
-    mov SRC_YSTRIDE,_src_ystride
-    mov DST,_dst
-    mov SRC1,_src1
-    mov SRC2,_src2
-    movq mm0,[SRC1]
-    movq mm1,[SRC2]
-    movq mm2,[SRC1+SRC_YSTRIDE]
-    lea SRC1,[SRC1+SRC_YSTRIDE*2]
-    movq mm3,[SRC2+SRC_YSTRIDE]
-    lea SRC2,[SRC2+SRC_YSTRIDE*2]
-    pxor mm7,mm7
-    movq mm4,[SRC1]
-    pcmpeqb mm6,mm6
-    movq mm5,[SRC2]
-    /*mm7={1}x8.*/
-    psubb mm7,mm6
-    /*Start averaging mm0 and mm1 into mm6.*/
-    movq mm6,mm0
-    pxor mm0,mm1
-    pavgb mm6,mm1
-    /*mm1 is free, start averaging mm3 into mm2 using mm1.*/
-    movq mm1,mm2
-    pand mm0,mm7
-    pavgb mm2,mm3
-    pxor mm1,mm3
-    /*mm3 is free.*/
-    psubb mm6,mm0
-    /*mm0 is free, start loading the next row.*/
-    movq mm0,[SRC1+SRC_YSTRIDE]
-    /*Start averaging mm5 and mm4 using mm3.*/
-    movq mm3,mm4
-    /*mm6 [row 0] is done; write it out.*/
-    movq [DST],mm6
-    pand mm1,mm7
-    pavgb mm4,mm5
-    psubb mm2,mm1
-    /*mm1 is free, continue loading the next row.*/
-    movq mm1,[SRC2+SRC_YSTRIDE]
-    pxor mm3,mm5
-    lea SRC1,[SRC1+SRC_YSTRIDE*2]
-    /*mm2 [row 1] is done; write it out.*/
-    movq [DST+DST_YSTRIDE],mm2
-    pand mm3,mm7
-    /*Start loading the next row.*/
-    movq mm2,[SRC1]
-    lea DST,[DST+DST_YSTRIDE*2]
-    psubb mm4,mm3
-    lea SRC2,[SRC2+SRC_YSTRIDE*2]
-    /*mm4 [row 2] is done; write it out.*/
-    movq [DST],mm4
-    /*Continue loading the next row.*/
-    movq mm3,[SRC2]
-    /*Start averaging mm0 and mm1 into mm6.*/
-    movq mm6,mm0
-    pxor mm0,mm1
-    /*Start loading the next row.*/
-    movq mm4,[SRC1+SRC_YSTRIDE]
-    pavgb mm6,mm1
-    /*mm1 is free; start averaging mm3 into mm2 using mm1.*/
-    movq mm1,mm2
-    pand mm0,mm7
-    /*Continue loading the next row.*/
-    movq mm5,[SRC2+SRC_YSTRIDE]
-    pavgb mm2,mm3
-    lea SRC1,[SRC1+SRC_YSTRIDE*2]
-    pxor mm1,mm3
-    /*mm3 is free.*/
-    psubb mm6,mm0
-    /*mm0 is free, start loading the next row.*/
-    movq mm0,[SRC1]
-    /*Start averaging mm5 into mm4 using mm3.*/
-    movq mm3,mm4
-    /*mm6 [row 3] is done; write it out.*/
-    movq [DST+DST_YSTRIDE],mm6
-    pand mm1,mm7
-    lea SRC2,[SRC2+SRC_YSTRIDE*2]
-    pavgb mm4,mm5
-    lea DST,[DST+DST_YSTRIDE*2]
-    psubb mm2,mm1
-    /*mm1 is free; continue loading the next row.*/
-    movq mm1,[SRC2]
-    pxor mm3,mm5
-    /*mm2 [row 4] is done; write it out.*/
-    movq [DST],mm2
-    pand mm3,mm7
-    /*Start loading the next row.*/
-    movq mm2,[SRC1+SRC_YSTRIDE]
-    psubb mm4,mm3
-    /*Start averaging mm0 and mm1 into mm6.*/
-    movq mm6,mm0
-    /*Continue loading the next row.*/
-    movq mm3,[SRC2+SRC_YSTRIDE]
-    /*mm4 [row 5] is done; write it out.*/
-    movq [DST+DST_YSTRIDE],mm4
-    pxor mm0,mm1
-    pavgb mm6,mm1
-    /*mm4 is free; start averaging mm3 into mm2 using mm4.*/
-    movq mm4,mm2
-    pand mm0,mm7
-    pavgb mm2,mm3
-    pxor mm4,mm3
-    lea DST,[DST+DST_YSTRIDE*2]
-    psubb mm6,mm0
-    pand mm4,mm7
-    /*mm6 [row 6] is done, write it out.*/
-    movq [DST],mm6
-    psubb mm2,mm4
-    /*mm2 [row 7] is done, write it out.*/
-    movq [DST+DST_YSTRIDE],mm2
-#undef SRC1
-#undef SRC2
-#undef SRC_YSTRIDE
-#undef DST_YSTRIDE
-#undef DST
-  }
-}
-
-unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
- unsigned _thresh){
-  OC_ALIGN8(unsigned char ref[64]);
-  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
-  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,ref,8,_thresh);
-}
-
-unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
- int _ystride){
-  OC_ALIGN8(ogg_int16_t  buf[64]);
-  ogg_int16_t           *bufp;
-  unsigned               ret1;
-  unsigned               ret2;
-  bufp=buf;
-  __asm{
-#define SRC eax
-#define SRC4 esi
-#define BUF edi
-#define RET eax
-#define RET_WORD ax
-#define RET2 ecx
-#define YSTRIDE edx
-#define YSTRIDE3 ecx
-    mov SRC,_src
-    mov BUF,bufp
-    mov YSTRIDE,_ystride
-    /* src4 = src+4*ystride */
-    lea SRC4,[SRC+YSTRIDE*4]
-    /* ystride3 = 3*ystride */
-    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
-    OC_LOAD_8x4(0x00)
-    OC_HADAMARD_8x4
-    OC_TRANSPOSE_4x4x2(0x00)
-    /*Finish swapping out this 8x4 block to make room for the next one.
-      mm0...mm3 have been swapped out already.*/
-    movq [0x00+BUF],mm4
-    movq [0x10+BUF],mm5
-    movq [0x20+BUF],mm6
-    movq [0x30+BUF],mm7
-    OC_LOAD_8x4(0x04)
-    OC_HADAMARD_8x4
-    OC_TRANSPOSE_4x4x2(0x08)
-    /*Here the first 4x4 block of output from the last transpose is the second
-      4x4 block of input for the next transform.
-      We have cleverly arranged that it already be in the appropriate place, so
-      we only have to do half the loads.*/
-    movq mm1,[0x10+BUF]
-    movq mm2,[0x20+BUF]
-    movq mm3,[0x30+BUF]
-    movq mm0,[0x00+BUF]
-    /*We split out the stages here so we can save the DC coefficient in the
-      middle.*/
-    OC_HADAMARD_AB_8x4
-    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
-    movd RET,mm1
-    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
-    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
-      difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
-      for the factor of two we dropped + 3 for the vertical accumulation).
-      Now we finally have to promote things to dwords.
-      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
-      latency of pmaddwd by starting the next series of loads now.*/
-    pmaddwd mm0,mm7
-    movq mm1,[0x50+BUF]
-    movq mm5,[0x58+BUF]
-    movq mm2,[0x60+BUF]
-    movq mm4,mm0
-    movq mm6,[0x68+BUF]
-    punpckhdq mm0,mm0
-    movq mm3,[0x70+BUF]
-    paddd mm4,mm0
-    movq mm7,[0x78+BUF]
-    movd RET2,mm4
-    movq mm0,[0x40+BUF]
-    movq mm4,[0x48+BUF]
-    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
-    pmaddwd mm0,mm7
-    /*We assume that the DC coefficient is always positive (which is true,
-    because the input to the INTRA transform was not a difference).*/
-    movzx RET,RET_WORD
-    add RET2,RET2
-    sub RET2,RET
-    movq mm4,mm0
-    punpckhdq mm0,mm0
-    paddd mm4,mm0
-    movd RET,mm4
-    lea RET,[-64+RET2+RET*2]
-    mov [ret1],RET
-#undef SRC
-#undef SRC4
-#undef BUF
-#undef RET
-#undef RET_WORD
-#undef RET2
-#undef YSTRIDE
-#undef YSTRIDE3
-  }
-  return ret1;
-}
-
-void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
- const unsigned char *_src, const unsigned char *_ref,int _ystride){
-  int i;
-  __asm  pxor mm7,mm7
-  for(i=4;i-->0;){
-    __asm{
-#define SRC edx
-#define YSTRIDE esi
-#define RESIDUE eax
-#define REF ecx
-      mov YSTRIDE,_ystride
-      mov RESIDUE,_residue
-      mov SRC,_src
-      mov REF,_ref
-      /*mm0=[src]*/
-      movq mm0,[SRC]
-      /*mm1=[ref]*/
-      movq mm1,[REF]
-      /*mm4=[src+ystride]*/
-      movq mm4,[SRC+YSTRIDE]
-      /*mm5=[ref+ystride]*/
-      movq mm5,[REF+YSTRIDE]
-      /*Compute [src]-[ref].*/
-      movq mm2,mm0
-      punpcklbw mm0,mm7
-      movq mm3,mm1
-      punpckhbw mm2,mm7
-      punpcklbw mm1,mm7
-      punpckhbw mm3,mm7
-      psubw mm0,mm1
-      psubw mm2,mm3
-      /*Compute [src+ystride]-[ref+ystride].*/
-      movq mm1,mm4
-      punpcklbw mm4,mm7
-      movq mm3,mm5
-      punpckhbw mm1,mm7
-      lea SRC,[SRC+YSTRIDE*2]
-      punpcklbw mm5,mm7
-      lea REF,[REF+YSTRIDE*2]
-      punpckhbw mm3,mm7
-      psubw mm4,mm5
-      psubw mm1,mm3
-      /*Write the answer out.*/
-      movq [RESIDUE+0x00],mm0
-      movq [RESIDUE+0x08],mm2
-      movq [RESIDUE+0x10],mm4
-      movq [RESIDUE+0x18],mm1
-      lea RESIDUE,[RESIDUE+0x20]
-      mov _residue,RESIDUE
-      mov _src,SRC
-      mov _ref,REF
-#undef SRC
-#undef YSTRIDE
-#undef RESIDUE
-#undef REF
-    }
-  }
-}
-
-void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
- const unsigned char *_src,int _ystride){
-   __asm{
-#define YSTRIDE edx
-#define YSTRIDE3 edi
-#define RESIDUE ecx
-#define SRC eax
-    mov YSTRIDE,_ystride
-    mov RESIDUE,_residue
-    mov SRC,_src
-    /*mm0=[src]*/
-    movq mm0,[SRC]
-    /*mm1=[src+ystride]*/
-    movq mm1,[SRC+YSTRIDE]
-    /*mm6={-1}x4*/
-    pcmpeqw mm6,mm6
-    /*mm2=[src+2*ystride]*/
-    movq mm2,[SRC+YSTRIDE*2]
-    /*[ystride3]=3*[ystride]*/
-    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
-    /*mm6={1}x4*/
-    psllw mm6,15
-    /*mm3=[src+3*ystride]*/
-    movq mm3,[SRC+YSTRIDE3]
-    /*mm6={128}x4*/
-    psrlw mm6,8
-    /*mm7=0*/ 
-    pxor mm7,mm7
-    /*[src]=[src]+4*[ystride]*/
-    lea SRC,[SRC+YSTRIDE*4]
-    /*Compute [src]-128 and [src+ystride]-128*/
-    movq mm4,mm0
-    punpcklbw mm0,mm7
-    movq mm5,mm1
-    punpckhbw mm4,mm7
-    psubw mm0,mm6
-    punpcklbw mm1,mm7
-    psubw mm4,mm6
-    punpckhbw mm5,mm7
-    psubw mm1,mm6
-    psubw mm5,mm6
-    /*Write the answer out.*/
-    movq [RESIDUE+0x00],mm0
-    movq [RESIDUE+0x08],mm4
-    movq [RESIDUE+0x10],mm1
-    movq [RESIDUE+0x18],mm5
-    /*mm0=[src+4*ystride]*/
-    movq mm0,[SRC]
-    /*mm1=[src+5*ystride]*/
-    movq mm1,[SRC+YSTRIDE]
-    /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/
-    movq mm4,mm2
-    punpcklbw mm2,mm7
-    movq mm5,mm3
-    punpckhbw mm4,mm7
-    psubw mm2,mm6
-    punpcklbw mm3,mm7
-    psubw mm4,mm6
-    punpckhbw mm5,mm7
-    psubw mm3,mm6
-    psubw mm5,mm6
-    /*Write the answer out.*/
-    movq [RESIDUE+0x20],mm2
-    movq [RESIDUE+0x28],mm4
-    movq [RESIDUE+0x30],mm3
-    movq [RESIDUE+0x38],mm5
-    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
-    movq mm2,[SRC+YSTRIDE*2]
-    movq mm3,[SRC+YSTRIDE3]
-    movq mm4,mm0
-    punpcklbw mm0,mm7
-    movq mm5,mm1
-    punpckhbw mm4,mm7
-    psubw mm0,mm6
-    punpcklbw mm1,mm7
-    psubw mm4,mm6
-    punpckhbw mm5,mm7
-    psubw mm1,mm6
-    psubw mm5,mm6
-    /*Write the answer out.*/
-    movq [RESIDUE+0x40],mm0
-    movq [RESIDUE+0x48],mm4
-    movq [RESIDUE+0x50],mm1
-    movq [RESIDUE+0x58],mm5
-    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
-    movq mm4,mm2
-    punpcklbw mm2,mm7
-    movq mm5,mm3
-    punpckhbw mm4,mm7
-    psubw mm2,mm6
-    punpcklbw mm3,mm7
-    psubw mm4,mm6
-    punpckhbw mm5,mm7
-    psubw mm3,mm6
-    psubw mm5,mm6
-    /*Write the answer out.*/
-    movq [RESIDUE+0x60],mm2
-    movq [RESIDUE+0x68],mm4
-    movq [RESIDUE+0x70],mm3
-    movq [RESIDUE+0x78],mm5
-#undef YSTRIDE
-#undef YSTRIDE3
-#undef RESIDUE
-#undef SRC
-  }
-}
-
-void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
- const unsigned char *_src1,const unsigned char *_src2,int _ystride){
-  oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride);
-}
-
-#endif
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
+
+ ********************************************************************/
+#include <stddef.h>
+#include "x86enc.h"
+
+#if defined(OC_X86_ASM)
+
+unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  ptrdiff_t ret;
+  __asm{
+#define SRC esi
+#define REF edx
+#define YSTRIDE ecx
+#define YSTRIDE3 edi
+    mov YSTRIDE,_ystride
+    mov SRC,_src
+    mov REF,_ref
+    /*Load the first 4 rows of each block.*/
+    movq mm0,[SRC]
+    movq mm1,[REF]
+    movq mm2,[SRC][YSTRIDE]
+    movq mm3,[REF][YSTRIDE]
+    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
+    movq mm4,[SRC+YSTRIDE*2]
+    movq mm5,[REF+YSTRIDE*2]
+    movq mm6,[SRC+YSTRIDE3]
+    movq mm7,[REF+YSTRIDE3]
+    /*Compute their SADs and add them in mm0*/
+    psadbw mm0,mm1
+    psadbw mm2,mm3
+    lea SRC,[SRC+YSTRIDE*4]
+    paddw mm0,mm2
+    lea REF,[REF+YSTRIDE*4]
+    /*Load the next 3 rows as registers become available.*/
+    movq mm2,[SRC]
+    movq mm3,[REF]
+    psadbw mm4,mm5
+    psadbw mm6,mm7
+    paddw mm0,mm4
+    movq mm5,[REF+YSTRIDE]
+    movq mm4,[SRC+YSTRIDE]
+    paddw mm0,mm6
+    movq mm7,[REF+YSTRIDE*2]
+    movq mm6,[SRC+YSTRIDE*2]
+    /*Start adding their SADs to mm0*/
+    psadbw mm2,mm3
+    psadbw mm4,mm5
+    paddw mm0,mm2
+    psadbw mm6,mm7
+    /*Load last row as registers become available.*/
+    movq mm2,[SRC+YSTRIDE3]
+    movq mm3,[REF+YSTRIDE3]
+    /*And finish adding up their SADs.*/
+    paddw mm0,mm4
+    psadbw mm2,mm3
+    paddw mm0,mm6
+    paddw mm0,mm2
+    movd [ret],mm0
+#undef SRC
+#undef REF
+#undef YSTRIDE
+#undef YSTRIDE3
+  }
+  return (unsigned)ret;
+}
+
+unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh){
+  /*Early termination is for suckers.*/
+  return oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
+}
+
+#define OC_SAD2_LOOP __asm{ \
+  /*We want to compute (mm0+mm1>>1) on unsigned bytes without overflow, but \
+     pavgb computes (mm0+mm1+1>>1). \
+   The latter is exactly 1 too large when the low bit of two corresponding \
+    bytes is only set in one of them. \
+   Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
+    correct the output of pavgb.*/ \
+  __asm  movq mm6,mm0 \
+  __asm  lea REF1,[REF1+YSTRIDE*2] \
+  __asm  pxor mm0,mm1 \
+  __asm  pavgb mm6,mm1 \
+  __asm  lea REF2,[REF2+YSTRIDE*2] \
+  __asm  movq mm1,mm2 \
+  __asm  pand mm0,mm7 \
+  __asm  pavgb mm2,mm3 \
+  __asm  pxor mm1,mm3 \
+  __asm  movq mm3,[REF2+YSTRIDE] \
+  __asm  psubb mm6,mm0 \
+  __asm  movq mm0,[REF1] \
+  __asm  pand mm1,mm7 \
+  __asm  psadbw mm4,mm6 \
+  __asm  movd mm6,RET \
+  __asm  psubb mm2,mm1 \
+  __asm  movq mm1,[REF2] \
+  __asm  lea SRC,[SRC+YSTRIDE*2] \
+  __asm  psadbw mm5,mm2 \
+  __asm  movq mm2,[REF1+YSTRIDE] \
+  __asm  paddw mm5,mm4 \
+  __asm  movq mm4,[SRC] \
+  __asm  paddw mm6,mm5 \
+  __asm  movq mm5,[SRC+YSTRIDE] \
+  __asm  movd RET,mm6 \
+}
+
+/*Same as above, but does not pre-load the next two rows.*/
+#define OC_SAD2_TAIL __asm{ \
+  __asm  movq mm6,mm0 \
+  __asm  pavgb mm0,mm1 \
+  __asm  pxor mm6,mm1 \
+  __asm  movq mm1,mm2 \
+  __asm  pand mm6,mm7 \
+  __asm  pavgb mm2,mm3 \
+  __asm  pxor mm1,mm3 \
+  __asm  psubb mm0,mm6 \
+  __asm  pand mm1,mm7 \
+  __asm  psadbw mm4,mm0 \
+  __asm  psubb mm2,mm1 \
+  __asm  movd mm6,RET \
+  __asm  psadbw mm5,mm2 \
+  __asm  paddw mm5,mm4 \
+  __asm  paddw mm6,mm5 \
+  __asm  movd RET,mm6 \
+}
+
+unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh){
+  ptrdiff_t ret;
+  __asm{
+#define REF1 ecx
+#define REF2 edi
+#define YSTRIDE esi
+#define SRC edx
+#define RET eax
+    mov YSTRIDE,_ystride
+    mov SRC,_src
+    mov REF1,_ref1
+    mov REF2,_ref2
+    movq mm0,[REF1]
+    movq mm1,[REF2]
+    movq mm2,[REF1+YSTRIDE]
+    movq mm3,[REF2+YSTRIDE]
+    xor RET,RET
+    movq mm4,[SRC]
+    pxor mm7,mm7
+    pcmpeqb mm6,mm6
+    movq mm5,[SRC+YSTRIDE]
+    psubb mm7,mm6
+    OC_SAD2_LOOP
+    OC_SAD2_LOOP
+    OC_SAD2_LOOP
+    OC_SAD2_TAIL
+    mov [ret],RET
+#undef REF1
+#undef REF2
+#undef YSTRIDE
+#undef SRC
+#undef RET
+  }
+  return (unsigned)ret;
+}
+
+/*Load an 8x4 array of pixel values from %[src] and %[ref] and compute their
+  16-bit difference in mm0...mm7.*/
+#define OC_LOAD_SUB_8x4(_off) __asm{ \
+  __asm  movd mm0,[_off+SRC] \
+  __asm  movd mm4,[_off+REF] \
+  __asm  movd mm1,[_off+SRC+SRC_YSTRIDE] \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
+  __asm  movd mm5,[_off+REF+REF_YSTRIDE] \
+  __asm  lea REF,[REF+REF_YSTRIDE*2] \
+  __asm  movd mm2,[_off+SRC] \
+  __asm  movd mm7,[_off+REF] \
+  __asm  movd mm3,[_off+SRC+SRC_YSTRIDE] \
+  __asm  movd mm6,[_off+REF+REF_YSTRIDE] \
+  __asm  punpcklbw mm0,mm4 \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
+  __asm  punpcklbw mm4,mm4 \
+  __asm  lea REF,[REF+REF_YSTRIDE*2] \
+  __asm  psubw mm0,mm4 \
+  __asm  movd mm4,[_off+SRC] \
+  __asm  movq [_off*2+BUF],mm0 \
+  __asm  movd mm0,[_off+REF] \
+  __asm  punpcklbw mm1,mm5 \
+  __asm  punpcklbw mm5,mm5 \
+  __asm  psubw mm1,mm5 \
+  __asm  movd mm5,[_off+SRC+SRC_YSTRIDE] \
+  __asm  punpcklbw mm2,mm7 \
+  __asm  punpcklbw mm7,mm7 \
+  __asm  psubw mm2,mm7 \
+  __asm  movd mm7,[_off+REF+REF_YSTRIDE] \
+  __asm  punpcklbw mm3,mm6 \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
+  __asm  punpcklbw mm6,mm6 \
+  __asm  psubw mm3,mm6 \
+  __asm  movd mm6,[_off+SRC] \
+  __asm  punpcklbw mm4,mm0 \
+  __asm  lea REF,[REF+REF_YSTRIDE*2] \
+  __asm  punpcklbw mm0,mm0 \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
+  __asm  psubw mm4,mm0 \
+  __asm  movd mm0,[_off+REF] \
+  __asm  punpcklbw mm5,mm7 \
+  __asm  neg SRC_YSTRIDE \
+  __asm  punpcklbw mm7,mm7 \
+  __asm  psubw mm5,mm7 \
+  __asm  movd mm7,[_off+SRC+SRC_YSTRIDE] \
+  __asm  punpcklbw mm6,mm0 \
+  __asm  lea REF,[REF+REF_YSTRIDE*2] \
+  __asm  punpcklbw mm0,mm0 \
+  __asm  neg REF_YSTRIDE \
+  __asm  psubw mm6,mm0 \
+  __asm  movd mm0,[_off+REF+REF_YSTRIDE] \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*8] \
+  __asm  punpcklbw mm7,mm0 \
+  __asm  neg SRC_YSTRIDE \
+  __asm  punpcklbw mm0,mm0 \
+  __asm  lea REF,[REF+REF_YSTRIDE*8] \
+  __asm  psubw mm7,mm0 \
+  __asm  neg REF_YSTRIDE \
+  __asm  movq mm0,[_off*2+BUF] \
+}
+
+/*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/
+#define OC_LOAD_8x4(_off) __asm{ \
+  __asm  movd mm0,[_off+SRC] \
+  __asm  movd mm1,[_off+SRC+YSTRIDE] \
+  __asm  movd mm2,[_off+SRC+YSTRIDE*2] \
+  __asm  pxor mm7,mm7 \
+  __asm  movd mm3,[_off+SRC+YSTRIDE3] \
+  __asm  punpcklbw mm0,mm7 \
+  __asm  movd mm4,[_off+SRC4] \
+  __asm  punpcklbw mm1,mm7 \
+  __asm  movd mm5,[_off+SRC4+YSTRIDE] \
+  __asm  punpcklbw mm2,mm7 \
+  __asm  movd mm6,[_off+SRC4+YSTRIDE*2] \
+  __asm  punpcklbw mm3,mm7 \
+  __asm  movd mm7,[_off+SRC4+YSTRIDE3] \
+  __asm  punpcklbw mm4,mm4 \
+  __asm  punpcklbw mm5,mm5 \
+  __asm  psrlw mm4,8 \
+  __asm  psrlw mm5,8 \
+  __asm  punpcklbw mm6,mm6 \
+  __asm  punpcklbw mm7,mm7 \
+  __asm  psrlw mm6,8 \
+  __asm  psrlw mm7,8 \
+}
+
+/*Performs the first two stages of an 8-point 1-D Hadamard transform.
+  The transform is performed in place, except that outputs 0-3 are swapped with
+   outputs 4-7.
+  Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to
+   perform this stage in place with no temporary registers).*/
+#define OC_HADAMARD_AB_8x4 __asm{ \
+  /*Stage A: \
+    Outputs 0-3 are swapped with 4-7 here.*/ \
+  __asm  paddw mm5,mm1 \
+  __asm  paddw mm6,mm2 \
+  __asm  paddw mm1,mm1 \
+  __asm  paddw mm2,mm2 \
+  __asm  psubw mm1,mm5 \
+  __asm  psubw mm2,mm6 \
+  __asm  paddw mm7,mm3 \
+  __asm  paddw mm4,mm0 \
+  __asm  paddw mm3,mm3 \
+  __asm  paddw mm0,mm0 \
+  __asm  psubw mm3,mm7 \
+  __asm  psubw mm0,mm4 \
+   /*Stage B:*/ \
+  __asm  paddw mm0,mm2 \
+  __asm  paddw mm1,mm3 \
+  __asm  paddw mm4,mm6 \
+  __asm  paddw mm5,mm7 \
+  __asm  paddw mm2,mm2 \
+  __asm  paddw mm3,mm3 \
+  __asm  paddw mm6,mm6 \
+  __asm  paddw mm7,mm7 \
+  __asm  psubw mm2,mm0 \
+  __asm  psubw mm3,mm1 \
+  __asm  psubw mm6,mm4 \
+  __asm  psubw mm7,mm5 \
+}
+
+/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
+  Ouputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
+   place with no temporary registers).*/
+#define OC_HADAMARD_C_8x4 __asm{ \
+  /*Stage C:*/ \
+  __asm  paddw mm0,mm1 \
+  __asm  paddw mm2,mm3 \
+  __asm  paddw mm4,mm5 \
+  __asm  paddw mm6,mm7 \
+  __asm  paddw mm1,mm1 \
+  __asm  paddw mm3,mm3 \
+  __asm  paddw mm5,mm5 \
+  __asm  paddw mm7,mm7 \
+  __asm  psubw mm1,mm0 \
+  __asm  psubw mm3,mm2 \
+  __asm  psubw mm5,mm4 \
+  __asm  psubw mm7,mm6 \
+}
+
+/*Performs an 8-point 1-D Hadamard transform.
+  The transform is performed in place, except that outputs 0-3 are swapped with
+   outputs 4-7.
+  Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform
+   in place with no temporary registers).*/
+#define OC_HADAMARD_8x4 __asm{ \
+  OC_HADAMARD_AB_8x4 \
+  OC_HADAMARD_C_8x4 \
+}
+
+/*Performs the first part of the final stage of the Hadamard transform and
+   summing of absolute values.
+  At the end of this part, mm1 will contain the DC coefficient of the
+   transform.*/
+#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) __asm{ \
+  /*We use the fact that \
+      (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
+     to merge the final butterfly with the abs and the first stage of \
+     accumulation. \
+    Thus we can avoid using pabsw, which is not available until SSSE3. \
+    Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
+     implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
+     registers). \
+    Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
+    This implementation is only 26 (+4 for spilling registers).*/ \
+  __asm  movq [_r7+BUF],mm7 \
+  __asm  movq [_r6+BUF],mm6 \
+  /*mm7={0x7FFF}x4 \
+    mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
+  __asm  pcmpeqb mm7,mm7 \
+  __asm  movq mm6,mm0 \
+  __asm  psrlw mm7,1 \
+  __asm  paddw mm6,mm1 \
+  __asm  pmaxsw mm0,mm1 \
+  __asm  paddsw mm6,mm7 \
+  __asm  psubw mm0,mm6 \
+  /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
+    mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \
+  __asm  movq mm6,mm2 \
+  __asm  movq mm1,mm4 \
+  __asm  pmaxsw mm2,mm3 \
+  __asm  pmaxsw mm4,mm5 \
+  __asm  paddw mm6,mm3 \
+  __asm  paddw mm1,mm5 \
+  __asm  movq mm3,[_r7+BUF] \
+}
+
+/*Performs the second part of the final stage of the Hadamard transform and
+   summing of absolute values.*/
+#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) __asm{ \
+  __asm  paddsw mm6,mm7 \
+  __asm  movq mm5,[_r6+BUF] \
+  __asm  paddsw mm1,mm7 \
+  __asm  psubw mm2,mm6 \
+  __asm  psubw mm4,mm1 \
+  /*mm7={1}x4 (needed for the horizontal add that follows) \
+    mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
+  __asm  movq mm6,mm3 \
+  __asm  pmaxsw mm3,mm5 \
+  __asm  paddw mm0,mm2 \
+  __asm  paddw mm6,mm5 \
+  __asm  paddw mm0,mm4 \
+  __asm  paddsw mm6,mm7 \
+  __asm  paddw mm0,mm3 \
+  __asm  psrlw mm7,14 \
+  __asm  psubw mm0,mm6 \
+}
+
+/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
+   absolute value of each component, and accumulates everything into mm0.
+  This is the only portion of SATD which requires MMXEXT (we could use plain
+   MMX, but it takes 4 instructions and an extra register to work around the
+   lack of a pmaxsw, which is a pretty serious penalty).*/
+#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
+  OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
+  OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
+}
+
+/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
+   component, and accumulates everything into mm0.
+  Note that mm0 will have an extra 4 added to each column, and that after
+   removing this value, the remainder will be half the conventional value.*/
+#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
+  OC_HADAMARD_AB_8x4 \
+  OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \
+}
+
+/*Performs two 4x4 transposes (mostly) in place.
+  On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7}
+   contains rows {a,b,c,d}.
+  On output, {0x40,0x50,0x60,0x70}+_off+BUF contains {e,f,g,h}^T, and
+   {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/
+#define OC_TRANSPOSE_4x4x2(_off) __asm{ \
+  /*First 4x4 transpose:*/ \
+  __asm  movq [0x10+_off+BUF],mm5 \
+  /*mm0 = e3 e2 e1 e0 \
+    mm1 = f3 f2 f1 f0 \
+    mm2 = g3 g2 g1 g0 \
+    mm3 = h3 h2 h1 h0*/ \
+  __asm  movq mm5,mm2 \
+  __asm  punpcklwd mm2,mm3 \
+  __asm  punpckhwd mm5,mm3 \
+  __asm  movq mm3,mm0 \
+  __asm  punpcklwd mm0,mm1 \
+  __asm  punpckhwd mm3,mm1 \
+  /*mm0 = f1 e1 f0 e0 \
+    mm3 = f3 e3 f2 e2 \
+    mm2 = h1 g1 h0 g0 \
+    mm5 = h3 g3 h2 g2*/ \
+  __asm  movq mm1,mm0 \
+  __asm  punpckldq mm0,mm2 \
+  __asm  punpckhdq mm1,mm2 \
+  __asm  movq mm2,mm3 \
+  __asm  punpckhdq mm3,mm5 \
+  __asm  movq [0x40+_off+BUF],mm0 \
+  __asm  punpckldq mm2,mm5 \
+  /*mm0 = h0 g0 f0 e0 \
+    mm1 = h1 g1 f1 e1 \
+    mm2 = h2 g2 f2 e2 \
+    mm3 = h3 g3 f3 e3*/ \
+  __asm  movq mm5,[0x10+_off+BUF] \
+  /*Second 4x4 transpose:*/ \
+  /*mm4 = a3 a2 a1 a0 \
+    mm5 = b3 b2 b1 b0 \
+    mm6 = c3 c2 c1 c0 \
+    mm7 = d3 d2 d1 d0*/ \
+  __asm  movq mm0,mm6 \
+  __asm  punpcklwd mm6,mm7 \
+  __asm  movq [0x50+_off+BUF],mm1 \
+  __asm  punpckhwd mm0,mm7 \
+  __asm  movq mm7,mm4 \
+  __asm  punpcklwd mm4,mm5 \
+  __asm  movq [0x60+_off+BUF],mm2 \
+  __asm  punpckhwd mm7,mm5 \
+  /*mm4 = b1 a1 b0 a0 \
+    mm7 = b3 a3 b2 a2 \
+    mm6 = d1 c1 d0 c0 \
+    mm0 = d3 c3 d2 c2*/ \
+  __asm  movq mm5,mm4 \
+  __asm  punpckldq mm4,mm6 \
+  __asm  movq [0x70+_off+BUF],mm3 \
+  __asm  punpckhdq mm5,mm6 \
+  __asm  movq mm6,mm7 \
+  __asm  punpckhdq mm7,mm0 \
+  __asm  punpckldq mm6,mm0 \
+  /*mm4 = d0 c0 b0 a0 \
+    mm5 = d1 c1 b1 a1 \
+    mm6 = d2 c2 b2 a2 \
+    mm7 = d3 c3 b3 a3*/ \
+}
+
+static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
+ int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){
+  OC_ALIGN8(ogg_int16_t  buf[64]);
+  ogg_int16_t           *bufp;
+  unsigned               ret1;
+  unsigned               ret2;
+  bufp=buf;
+  __asm{
+#define SRC esi
+#define REF eax
+#define SRC_YSTRIDE ecx
+#define REF_YSTRIDE edx
+#define BUF edi
+#define RET eax
+#define RET2 edx
+    mov SRC,_src
+    mov SRC_YSTRIDE,_src_ystride
+    mov REF,_ref
+    mov REF_YSTRIDE,_ref_ystride
+    mov BUF,bufp
+    OC_LOAD_SUB_8x4(0x00)
+    OC_HADAMARD_8x4
+    OC_TRANSPOSE_4x4x2(0x00)
+    /*Finish swapping out this 8x4 block to make room for the next one.
+      mm0...mm3 have been swapped out already.*/
+    movq [0x00+BUF],mm4
+    movq [0x10+BUF],mm5
+    movq [0x20+BUF],mm6
+    movq [0x30+BUF],mm7
+    OC_LOAD_SUB_8x4(0x04)
+    OC_HADAMARD_8x4
+    OC_TRANSPOSE_4x4x2(0x08)
+    /*Here the first 4x4 block of output from the last transpose is the second
+       4x4 block of input for the next transform.
+      We have cleverly arranged that it already be in the appropriate place, so
+       we only have to do half the loads.*/
+    movq mm1,[0x10+BUF]
+    movq mm2,[0x20+BUF]
+    movq mm3,[0x30+BUF]
+    movq mm0,[0x00+BUF]
+    OC_HADAMARD_ABS_ACCUM_8x4(0x28,0x38)
+    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
+       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
+       for the factor of two we dropped + 3 for the vertical accumulation).
+      Now we finally have to promote things to dwords.
+      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
+       latency of pmaddwd by starting the next series of loads now.*/
+    mov RET2,_thresh
+    pmaddwd mm0,mm7
+    movq mm1,[0x50+BUF]
+    movq mm5,[0x58+BUF]
+    movq mm4,mm0
+    movq mm2,[0x60+BUF]
+    punpckhdq mm0,mm0
+    movq mm6,[0x68+BUF]
+    paddd mm4,mm0
+    movq mm3,[0x70+BUF]
+    movd RET,mm4
+    movq mm7,[0x78+BUF]
+    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
+       added to them, and a factor of two removed; correct the final sum here.*/
+    lea RET,[RET+RET-32]
+    movq mm0,[0x40+BUF]
+    cmp RET,RET2
+    movq mm4,[0x48+BUF]
+    jae at_end
+    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
+    pmaddwd mm0,mm7
+    /*There isn't much to stick in here to hide the latency this time, but the
+       alternative to pmaddwd is movq->punpcklwd->punpckhwd->paddd, whose
+       latency is even worse.*/
+    sub RET,32
+    movq mm4,mm0
+    punpckhdq mm0,mm0
+    paddd mm4,mm0
+    movd RET2,mm4
+    lea RET,[RET+RET2*2]
+    align 16
+at_end:
+    mov ret1,RET
+#undef SRC
+#undef REF
+#undef SRC_YSTRIDE
+#undef REF_YSTRIDE
+#undef BUF
+#undef RET
+#undef RET2
+  }
+  return ret1;
+}
+
+unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh){
+  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,_ref,_ystride,_thresh);
+}
+
+
+/*Our internal implementation of frag_copy2 takes an extra stride parameter so
+   we can share code with oc_enc_frag_satd2_thresh_mmxext().*/
+static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
+  __asm{
+    /*Load the first 3 rows.*/
+#define DST_YSTRIDE edi
+#define SRC_YSTRIDE esi
+#define DST eax
+#define SRC1 edx
+#define SRC2 ecx
+    mov DST_YSTRIDE,_dst_ystride
+    mov SRC_YSTRIDE,_src_ystride
+    mov DST,_dst
+    mov SRC1,_src1
+    mov SRC2,_src2
+    movq mm0,[SRC1]
+    movq mm1,[SRC2]
+    movq mm2,[SRC1+SRC_YSTRIDE]
+    lea SRC1,[SRC1+SRC_YSTRIDE*2]
+    movq mm3,[SRC2+SRC_YSTRIDE]
+    lea SRC2,[SRC2+SRC_YSTRIDE*2]
+    pxor mm7,mm7
+    movq mm4,[SRC1]
+    pcmpeqb mm6,mm6
+    movq mm5,[SRC2]
+    /*mm7={1}x8.*/
+    psubb mm7,mm6
+    /*Start averaging mm0 and mm1 into mm6.*/
+    movq mm6,mm0
+    pxor mm0,mm1
+    pavgb mm6,mm1
+    /*mm1 is free, start averaging mm3 into mm2 using mm1.*/
+    movq mm1,mm2
+    pand mm0,mm7
+    pavgb mm2,mm3
+    pxor mm1,mm3
+    /*mm3 is free.*/
+    psubb mm6,mm0
+    /*mm0 is free, start loading the next row.*/
+    movq mm0,[SRC1+SRC_YSTRIDE]
+    /*Start averaging mm5 and mm4 using mm3.*/
+    movq mm3,mm4
+    /*mm6 [row 0] is done; write it out.*/
+    movq [DST],mm6
+    pand mm1,mm7
+    pavgb mm4,mm5
+    psubb mm2,mm1
+    /*mm1 is free, continue loading the next row.*/
+    movq mm1,[SRC2+SRC_YSTRIDE]
+    pxor mm3,mm5
+    lea SRC1,[SRC1+SRC_YSTRIDE*2]
+    /*mm2 [row 1] is done; write it out.*/
+    movq [DST+DST_YSTRIDE],mm2
+    pand mm3,mm7
+    /*Start loading the next row.*/
+    movq mm2,[SRC1]
+    lea DST,[DST+DST_YSTRIDE*2]
+    psubb mm4,mm3
+    lea SRC2,[SRC2+SRC_YSTRIDE*2]
+    /*mm4 [row 2] is done; write it out.*/
+    movq [DST],mm4
+    /*Continue loading the next row.*/
+    movq mm3,[SRC2]
+    /*Start averaging mm0 and mm1 into mm6.*/
+    movq mm6,mm0
+    pxor mm0,mm1
+    /*Start loading the next row.*/
+    movq mm4,[SRC1+SRC_YSTRIDE]
+    pavgb mm6,mm1
+    /*mm1 is free; start averaging mm3 into mm2 using mm1.*/
+    movq mm1,mm2
+    pand mm0,mm7
+    /*Continue loading the next row.*/
+    movq mm5,[SRC2+SRC_YSTRIDE]
+    pavgb mm2,mm3
+    lea SRC1,[SRC1+SRC_YSTRIDE*2]
+    pxor mm1,mm3
+    /*mm3 is free.*/
+    psubb mm6,mm0
+    /*mm0 is free, start loading the next row.*/
+    movq mm0,[SRC1]
+    /*Start averaging mm5 into mm4 using mm3.*/
+    movq mm3,mm4
+    /*mm6 [row 3] is done; write it out.*/
+    movq [DST+DST_YSTRIDE],mm6
+    pand mm1,mm7
+    lea SRC2,[SRC2+SRC_YSTRIDE*2]
+    pavgb mm4,mm5
+    lea DST,[DST+DST_YSTRIDE*2]
+    psubb mm2,mm1
+    /*mm1 is free; continue loading the next row.*/
+    movq mm1,[SRC2]
+    pxor mm3,mm5
+    /*mm2 [row 4] is done; write it out.*/
+    movq [DST],mm2
+    pand mm3,mm7
+    /*Start loading the next row.*/
+    movq mm2,[SRC1+SRC_YSTRIDE]
+    psubb mm4,mm3
+    /*Start averaging mm0 and mm1 into mm6.*/
+    movq mm6,mm0
+    /*Continue loading the next row.*/
+    movq mm3,[SRC2+SRC_YSTRIDE]
+    /*mm4 [row 5] is done; write it out.*/
+    movq [DST+DST_YSTRIDE],mm4
+    pxor mm0,mm1
+    pavgb mm6,mm1
+    /*mm4 is free; start averaging mm3 into mm2 using mm4.*/
+    movq mm4,mm2
+    pand mm0,mm7
+    pavgb mm2,mm3
+    pxor mm4,mm3
+    lea DST,[DST+DST_YSTRIDE*2]
+    psubb mm6,mm0
+    pand mm4,mm7
+    /*mm6 [row 6] is done, write it out.*/
+    movq [DST],mm6
+    psubb mm2,mm4
+    /*mm2 [row 7] is done, write it out.*/
+    movq [DST+DST_YSTRIDE],mm2
+#undef SRC1
+#undef SRC2
+#undef SRC_YSTRIDE
+#undef DST_YSTRIDE
+#undef DST
+  }
+}
+
+unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh){
+  OC_ALIGN8(unsigned char ref[64]);
+  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
+  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,ref,8,_thresh);
+}
+
+unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
+ int _ystride){
+  OC_ALIGN8(ogg_int16_t  buf[64]);
+  ogg_int16_t           *bufp;
+  unsigned               ret1;
+  unsigned               ret2;
+  bufp=buf;
+  __asm{
+#define SRC eax
+#define SRC4 esi
+#define BUF edi
+#define RET eax
+#define RET_WORD ax
+#define RET2 ecx
+#define YSTRIDE edx
+#define YSTRIDE3 ecx
+    mov SRC,_src
+    mov BUF,bufp
+    mov YSTRIDE,_ystride
+    /* src4 = src+4*ystride */
+    lea SRC4,[SRC+YSTRIDE*4]
+    /* ystride3 = 3*ystride */
+    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
+    OC_LOAD_8x4(0x00)
+    OC_HADAMARD_8x4
+    OC_TRANSPOSE_4x4x2(0x00)
+    /*Finish swapping out this 8x4 block to make room for the next one.
+      mm0...mm3 have been swapped out already.*/
+    movq [0x00+BUF],mm4
+    movq [0x10+BUF],mm5
+    movq [0x20+BUF],mm6
+    movq [0x30+BUF],mm7
+    OC_LOAD_8x4(0x04)
+    OC_HADAMARD_8x4
+    OC_TRANSPOSE_4x4x2(0x08)
+    /*Here the first 4x4 block of output from the last transpose is the second
+      4x4 block of input for the next transform.
+      We have cleverly arranged that it already be in the appropriate place, so
+      we only have to do half the loads.*/
+    movq mm1,[0x10+BUF]
+    movq mm2,[0x20+BUF]
+    movq mm3,[0x30+BUF]
+    movq mm0,[0x00+BUF]
+    /*We split out the stages here so we can save the DC coefficient in the
+      middle.*/
+    OC_HADAMARD_AB_8x4
+    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
+    movd RET,mm1
+    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
+    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
+      difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
+      for the factor of two we dropped + 3 for the vertical accumulation).
+      Now we finally have to promote things to dwords.
+      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
+      latency of pmaddwd by starting the next series of loads now.*/
+    pmaddwd mm0,mm7
+    movq mm1,[0x50+BUF]
+    movq mm5,[0x58+BUF]
+    movq mm2,[0x60+BUF]
+    movq mm4,mm0
+    movq mm6,[0x68+BUF]
+    punpckhdq mm0,mm0
+    movq mm3,[0x70+BUF]
+    paddd mm4,mm0
+    movq mm7,[0x78+BUF]
+    movd RET2,mm4
+    movq mm0,[0x40+BUF]
+    movq mm4,[0x48+BUF]
+    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
+    pmaddwd mm0,mm7
+    /*We assume that the DC coefficient is always positive (which is true,
+    because the input to the INTRA transform was not a difference).*/
+    movzx RET,RET_WORD
+    add RET2,RET2
+    sub RET2,RET
+    movq mm4,mm0
+    punpckhdq mm0,mm0
+    paddd mm4,mm0
+    movd RET,mm4
+    lea RET,[-64+RET2+RET*2]
+    mov [ret1],RET
+#undef SRC
+#undef SRC4
+#undef BUF
+#undef RET
+#undef RET_WORD
+#undef RET2
+#undef YSTRIDE
+#undef YSTRIDE3
+  }
+  return ret1;
+}
+
+void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
+ const unsigned char *_src, const unsigned char *_ref,int _ystride){
+  int i;
+  __asm  pxor mm7,mm7
+  for(i=4;i-->0;){
+    __asm{
+#define SRC edx
+#define YSTRIDE esi
+#define RESIDUE eax
+#define REF ecx
+      mov YSTRIDE,_ystride
+      mov RESIDUE,_residue
+      mov SRC,_src
+      mov REF,_ref
+      /*mm0=[src]*/
+      movq mm0,[SRC]
+      /*mm1=[ref]*/
+      movq mm1,[REF]
+      /*mm4=[src+ystride]*/
+      movq mm4,[SRC+YSTRIDE]
+      /*mm5=[ref+ystride]*/
+      movq mm5,[REF+YSTRIDE]
+      /*Compute [src]-[ref].*/
+      movq mm2,mm0
+      punpcklbw mm0,mm7
+      movq mm3,mm1
+      punpckhbw mm2,mm7
+      punpcklbw mm1,mm7
+      punpckhbw mm3,mm7
+      psubw mm0,mm1
+      psubw mm2,mm3
+      /*Compute [src+ystride]-[ref+ystride].*/
+      movq mm1,mm4
+      punpcklbw mm4,mm7
+      movq mm3,mm5
+      punpckhbw mm1,mm7
+      lea SRC,[SRC+YSTRIDE*2]
+      punpcklbw mm5,mm7
+      lea REF,[REF+YSTRIDE*2]
+      punpckhbw mm3,mm7
+      psubw mm4,mm5
+      psubw mm1,mm3
+      /*Write the answer out.*/
+      movq [RESIDUE+0x00],mm0
+      movq [RESIDUE+0x08],mm2
+      movq [RESIDUE+0x10],mm4
+      movq [RESIDUE+0x18],mm1
+      lea RESIDUE,[RESIDUE+0x20]
+      mov _residue,RESIDUE
+      mov _src,SRC
+      mov _ref,REF
+#undef SRC
+#undef YSTRIDE
+#undef RESIDUE
+#undef REF
+    }
+  }
+}
+
+void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
+ const unsigned char *_src,int _ystride){
+   __asm{
+#define YSTRIDE edx
+#define YSTRIDE3 edi
+#define RESIDUE ecx
+#define SRC eax
+    mov YSTRIDE,_ystride
+    mov RESIDUE,_residue
+    mov SRC,_src
+    /*mm0=[src]*/
+    movq mm0,[SRC]
+    /*mm1=[src+ystride]*/
+    movq mm1,[SRC+YSTRIDE]
+    /*mm6={-1}x4*/
+    pcmpeqw mm6,mm6
+    /*mm2=[src+2*ystride]*/
+    movq mm2,[SRC+YSTRIDE*2]
+    /*[ystride3]=3*[ystride]*/
+    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
+    /*mm6={1}x4*/
+    psllw mm6,15
+    /*mm3=[src+3*ystride]*/
+    movq mm3,[SRC+YSTRIDE3]
+    /*mm6={128}x4*/
+    psrlw mm6,8
+    /*mm7=0*/ 
+    pxor mm7,mm7
+    /*[src]=[src]+4*[ystride]*/
+    lea SRC,[SRC+YSTRIDE*4]
+    /*Compute [src]-128 and [src+ystride]-128*/
+    movq mm4,mm0
+    punpcklbw mm0,mm7
+    movq mm5,mm1
+    punpckhbw mm4,mm7
+    psubw mm0,mm6
+    punpcklbw mm1,mm7
+    psubw mm4,mm6
+    punpckhbw mm5,mm7
+    psubw mm1,mm6
+    psubw mm5,mm6
+    /*Write the answer out.*/
+    movq [RESIDUE+0x00],mm0
+    movq [RESIDUE+0x08],mm4
+    movq [RESIDUE+0x10],mm1
+    movq [RESIDUE+0x18],mm5
+    /*mm0=[src+4*ystride]*/
+    movq mm0,[SRC]
+    /*mm1=[src+5*ystride]*/
+    movq mm1,[SRC+YSTRIDE]
+    /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/
+    movq mm4,mm2
+    punpcklbw mm2,mm7
+    movq mm5,mm3
+    punpckhbw mm4,mm7
+    psubw mm2,mm6
+    punpcklbw mm3,mm7
+    psubw mm4,mm6
+    punpckhbw mm5,mm7
+    psubw mm3,mm6
+    psubw mm5,mm6
+    /*Write the answer out.*/
+    movq [RESIDUE+0x20],mm2
+    movq [RESIDUE+0x28],mm4
+    movq [RESIDUE+0x30],mm3
+    movq [RESIDUE+0x38],mm5
+    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
+    movq mm2,[SRC+YSTRIDE*2]
+    movq mm3,[SRC+YSTRIDE3]
+    movq mm4,mm0
+    punpcklbw mm0,mm7
+    movq mm5,mm1
+    punpckhbw mm4,mm7
+    psubw mm0,mm6
+    punpcklbw mm1,mm7
+    psubw mm4,mm6
+    punpckhbw mm5,mm7
+    psubw mm1,mm6
+    psubw mm5,mm6
+    /*Write the answer out.*/
+    movq [RESIDUE+0x40],mm0
+    movq [RESIDUE+0x48],mm4
+    movq [RESIDUE+0x50],mm1
+    movq [RESIDUE+0x58],mm5
+    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
+    movq mm4,mm2
+    punpcklbw mm2,mm7
+    movq mm5,mm3
+    punpckhbw mm4,mm7
+    psubw mm2,mm6
+    punpcklbw mm3,mm7
+    psubw mm4,mm6
+    punpckhbw mm5,mm7
+    psubw mm3,mm6
+    psubw mm5,mm6
+    /*Write the answer out.*/
+    movq [RESIDUE+0x60],mm2
+    movq [RESIDUE+0x68],mm4
+    movq [RESIDUE+0x70],mm3
+    movq [RESIDUE+0x78],mm5
+#undef YSTRIDE
+#undef YSTRIDE3
+#undef RESIDUE
+#undef SRC
+  }
+}
+
+void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride){
+  oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride);
+}
+
+#endif