thirdparty/libtheora/x86_vc/mmxencfrag.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969

/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
 *                                                                  *
 ********************************************************************

  function:
  last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $

 ********************************************************************/
#include <stddef.h>
#include "x86enc.h"

#if defined(OC_X86_ASM)

unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  ptrdiff_t ret;
  __asm{
#define SRC esi
#define REF edx
#define YSTRIDE ecx
#define YSTRIDE3 edi
    mov YSTRIDE,_ystride
    mov SRC,_src
    mov REF,_ref
    /*Load the first 4 rows of each block.*/
    movq mm0,[SRC]
    movq mm1,[REF]
    movq mm2,[SRC][YSTRIDE]
    movq mm3,[REF][YSTRIDE]
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    movq mm4,[SRC+YSTRIDE*2]
    movq mm5,[REF+YSTRIDE*2]
    movq mm6,[SRC+YSTRIDE3]
    movq mm7,[REF+YSTRIDE3]
    /*Compute their SADs and add them in mm0*/
    psadbw mm0,mm1
    psadbw mm2,mm3
    lea SRC,[SRC+YSTRIDE*4]
    paddw mm0,mm2
    lea REF,[REF+YSTRIDE*4]
    /*Load the next 3 rows as registers become available.*/
    movq mm2,[SRC]
    movq mm3,[REF]
    psadbw mm4,mm5
    psadbw mm6,mm7
    paddw mm0,mm4
    movq mm5,[REF+YSTRIDE]
    movq mm4,[SRC+YSTRIDE]
    paddw mm0,mm6
    movq mm7,[REF+YSTRIDE*2]
    movq mm6,[SRC+YSTRIDE*2]
    /*Start adding their SADs to mm0*/
    psadbw mm2,mm3
    psadbw mm4,mm5
    paddw mm0,mm2
    psadbw mm6,mm7
    /*Load last row as registers become available.*/
    movq mm2,[SRC+YSTRIDE3]
    movq mm3,[REF+YSTRIDE3]
    /*And finish adding up their SADs.*/
    paddw mm0,mm4
    psadbw mm2,mm3
    paddw mm0,mm6
    paddw mm0,mm2
    movd [ret],mm0
#undef SRC
#undef REF
#undef YSTRIDE
#undef YSTRIDE3
  }
  return (unsigned)ret;
}

unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,unsigned _thresh){
  /*Early termination is for suckers.*/
  return oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
}

#define OC_SAD2_LOOP __asm{ \
  /*We want to compute (mm0+mm1>>1) on unsigned bytes without overflow, but \
     pavgb computes (mm0+mm1+1>>1). \
   The latter is exactly 1 too large when the low bit of two corresponding \
    bytes is only set in one of them. \
   Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
    correct the output of pavgb.*/ \
  __asm  movq mm6,mm0 \
  __asm  lea REF1,[REF1+YSTRIDE*2] \
  __asm  pxor mm0,mm1 \
  __asm  pavgb mm6,mm1 \
  __asm  lea REF2,[REF2+YSTRIDE*2] \
  __asm  movq mm1,mm2 \
  __asm  pand mm0,mm7 \
  __asm  pavgb mm2,mm3 \
  __asm  pxor mm1,mm3 \
  __asm  movq mm3,[REF2+YSTRIDE] \
  __asm  psubb mm6,mm0 \
  __asm  movq mm0,[REF1] \
  __asm  pand mm1,mm7 \
  __asm  psadbw mm4,mm6 \
  __asm  movd mm6,RET \
  __asm  psubb mm2,mm1 \
  __asm  movq mm1,[REF2] \
  __asm  lea SRC,[SRC+YSTRIDE*2] \
  __asm  psadbw mm5,mm2 \
  __asm  movq mm2,[REF1+YSTRIDE] \
  __asm  paddw mm5,mm4 \
  __asm  movq mm4,[SRC] \
  __asm  paddw mm6,mm5 \
  __asm  movq mm5,[SRC+YSTRIDE] \
  __asm  movd RET,mm6 \
}

/*Same as above, but does not pre-load the next two rows.*/
#define OC_SAD2_TAIL __asm{ \
  __asm  movq mm6,mm0 \
  __asm  pavgb mm0,mm1 \
  __asm  pxor mm6,mm1 \
  __asm  movq mm1,mm2 \
  __asm  pand mm6,mm7 \
  __asm  pavgb mm2,mm3 \
  __asm  pxor mm1,mm3 \
  __asm  psubb mm0,mm6 \
  __asm  pand mm1,mm7 \
  __asm  psadbw mm4,mm0 \
  __asm  psubb mm2,mm1 \
  __asm  movd mm6,RET \
  __asm  psadbw mm5,mm2 \
  __asm  paddw mm5,mm4 \
  __asm  paddw mm6,mm5 \
  __asm  movd RET,mm6 \
}

unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
 unsigned _thresh){
  ptrdiff_t ret;
  __asm{
#define REF1 ecx
#define REF2 edi
#define YSTRIDE esi
#define SRC edx
#define RET eax
    mov YSTRIDE,_ystride
    mov SRC,_src
    mov REF1,_ref1
    mov REF2,_ref2
    movq mm0,[REF1]
    movq mm1,[REF2]
    movq mm2,[REF1+YSTRIDE]
    movq mm3,[REF2+YSTRIDE]
    xor RET,RET
    movq mm4,[SRC]
    pxor mm7,mm7
    pcmpeqb mm6,mm6
    movq mm5,[SRC+YSTRIDE]
    psubb mm7,mm6
    OC_SAD2_LOOP
    OC_SAD2_LOOP
    OC_SAD2_LOOP
    OC_SAD2_TAIL
    mov [ret],RET
#undef REF1
#undef REF2
#undef YSTRIDE
#undef SRC
#undef RET
  }
  return (unsigned)ret;
}

/*Load an 8x4 array of pixel values from %[src] and %[ref] and compute their
  16-bit difference in mm0...mm7.*/
#define OC_LOAD_SUB_8x4(_off) __asm{ \
  __asm  movd mm0,[_off+SRC] \
  __asm  movd mm4,[_off+REF] \
  __asm  movd mm1,[_off+SRC+SRC_YSTRIDE] \
  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
  __asm  movd mm5,[_off+REF+REF_YSTRIDE] \
  __asm  lea REF,[REF+REF_YSTRIDE*2] \
  __asm  movd mm2,[_off+SRC] \
  __asm  movd mm7,[_off+REF] \
  __asm  movd mm3,[_off+SRC+SRC_YSTRIDE] \
  __asm  movd mm6,[_off+REF+REF_YSTRIDE] \
  __asm  punpcklbw mm0,mm4 \
  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
  __asm  punpcklbw mm4,mm4 \
  __asm  lea REF,[REF+REF_YSTRIDE*2] \
  __asm  psubw mm0,mm4 \
  __asm  movd mm4,[_off+SRC] \
  __asm  movq [_off*2+BUF],mm0 \
  __asm  movd mm0,[_off+REF] \
  __asm  punpcklbw mm1,mm5 \
  __asm  punpcklbw mm5,mm5 \
  __asm  psubw mm1,mm5 \
  __asm  movd mm5,[_off+SRC+SRC_YSTRIDE] \
  __asm  punpcklbw mm2,mm7 \
  __asm  punpcklbw mm7,mm7 \
  __asm  psubw mm2,mm7 \
  __asm  movd mm7,[_off+REF+REF_YSTRIDE] \
  __asm  punpcklbw mm3,mm6 \
  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
  __asm  punpcklbw mm6,mm6 \
  __asm  psubw mm3,mm6 \
  __asm  movd mm6,[_off+SRC] \
  __asm  punpcklbw mm4,mm0 \
  __asm  lea REF,[REF+REF_YSTRIDE*2] \
  __asm  punpcklbw mm0,mm0 \
  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
  __asm  psubw mm4,mm0 \
  __asm  movd mm0,[_off+REF] \
  __asm  punpcklbw mm5,mm7 \
  __asm  neg SRC_YSTRIDE \
  __asm  punpcklbw mm7,mm7 \
  __asm  psubw mm5,mm7 \
  __asm  movd mm7,[_off+SRC+SRC_YSTRIDE] \
  __asm  punpcklbw mm6,mm0 \
  __asm  lea REF,[REF+REF_YSTRIDE*2] \
  __asm  punpcklbw mm0,mm0 \
  __asm  neg REF_YSTRIDE \
  __asm  psubw mm6,mm0 \
  __asm  movd mm0,[_off+REF+REF_YSTRIDE] \
  __asm  lea SRC,[SRC+SRC_YSTRIDE*8] \
  __asm  punpcklbw mm7,mm0 \
  __asm  neg SRC_YSTRIDE \
  __asm  punpcklbw mm0,mm0 \
  __asm  lea REF,[REF+REF_YSTRIDE*8] \
  __asm  psubw mm7,mm0 \
  __asm  neg REF_YSTRIDE \
  __asm  movq mm0,[_off*2+BUF] \
}

/*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/
#define OC_LOAD_8x4(_off) __asm{ \
  __asm  movd mm0,[_off+SRC] \
  __asm  movd mm1,[_off+SRC+YSTRIDE] \
  __asm  movd mm2,[_off+SRC+YSTRIDE*2] \
  __asm  pxor mm7,mm7 \
  __asm  movd mm3,[_off+SRC+YSTRIDE3] \
  __asm  punpcklbw mm0,mm7 \
  __asm  movd mm4,[_off+SRC4] \
  __asm  punpcklbw mm1,mm7 \
  __asm  movd mm5,[_off+SRC4+YSTRIDE] \
  __asm  punpcklbw mm2,mm7 \
  __asm  movd mm6,[_off+SRC4+YSTRIDE*2] \
  __asm  punpcklbw mm3,mm7 \
  __asm  movd mm7,[_off+SRC4+YSTRIDE3] \
  __asm  punpcklbw mm4,mm4 \
  __asm  punpcklbw mm5,mm5 \
  __asm  psrlw mm4,8 \
  __asm  psrlw mm5,8 \
  __asm  punpcklbw mm6,mm6 \
  __asm  punpcklbw mm7,mm7 \
  __asm  psrlw mm6,8 \
  __asm  psrlw mm7,8 \
}

/*Performs the first two stages of an 8-point 1-D Hadamard transform.
  The transform is performed in place, except that outputs 0-3 are swapped with
   outputs 4-7.
  Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to
   perform this stage in place with no temporary registers).*/
#define OC_HADAMARD_AB_8x4 __asm{ \
  /*Stage A: \
    Outputs 0-3 are swapped with 4-7 here.*/ \
  __asm  paddw mm5,mm1 \
  __asm  paddw mm6,mm2 \
  __asm  paddw mm1,mm1 \
  __asm  paddw mm2,mm2 \
  __asm  psubw mm1,mm5 \
  __asm  psubw mm2,mm6 \
  __asm  paddw mm7,mm3 \
  __asm  paddw mm4,mm0 \
  __asm  paddw mm3,mm3 \
  __asm  paddw mm0,mm0 \
  __asm  psubw mm3,mm7 \
  __asm  psubw mm0,mm4 \
   /*Stage B:*/ \
  __asm  paddw mm0,mm2 \
  __asm  paddw mm1,mm3 \
  __asm  paddw mm4,mm6 \
  __asm  paddw mm5,mm7 \
  __asm  paddw mm2,mm2 \
  __asm  paddw mm3,mm3 \
  __asm  paddw mm6,mm6 \
  __asm  paddw mm7,mm7 \
  __asm  psubw mm2,mm0 \
  __asm  psubw mm3,mm1 \
  __asm  psubw mm6,mm4 \
  __asm  psubw mm7,mm5 \
}

/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
  Ouputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
   place with no temporary registers).*/
#define OC_HADAMARD_C_8x4 __asm{ \
  /*Stage C:*/ \
  __asm  paddw mm0,mm1 \
  __asm  paddw mm2,mm3 \
  __asm  paddw mm4,mm5 \
  __asm  paddw mm6,mm7 \
  __asm  paddw mm1,mm1 \
  __asm  paddw mm3,mm3 \
  __asm  paddw mm5,mm5 \
  __asm  paddw mm7,mm7 \
  __asm  psubw mm1,mm0 \
  __asm  psubw mm3,mm2 \
  __asm  psubw mm5,mm4 \
  __asm  psubw mm7,mm6 \
}

/*Performs an 8-point 1-D Hadamard transform.
  The transform is performed in place, except that outputs 0-3 are swapped with
   outputs 4-7.
  Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform
   in place with no temporary registers).*/
#define OC_HADAMARD_8x4 __asm{ \
  OC_HADAMARD_AB_8x4 \
  OC_HADAMARD_C_8x4 \
}

/*Performs the first part of the final stage of the Hadamard transform and
   summing of absolute values.
  At the end of this part, mm1 will contain the DC coefficient of the
   transform.*/
#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) __asm{ \
  /*We use the fact that \
      (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
     to merge the final butterfly with the abs and the first stage of \
     accumulation. \
    Thus we can avoid using pabsw, which is not available until SSSE3. \
    Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
     implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
     registers). \
    Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
    This implementation is only 26 (+4 for spilling registers).*/ \
  __asm  movq [_r7+BUF],mm7 \
  __asm  movq [_r6+BUF],mm6 \
  /*mm7={0x7FFF}x4 \
    mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
  __asm  pcmpeqb mm7,mm7 \
  __asm  movq mm6,mm0 \
  __asm  psrlw mm7,1 \
  __asm  paddw mm6,mm1 \
  __asm  pmaxsw mm0,mm1 \
  __asm  paddsw mm6,mm7 \
  __asm  psubw mm0,mm6 \
  /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
    mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \
  __asm  movq mm6,mm2 \
  __asm  movq mm1,mm4 \
  __asm  pmaxsw mm2,mm3 \
  __asm  pmaxsw mm4,mm5 \
  __asm  paddw mm6,mm3 \
  __asm  paddw mm1,mm5 \
  __asm  movq mm3,[_r7+BUF] \
}

/*Performs the second part of the final stage of the Hadamard transform and
   summing of absolute values.*/
#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) __asm{ \
  __asm  paddsw mm6,mm7 \
  __asm  movq mm5,[_r6+BUF] \
  __asm  paddsw mm1,mm7 \
  __asm  psubw mm2,mm6 \
  __asm  psubw mm4,mm1 \
  /*mm7={1}x4 (needed for the horizontal add that follows) \
    mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
  __asm  movq mm6,mm3 \
  __asm  pmaxsw mm3,mm5 \
  __asm  paddw mm0,mm2 \
  __asm  paddw mm6,mm5 \
  __asm  paddw mm0,mm4 \
  __asm  paddsw mm6,mm7 \
  __asm  paddw mm0,mm3 \
  __asm  psrlw mm7,14 \
  __asm  psubw mm0,mm6 \
}

/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
   absolute value of each component, and accumulates everything into mm0.
  This is the only portion of SATD which requires MMXEXT (we could use plain
   MMX, but it takes 4 instructions and an extra register to work around the
   lack of a pmaxsw, which is a pretty serious penalty).*/
#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
  OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
  OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
}

/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
   component, and accumulates everything into mm0.
  Note that mm0 will have an extra 4 added to each column, and that after
   removing this value, the remainder will be half the conventional value.*/
#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
  OC_HADAMARD_AB_8x4 \
  OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \
}

/*Performs two 4x4 transposes (mostly) in place.
  On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7}
   contains rows {a,b,c,d}.
  On output, {0x40,0x50,0x60,0x70}+_off+BUF contains {e,f,g,h}^T, and
   {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/
#define OC_TRANSPOSE_4x4x2(_off) __asm{ \
  /*First 4x4 transpose:*/ \
  __asm  movq [0x10+_off+BUF],mm5 \
  /*mm0 = e3 e2 e1 e0 \
    mm1 = f3 f2 f1 f0 \
    mm2 = g3 g2 g1 g0 \
    mm3 = h3 h2 h1 h0*/ \
  __asm  movq mm5,mm2 \
  __asm  punpcklwd mm2,mm3 \
  __asm  punpckhwd mm5,mm3 \
  __asm  movq mm3,mm0 \
  __asm  punpcklwd mm0,mm1 \
  __asm  punpckhwd mm3,mm1 \
  /*mm0 = f1 e1 f0 e0 \
    mm3 = f3 e3 f2 e2 \
    mm2 = h1 g1 h0 g0 \
    mm5 = h3 g3 h2 g2*/ \
  __asm  movq mm1,mm0 \
  __asm  punpckldq mm0,mm2 \
  __asm  punpckhdq mm1,mm2 \
  __asm  movq mm2,mm3 \
  __asm  punpckhdq mm3,mm5 \
  __asm  movq [0x40+_off+BUF],mm0 \
  __asm  punpckldq mm2,mm5 \
  /*mm0 = h0 g0 f0 e0 \
    mm1 = h1 g1 f1 e1 \
    mm2 = h2 g2 f2 e2 \
    mm3 = h3 g3 f3 e3*/ \
  __asm  movq mm5,[0x10+_off+BUF] \
  /*Second 4x4 transpose:*/ \
  /*mm4 = a3 a2 a1 a0 \
    mm5 = b3 b2 b1 b0 \
    mm6 = c3 c2 c1 c0 \
    mm7 = d3 d2 d1 d0*/ \
  __asm  movq mm0,mm6 \
  __asm  punpcklwd mm6,mm7 \
  __asm  movq [0x50+_off+BUF],mm1 \
  __asm  punpckhwd mm0,mm7 \
  __asm  movq mm7,mm4 \
  __asm  punpcklwd mm4,mm5 \
  __asm  movq [0x60+_off+BUF],mm2 \
  __asm  punpckhwd mm7,mm5 \
  /*mm4 = b1 a1 b0 a0 \
    mm7 = b3 a3 b2 a2 \
    mm6 = d1 c1 d0 c0 \
    mm0 = d3 c3 d2 c2*/ \
  __asm  movq mm5,mm4 \
  __asm  punpckldq mm4,mm6 \
  __asm  movq [0x70+_off+BUF],mm3 \
  __asm  punpckhdq mm5,mm6 \
  __asm  movq mm6,mm7 \
  __asm  punpckhdq mm7,mm0 \
  __asm  punpckldq mm6,mm0 \
  /*mm4 = d0 c0 b0 a0 \
    mm5 = d1 c1 b1 a1 \
    mm6 = d2 c2 b2 a2 \
    mm7 = d3 c3 b3 a3*/ \
}

static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
 int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){
  OC_ALIGN8(ogg_int16_t  buf[64]);
  ogg_int16_t           *bufp;
  unsigned               ret1;
  unsigned               ret2;
  bufp=buf;
  __asm{
#define SRC esi
#define REF eax
#define SRC_YSTRIDE ecx
#define REF_YSTRIDE edx
#define BUF edi
#define RET eax
#define RET2 edx
    mov SRC,_src
    mov SRC_YSTRIDE,_src_ystride
    mov REF,_ref
    mov REF_YSTRIDE,_ref_ystride
    mov BUF,bufp
    OC_LOAD_SUB_8x4(0x00)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x00)
    /*Finish swapping out this 8x4 block to make room for the next one.
      mm0...mm3 have been swapped out already.*/
    movq [0x00+BUF],mm4
    movq [0x10+BUF],mm5
    movq [0x20+BUF],mm6
    movq [0x30+BUF],mm7
    OC_LOAD_SUB_8x4(0x04)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x08)
    /*Here the first 4x4 block of output from the last transpose is the second
       4x4 block of input for the next transform.
      We have cleverly arranged that it already be in the appropriate place, so
       we only have to do half the loads.*/
    movq mm1,[0x10+BUF]
    movq mm2,[0x20+BUF]
    movq mm3,[0x30+BUF]
    movq mm0,[0x00+BUF]
    OC_HADAMARD_ABS_ACCUM_8x4(0x28,0x38)
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
       for the factor of two we dropped + 3 for the vertical accumulation).
      Now we finally have to promote things to dwords.
      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
       latency of pmaddwd by starting the next series of loads now.*/
    mov RET2,_thresh
    pmaddwd mm0,mm7
    movq mm1,[0x50+BUF]
    movq mm5,[0x58+BUF]
    movq mm4,mm0
    movq mm2,[0x60+BUF]
    punpckhdq mm0,mm0
    movq mm6,[0x68+BUF]
    paddd mm4,mm0
    movq mm3,[0x70+BUF]
    movd RET,mm4
    movq mm7,[0x78+BUF]
    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
       added to them, and a factor of two removed; correct the final sum here.*/
    lea RET,[RET+RET-32]
    movq mm0,[0x40+BUF]
    cmp RET,RET2
    movq mm4,[0x48+BUF]
    jae at_end
    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
    pmaddwd mm0,mm7
    /*There isn't much to stick in here to hide the latency this time, but the
       alternative to pmaddwd is movq->punpcklwd->punpckhwd->paddd, whose
       latency is even worse.*/
    sub RET,32
    movq mm4,mm0
    punpckhdq mm0,mm0
    paddd mm4,mm0
    movd RET2,mm4
    lea RET,[RET+RET2*2]
    align 16
at_end:
    mov ret1,RET
#undef SRC
#undef REF
#undef SRC_YSTRIDE
#undef REF_YSTRIDE
#undef BUF
#undef RET
#undef RET2
  }
  return ret1;
}

unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,unsigned _thresh){
  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,_ref,_ystride,_thresh);
}


/*Our internal implementation of frag_copy2 takes an extra stride parameter so
   we can share code with oc_enc_frag_satd2_thresh_mmxext().*/
static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
 const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
  __asm{
    /*Load the first 3 rows.*/
#define DST_YSTRIDE edi
#define SRC_YSTRIDE esi
#define DST eax
#define SRC1 edx
#define SRC2 ecx
    mov DST_YSTRIDE,_dst_ystride
    mov SRC_YSTRIDE,_src_ystride
    mov DST,_dst
    mov SRC1,_src1
    mov SRC2,_src2
    movq mm0,[SRC1]
    movq mm1,[SRC2]
    movq mm2,[SRC1+SRC_YSTRIDE]
    lea SRC1,[SRC1+SRC_YSTRIDE*2]
    movq mm3,[SRC2+SRC_YSTRIDE]
    lea SRC2,[SRC2+SRC_YSTRIDE*2]
    pxor mm7,mm7
    movq mm4,[SRC1]
    pcmpeqb mm6,mm6
    movq mm5,[SRC2]
    /*mm7={1}x8.*/
    psubb mm7,mm6
    /*Start averaging mm0 and mm1 into mm6.*/
    movq mm6,mm0
    pxor mm0,mm1
    pavgb mm6,mm1
    /*mm1 is free, start averaging mm3 into mm2 using mm1.*/
    movq mm1,mm2
    pand mm0,mm7
    pavgb mm2,mm3
    pxor mm1,mm3
    /*mm3 is free.*/
    psubb mm6,mm0
    /*mm0 is free, start loading the next row.*/
    movq mm0,[SRC1+SRC_YSTRIDE]
    /*Start averaging mm5 and mm4 using mm3.*/
    movq mm3,mm4
    /*mm6 [row 0] is done; write it out.*/
    movq [DST],mm6
    pand mm1,mm7
    pavgb mm4,mm5
    psubb mm2,mm1
    /*mm1 is free, continue loading the next row.*/
    movq mm1,[SRC2+SRC_YSTRIDE]
    pxor mm3,mm5
    lea SRC1,[SRC1+SRC_YSTRIDE*2]
    /*mm2 [row 1] is done; write it out.*/
    movq [DST+DST_YSTRIDE],mm2
    pand mm3,mm7
    /*Start loading the next row.*/
    movq mm2,[SRC1]
    lea DST,[DST+DST_YSTRIDE*2]
    psubb mm4,mm3
    lea SRC2,[SRC2+SRC_YSTRIDE*2]
    /*mm4 [row 2] is done; write it out.*/
    movq [DST],mm4
    /*Continue loading the next row.*/
    movq mm3,[SRC2]
    /*Start averaging mm0 and mm1 into mm6.*/
    movq mm6,mm0
    pxor mm0,mm1
    /*Start loading the next row.*/
    movq mm4,[SRC1+SRC_YSTRIDE]
    pavgb mm6,mm1
    /*mm1 is free; start averaging mm3 into mm2 using mm1.*/
    movq mm1,mm2
    pand mm0,mm7
    /*Continue loading the next row.*/
    movq mm5,[SRC2+SRC_YSTRIDE]
    pavgb mm2,mm3
    lea SRC1,[SRC1+SRC_YSTRIDE*2]
    pxor mm1,mm3
    /*mm3 is free.*/
    psubb mm6,mm0
    /*mm0 is free, start loading the next row.*/
    movq mm0,[SRC1]
    /*Start averaging mm5 into mm4 using mm3.*/
    movq mm3,mm4
    /*mm6 [row 3] is done; write it out.*/
    movq [DST+DST_YSTRIDE],mm6
    pand mm1,mm7
    lea SRC2,[SRC2+SRC_YSTRIDE*2]
    pavgb mm4,mm5
    lea DST,[DST+DST_YSTRIDE*2]
    psubb mm2,mm1
    /*mm1 is free; continue loading the next row.*/
    movq mm1,[SRC2]
    pxor mm3,mm5
    /*mm2 [row 4] is done; write it out.*/
    movq [DST],mm2
    pand mm3,mm7
    /*Start loading the next row.*/
    movq mm2,[SRC1+SRC_YSTRIDE]
    psubb mm4,mm3
    /*Start averaging mm0 and mm1 into mm6.*/
    movq mm6,mm0
    /*Continue loading the next row.*/
    movq mm3,[SRC2+SRC_YSTRIDE]
    /*mm4 [row 5] is done; write it out.*/
    movq [DST+DST_YSTRIDE],mm4
    pxor mm0,mm1
    pavgb mm6,mm1
    /*mm4 is free; start averaging mm3 into mm2 using mm4.*/
    movq mm4,mm2
    pand mm0,mm7
    pavgb mm2,mm3
    pxor mm4,mm3
    lea DST,[DST+DST_YSTRIDE*2]
    psubb mm6,mm0
    pand mm4,mm7
    /*mm6 [row 6] is done, write it out.*/
    movq [DST],mm6
    psubb mm2,mm4
    /*mm2 [row 7] is done, write it out.*/
    movq [DST+DST_YSTRIDE],mm2
#undef SRC1
#undef SRC2
#undef SRC_YSTRIDE
#undef DST_YSTRIDE
#undef DST
  }
}

unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
 unsigned _thresh){
  OC_ALIGN8(unsigned char ref[64]);
  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,ref,8,_thresh);
}

unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
 int _ystride){
  OC_ALIGN8(ogg_int16_t  buf[64]);
  ogg_int16_t           *bufp;
  unsigned               ret1;
  unsigned               ret2;
  bufp=buf;
  __asm{
#define SRC eax
#define SRC4 esi
#define BUF edi
#define RET eax
#define RET_WORD ax
#define RET2 ecx
#define YSTRIDE edx
#define YSTRIDE3 ecx
    mov SRC,_src
    mov BUF,bufp
    mov YSTRIDE,_ystride
    /* src4 = src+4*ystride */
    lea SRC4,[SRC+YSTRIDE*4]
    /* ystride3 = 3*ystride */
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    OC_LOAD_8x4(0x00)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x00)
    /*Finish swapping out this 8x4 block to make room for the next one.
      mm0...mm3 have been swapped out already.*/
    movq [0x00+BUF],mm4
    movq [0x10+BUF],mm5
    movq [0x20+BUF],mm6
    movq [0x30+BUF],mm7
    OC_LOAD_8x4(0x04)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x08)
    /*Here the first 4x4 block of output from the last transpose is the second
      4x4 block of input for the next transform.
      We have cleverly arranged that it already be in the appropriate place, so
      we only have to do half the loads.*/
    movq mm1,[0x10+BUF]
    movq mm2,[0x20+BUF]
    movq mm3,[0x30+BUF]
    movq mm0,[0x00+BUF]
    /*We split out the stages here so we can save the DC coefficient in the
      middle.*/
    OC_HADAMARD_AB_8x4
    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
    movd RET,mm1
    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
      difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
      for the factor of two we dropped + 3 for the vertical accumulation).
      Now we finally have to promote things to dwords.
      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
      latency of pmaddwd by starting the next series of loads now.*/
    pmaddwd mm0,mm7
    movq mm1,[0x50+BUF]
    movq mm5,[0x58+BUF]
    movq mm2,[0x60+BUF]
    movq mm4,mm0
    movq mm6,[0x68+BUF]
    punpckhdq mm0,mm0
    movq mm3,[0x70+BUF]
    paddd mm4,mm0
    movq mm7,[0x78+BUF]
    movd RET2,mm4
    movq mm0,[0x40+BUF]
    movq mm4,[0x48+BUF]
    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
    pmaddwd mm0,mm7
    /*We assume that the DC coefficient is always positive (which is true,
    because the input to the INTRA transform was not a difference).*/
    movzx RET,RET_WORD
    add RET2,RET2
    sub RET2,RET
    movq mm4,mm0
    punpckhdq mm0,mm0
    paddd mm4,mm0
    movd RET,mm4
    lea RET,[-64+RET2+RET*2]
    mov [ret1],RET
#undef SRC
#undef SRC4
#undef BUF
#undef RET
#undef RET_WORD
#undef RET2
#undef YSTRIDE
#undef YSTRIDE3
  }
  return ret1;
}

void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
 const unsigned char *_src, const unsigned char *_ref,int _ystride){
  int i;
  __asm  pxor mm7,mm7
  for(i=4;i-->0;){
    __asm{
#define SRC edx
#define YSTRIDE esi
#define RESIDUE eax
#define REF ecx
      mov YSTRIDE,_ystride
      mov RESIDUE,_residue
      mov SRC,_src
      mov REF,_ref
      /*mm0=[src]*/
      movq mm0,[SRC]
      /*mm1=[ref]*/
      movq mm1,[REF]
      /*mm4=[src+ystride]*/
      movq mm4,[SRC+YSTRIDE]
      /*mm5=[ref+ystride]*/
      movq mm5,[REF+YSTRIDE]
      /*Compute [src]-[ref].*/
      movq mm2,mm0
      punpcklbw mm0,mm7
      movq mm3,mm1
      punpckhbw mm2,mm7
      punpcklbw mm1,mm7
      punpckhbw mm3,mm7
      psubw mm0,mm1
      psubw mm2,mm3
      /*Compute [src+ystride]-[ref+ystride].*/
      movq mm1,mm4
      punpcklbw mm4,mm7
      movq mm3,mm5
      punpckhbw mm1,mm7
      lea SRC,[SRC+YSTRIDE*2]
      punpcklbw mm5,mm7
      lea REF,[REF+YSTRIDE*2]
      punpckhbw mm3,mm7
      psubw mm4,mm5
      psubw mm1,mm3
      /*Write the answer out.*/
      movq [RESIDUE+0x00],mm0
      movq [RESIDUE+0x08],mm2
      movq [RESIDUE+0x10],mm4
      movq [RESIDUE+0x18],mm1
      lea RESIDUE,[RESIDUE+0x20]
      mov _residue,RESIDUE
      mov _src,SRC
      mov _ref,REF
#undef SRC
#undef YSTRIDE
#undef RESIDUE
#undef REF
    }
  }
}

void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
 const unsigned char *_src,int _ystride){
   __asm{
#define YSTRIDE edx
#define YSTRIDE3 edi
#define RESIDUE ecx
#define SRC eax
    mov YSTRIDE,_ystride
    mov RESIDUE,_residue
    mov SRC,_src
    /*mm0=[src]*/
    movq mm0,[SRC]
    /*mm1=[src+ystride]*/
    movq mm1,[SRC+YSTRIDE]
    /*mm6={-1}x4*/
    pcmpeqw mm6,mm6
    /*mm2=[src+2*ystride]*/
    movq mm2,[SRC+YSTRIDE*2]
    /*[ystride3]=3*[ystride]*/
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    /*mm6={1}x4*/
    psllw mm6,15
    /*mm3=[src+3*ystride]*/
    movq mm3,[SRC+YSTRIDE3]
    /*mm6={128}x4*/
    psrlw mm6,8
    /*mm7=0*/ 
    pxor mm7,mm7
    /*[src]=[src]+4*[ystride]*/
    lea SRC,[SRC+YSTRIDE*4]
    /*Compute [src]-128 and [src+ystride]-128*/
    movq mm4,mm0
    punpcklbw mm0,mm7
    movq mm5,mm1
    punpckhbw mm4,mm7
    psubw mm0,mm6
    punpcklbw mm1,mm7
    psubw mm4,mm6
    punpckhbw mm5,mm7
    psubw mm1,mm6
    psubw mm5,mm6
    /*Write the answer out.*/
    movq [RESIDUE+0x00],mm0
    movq [RESIDUE+0x08],mm4
    movq [RESIDUE+0x10],mm1
    movq [RESIDUE+0x18],mm5
    /*mm0=[src+4*ystride]*/
    movq mm0,[SRC]
    /*mm1=[src+5*ystride]*/
    movq mm1,[SRC+YSTRIDE]
    /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/
    movq mm4,mm2
    punpcklbw mm2,mm7
    movq mm5,mm3
    punpckhbw mm4,mm7
    psubw mm2,mm6
    punpcklbw mm3,mm7
    psubw mm4,mm6
    punpckhbw mm5,mm7
    psubw mm3,mm6
    psubw mm5,mm6
    /*Write the answer out.*/
    movq [RESIDUE+0x20],mm2
    movq [RESIDUE+0x28],mm4
    movq [RESIDUE+0x30],mm3
    movq [RESIDUE+0x38],mm5
    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
    movq mm2,[SRC+YSTRIDE*2]
    movq mm3,[SRC+YSTRIDE3]
    movq mm4,mm0
    punpcklbw mm0,mm7
    movq mm5,mm1
    punpckhbw mm4,mm7
    psubw mm0,mm6
    punpcklbw mm1,mm7
    psubw mm4,mm6
    punpckhbw mm5,mm7
    psubw mm1,mm6
    psubw mm5,mm6
    /*Write the answer out.*/
    movq [RESIDUE+0x40],mm0
    movq [RESIDUE+0x48],mm4
    movq [RESIDUE+0x50],mm1
    movq [RESIDUE+0x58],mm5
    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
    movq mm4,mm2
    punpcklbw mm2,mm7
    movq mm5,mm3
    punpckhbw mm4,mm7
    psubw mm2,mm6
    punpcklbw mm3,mm7
    psubw mm4,mm6
    punpckhbw mm5,mm7
    psubw mm3,mm6
    psubw mm5,mm6
    /*Write the answer out.*/
    movq [RESIDUE+0x60],mm2
    movq [RESIDUE+0x68],mm4
    movq [RESIDUE+0x70],mm3
    movq [RESIDUE+0x78],mm5
#undef YSTRIDE
#undef YSTRIDE3
#undef RESIDUE
#undef SRC
  }
}

void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
 const unsigned char *_src1,const unsigned char *_src2,int _ystride){
  oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride);
}

#endif