Minor fix-ups to r17728.
author Tim Terriberry <tterribe@xiph.org>
Tue, 7 Dec 2010 14:13:55 +0000 (14:13 +0000)
committer Tim Terriberry <tterribe@xiph.org>
Tue, 7 Dec 2010 14:13:55 +0000 (14:13 +0000)
Convert references to the stack buffer in the MMX fDCT to use esp-relative
 offsets, saving a register.
Move the MSVC MMX fDCT into the MMXEXT section (as was done for the gcc one),
 since it now requires pshufw for the zig-zagging.

svn path=/trunk/theora/; revision=17736

lib/x86/mmxfdct.c
lib/x86/sse2fdct.c
lib/x86/x86zigzag.h
lib/x86_vc/mmxfdct.c
lib/x86_vc/x86enc.c
lib/x86_vc/x86enc.h

index bc966e0..1766835 100644 (file)
 
 /*MMX implementation of the fDCT.*/
 void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
-  ogg_int16_t buf[64] __attribute__((aligned(8)));
+  OC_ALIGN8(ogg_int16_t buf[64]);
   ptrdiff_t   a;
   __asm__ __volatile__(
     /*Add two extra bits of working precision to improve accuracy; any more and
@@ -597,27 +597,27 @@ void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
     "psubw %%mm2,%%mm6\n\t"
     "psraw $2,%%mm4\n\t"
     "psubw %%mm2,%%mm0\n\t"
-    "movq %%mm4,0x00(%[buf])\n\t"
+    "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
     "movq 0x30(%[y]),%%mm4\n\t"
     "psraw $2,%%mm6\n\t"
     "psubw %%mm2,%%mm5\n\t"
-    "movq %%mm6,0x20(%[buf])\n\t"
+    "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
     "psraw $2,%%mm0\n\t"
     "psubw %%mm2,%%mm3\n\t"
-    "movq %%mm0,0x40(%[buf])\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x40,buf)"\n\t"
     "psraw $2,%%mm5\n\t"
     "psubw %%mm2,%%mm1\n\t"
-    "movq %%mm5,0x50(%[buf])\n\t"
+    "movq %%mm5,"OC_MEM_OFFS(0x50,buf)"\n\t"
     "psraw $2,%%mm3\n\t"
     "psubw %%mm2,%%mm7\n\t"
-    "movq %%mm3,0x60(%[buf])\n\t"
+    "movq %%mm3,"OC_MEM_OFFS(0x60,buf)"\n\t"
     "psraw $2,%%mm1\n\t"
     "psubw %%mm2,%%mm4\n\t"
-    "movq %%mm1,0x70(%[buf])\n\t"
+    "movq %%mm1,"OC_MEM_OFFS(0x70,buf)"\n\t"
     "psraw $2,%%mm7\n\t"
-    "movq %%mm7,0x10(%[buf])\n\t"
+    "movq %%mm7,"OC_MEM_OFFS(0x10,buf)"\n\t"
     "psraw $2,%%mm4\n\t"
-    "movq %%mm4,0x30(%[buf])\n\t"
+    "movq %%mm4,"OC_MEM_OFFS(0x30,buf)"\n\t"
     /*Load the next block.*/
     "movq 0x40(%[y]),%%mm0\n\t"
     "movq 0x78(%[y]),%%mm7\n\t"
@@ -638,39 +638,39 @@ void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
     "psubw %%mm2,%%mm6\n\t"
     "psraw $2,%%mm4\n\t"
     "psubw %%mm2,%%mm0\n\t"
-    "movq %%mm4,0x08(%[buf])\n\t"
+    "movq %%mm4,"OC_MEM_OFFS(0x08,buf)"\n\t"
     "movq 0x70(%[y]),%%mm4\n\t"
     "psraw $2,%%mm6\n\t"
     "psubw %%mm2,%%mm5\n\t"
-    "movq %%mm6,0x28(%[buf])\n\t"
+    "movq %%mm6,"OC_MEM_OFFS(0x28,buf)"\n\t"
     "psraw $2,%%mm0\n\t"
     "psubw %%mm2,%%mm3\n\t"
-    "movq %%mm0,0x48(%[buf])\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x48,buf)"\n\t"
     "psraw $2,%%mm5\n\t"
     "psubw %%mm2,%%mm1\n\t"
-    "movq %%mm5,0x58(%[buf])\n\t"
+    "movq %%mm5,"OC_MEM_OFFS(0x58,buf)"\n\t"
     "psraw $2,%%mm3\n\t"
     "psubw %%mm2,%%mm7\n\t"
-    "movq %%mm3,0x68(%[buf])\n\t"
+    "movq %%mm3,"OC_MEM_OFFS(0x68,buf)"\n\t"
     "psraw $2,%%mm1\n\t"
     "psubw %%mm2,%%mm4\n\t"
-    "movq %%mm1,0x78(%[buf])\n\t"
+    "movq %%mm1,"OC_MEM_OFFS(0x78,buf)"\n\t"
     "psraw $2,%%mm7\n\t"
-    "movq %%mm7,0x18(%[buf])\n\t"
+    "movq %%mm7,"OC_MEM_OFFS(0x18,buf)"\n\t"
     "psraw $2,%%mm4\n\t"
-    "movq %%mm4,0x38(%[buf])\n\t"
+    "movq %%mm4,"OC_MEM_OFFS(0x38,buf)"\n\t"
     /*Final transpose and zig-zag.*/
 #define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
-    "movq 0x"_row"0(%[buf]),"_reg"\n\t" \
+    "movq "OC_MEM_OFFS(16*_row,buf)","_reg"\n\t" \
 
 #define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
-    "movq 0x"_row"8(%[buf]),"_reg"\n\t" \
+    "movq "OC_MEM_OFFS(16*_row+8,buf)","_reg"\n\t" \
 
     OC_TRANSPOSE_ZIG_ZAG_MMXEXT
 #undef OC_ZZ_LOAD_ROW_LO
 #undef OC_ZZ_LOAD_ROW_HI
-    :[a]"=&r"(a)
-    :[y]"r"(_y),[x]"r"(_x),[buf]"r"(buf)
+    :[a]"=&r"(a),[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
+    :[y]"r"(_y),[x]"r"(_x)
     :"memory"
   );
 }
index c741ebe..64c1d27 100644 (file)
@@ -435,11 +435,11 @@ void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
     /*We could probably do better using SSSE3's palignr, but re-using MMXEXT
        version will do for now.*/
 #define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
-    "movdq2q %%xmm"_row","_reg"\n\t" \
+    "movdq2q %%xmm"#_row","_reg"\n\t" \
 
 #define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
-    "punpckhqdq %%xmm"_row",%%xmm"_row"\n\t" \
-    "movdq2q %%xmm"_row","_reg"\n\t" \
+    "punpckhqdq %%xmm"#_row",%%xmm"#_row"\n\t" \
+    "movdq2q %%xmm"#_row","_reg"\n\t" \
 
     OC_TRANSPOSE_ZIG_ZAG_MMXEXT
 #undef OC_ZZ_LOAD_ROW_LO
index a9addc5..fb21e0b 100644 (file)
@@ -23,9 +23,9 @@
 /*Converts DCT coefficients from transposed order into zig-zag scan order and
    stores them in %[y].
   This relies on two macros to load the contents of each row:
-   OC_ZZ_LOAD_ROW_LO(row,reg) and OC_ZZ_LOAD_ROW_HI(row,reg), which load the
-   first four and second four entries of each row into the specified register,
-   respectively.
+   OC_ZZ_LOAD_ROW_LO(row,"reg") and OC_ZZ_LOAD_ROW_HI(row,"reg"), which load
+   the first four and second four entries of each row into the specified
+   register, respectively.
   OC_ZZ_LOAD_ROW_LO must be called before OC_ZZ_LOAD_ROW_HI for the same row
    (because when the rows are already in SSE2 registers, loading the high half
    destructively modifies the register).
   The order of the coefficients within each tuple is reversed in the comments
    below to reflect the usual MSB to LSB notation.*/
 #define OC_TRANSPOSE_ZIG_ZAG_MMXEXT \
-  OC_ZZ_LOAD_ROW_LO("0","%%mm0") /*mm0=03 02 01 00*/ \
-  OC_ZZ_LOAD_ROW_LO("1","%%mm1") /*mm1=11 10 09 08*/ \
-  OC_ZZ_LOAD_ROW_LO("2","%%mm2") /*mm2=19 18 17 16*/ \
-  OC_ZZ_LOAD_ROW_LO("3","%%mm3") /*mm3=27 26 25 24*/ \
-  OC_ZZ_LOAD_ROW_HI("0","%%mm4") /*mm4=07 06 05 04*/ \
-  OC_ZZ_LOAD_ROW_HI("1","%%mm5") /*mm5=15 14 13 12*/ \
-  OC_ZZ_LOAD_ROW_HI("2","%%mm6") /*mm6=23 22 21 20*/ \
+  OC_ZZ_LOAD_ROW_LO(0,"%%mm0")   /*mm0=03 02 01 00*/ \
+  OC_ZZ_LOAD_ROW_LO(1,"%%mm1")   /*mm1=11 10 09 08*/ \
+  OC_ZZ_LOAD_ROW_LO(2,"%%mm2")   /*mm2=19 18 17 16*/ \
+  OC_ZZ_LOAD_ROW_LO(3,"%%mm3")   /*mm3=27 26 25 24*/ \
+  OC_ZZ_LOAD_ROW_HI(0,"%%mm4")   /*mm4=07 06 05 04*/ \
+  OC_ZZ_LOAD_ROW_HI(1,"%%mm5")   /*mm5=15 14 13 12*/ \
+  OC_ZZ_LOAD_ROW_HI(2,"%%mm6")   /*mm6=23 22 21 20*/ \
   "movq %%mm0,%%mm7\n\t"         /*mm7=03 02 01 00*/ \
   "punpckhdq %%mm1,%%mm0\n\t"    /*mm0=11 10 03 02*/ \
   "pshufw $0x39,%%mm4,%%mm4\n\t" /*mm4=04 07 06 05*/ \
@@ -64,9 +64,9 @@
   "punpcklwd %%mm3,%%mm1\n\t"    /*mm1=25 07 24 09*/ \
   "punpcklwd %%mm6,%%mm5\n\t"    /*mm5=21 14 20 13*/ \
   "punpcklwd %%mm2,%%mm1\n\t"    /*mm1=17 24 16 09 *B*/ \
-  OC_ZZ_LOAD_ROW_LO("4","%%mm2") /*mm2=35 34 33 32*/ \
+  OC_ZZ_LOAD_ROW_LO(4,"%%mm2")   /*mm2=35 34 33 32*/ \
   "movq %%mm1,0x08(%[y])\n\t" \
-  OC_ZZ_LOAD_ROW_LO("5","%%mm1") /*mm1=43 42 41 40*/ \
+  OC_ZZ_LOAD_ROW_LO(5,"%%mm1")   /*mm1=43 42 41 40*/ \
   "pshufw $0x78,%%mm0,%%mm0\n\t" /*mm0=11 04 03 10 *C*/ \
   "movq %%mm0,0x10(%[y])\n\t" \
   "punpckhdq %%mm4,%%mm6\n\t"    /*mm6=?? 07 23 22*/ \
   "punpckhwd %%mm1,%%mm3\n\t"    /*mm3=43 .. 42 27*/ \
   "punpckldq %%mm2,%%mm4\n\t"    /*mm4=25 32 40 18*/ \
   "punpcklwd %%mm0,%%mm3\n\t"    /*mm3=35 42 34 27*/ \
-  OC_ZZ_LOAD_ROW_LO("6","%%mm0") /*mm0=51 50 49 48*/ \
+  OC_ZZ_LOAD_ROW_LO(6,"%%mm0")   /*mm0=51 50 49 48*/ \
   "pshufw $0x6C,%%mm4,%%mm4\n\t" /*mm4=40 32 25 18 *E*/ \
   "movq %%mm4,0x18(%[y])\n\t" \
-  OC_ZZ_LOAD_ROW_LO("7","%%mm4") /*mm4=59 58 57 56*/ \
+  OC_ZZ_LOAD_ROW_LO(7,"%%mm4")   /*mm4=59 58 57 56*/ \
   "punpckhdq %%mm7,%%mm2\n\t"    /*mm2=12 19 26 33 *F*/ \
   "movq %%mm2,0x20(%[y])\n\t" \
   "pshufw $0xD0,%%mm1,%%mm1\n\t" /*mm1=43 41 ?? ??*/ \
   "movq %%mm3,0x30(%[y])\n\t" \
   "punpckhdq %%mm4,%%mm1\n\t"    /*mm1=58 57 50 43 *H*/ \
   "movq %%mm1,0x50(%[y])\n\t" \
-  OC_ZZ_LOAD_ROW_HI("7","%%mm1") /*mm1=63 62 61 60*/ \
+  OC_ZZ_LOAD_ROW_HI(7,"%%mm1")   /*mm1=63 62 61 60*/ \
   "punpcklwd %%mm0,%%mm4\n\t"    /*mm4=49 56 51 59*/ \
-  OC_ZZ_LOAD_ROW_HI("6","%%mm0") /*mm0=55 54 53 52*/ \
+  OC_ZZ_LOAD_ROW_HI(6,"%%mm0")   /*mm0=55 54 53 52*/ \
   "psllq $16,%%mm6\n\t"          /*mm6=07 23 22 ..*/ \
   "movq %%mm4,%%mm3\n\t"         /*mm3=49 56 51 59*/ \
   "punpckhdq %%mm2,%%mm4\n\t"    /*mm4=35 42 49 56 *I*/ \
-  OC_ZZ_LOAD_ROW_HI("3","%%mm2") /*mm2=31 30 29 28*/ \
+  OC_ZZ_LOAD_ROW_HI(3,"%%mm2")   /*mm2=31 30 29 28*/ \
   "movq %%mm4,0x38(%[y])\n\t" \
   "punpcklwd %%mm1,%%mm3\n\t"    /*mm3=61 51 60 59*/ \
   "punpcklwd %%mm6,%%mm7\n\t"    /*mm7=22 15 .. ??*/ \
   "movq %%mm3,%%mm4\n\t"         /*mm4=61 51 60 59*/ \
   "punpcklwd %%mm0,%%mm3\n\t"    /*mm3=53 60 52 59*/ \
   "punpckhwd %%mm0,%%mm4\n\t"    /*mm4=55 61 54 51*/ \
-  OC_ZZ_LOAD_ROW_HI("4","%%mm0") /*mm0=39 38 37 36*/ \
+  OC_ZZ_LOAD_ROW_HI(4,"%%mm0")   /*mm0=39 38 37 36*/ \
   "pshufw $0xE1,%%mm3,%%mm3\n\t" /*mm3=53 60 59 52 *J*/ \
   "movq %%mm3,0x68(%[y])\n\t" \
   "movq %%mm4,%%mm3\n\t"         /*mm3=?? ?? 54 51*/ \
   "pshufw $0x39,%%mm2,%%mm2\n\t" /*mm2=28 31 30 29*/ \
   "punpckhwd %%mm1,%%mm4\n\t"    /*mm4=63 55 62 61 *K*/ \
-  OC_ZZ_LOAD_ROW_HI("5","%%mm1") /*mm1=47 46 45 44*/ \
+  OC_ZZ_LOAD_ROW_HI(5,"%%mm1")   /*mm1=47 46 45 44*/ \
   "movq %%mm4,0x78(%[y])\n\t" \
   "punpckhwd %%mm2,%%mm6\n\t"    /*mm6=28 07 31 23*/ \
   "punpcklwd %%mm0,%%mm2\n\t"    /*mm2=37 30 36 29*/ \
index 07d9faf..c4cb2d1 100644 (file)
 \r
 /*MMX implementation of the fDCT.*/\r
 void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){\r
-  __declspec (align(8)) ogg_int16_t buf[64];\r
+  OC_ALIGN8(ogg_int16_t buf[64]);\r
   ptrdiff_t a;\r
   __asm{\r
 #define X edx\r
index 06595d6..e9d59e8 100644 (file)
@@ -27,7 +27,6 @@ void oc_enc_accel_init_x86(oc_enc_ctx *_enc){
     _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
     _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
     _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
-    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmx;
   }
   if(cpu_flags&OC_CPU_X86_MMXEXT){
     _enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext;
@@ -37,6 +36,7 @@ void oc_enc_accel_init_x86(oc_enc_ctx *_enc){
     _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_mmxext;
     _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext;
     _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext;
+    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmxext;
   }
   if(cpu_flags&OC_CPU_X86_SSE2){
 # if defined(OC_X86_64_ASM)
index 3d3f5c4..885406a 100644 (file)
@@ -45,7 +45,7 @@ void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64],
  const unsigned char *_x,int _stride);
 void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
  const unsigned char *_src1,const unsigned char *_src2,int _ystride);
-void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
 void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
 
 #endif