Move zig-zagging from quantization into the fDCT.
[theora.git] / lib / x86 / sse2fdct.c
1 /********************************************************************
2  *                                                                  *
3  * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4  * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7  *                                                                  *
8  * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006                *
9  * by the Xiph.Org Foundation http://www.xiph.org/                  *
10  *                                                                  *
11  ********************************************************************/
12 /*SSE2 fDCT implementation for x86_64.*/
13 /*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
14 #include <stddef.h>
15 #include "x86enc.h"
16 #include "x86zigzag.h"
17 #include "sse2trans.h"
18
19 #if defined(OC_X86_64_ASM)
20
/*Forward 1-D DCT on eight rows of eight 16-bit coefficients at once, one
   row per register in xmm0...xmm7.
  Entry conditions (the caller must set these up): xmm15={0}x8,
   xmm14={-1}x8, and %[a] is an integer scratch register.
  Exit: the transformed rows are left in xmm0...xmm7 with result row i in
   xmm<i> (see the _y[i] comments below); xmm8...xmm13 and %[a] are
   clobbered, while xmm14 and xmm15 are only read and so are preserved.*/
# define OC_FDCT_8x8 \
 /*Note: xmm15={0}x8 and xmm14={-1}x8.*/ \
 "#OC_FDCT_8x8\n\t" \
 /*Stage 1:*/ \
 "movdqa %%xmm0,%%xmm11\n\t" \
 "movdqa %%xmm1,%%xmm10\n\t" \
 "movdqa %%xmm2,%%xmm9\n\t" \
 "movdqa %%xmm3,%%xmm8\n\t" \
 /*xmm11=t7'=t0-t7*/ \
 "psubw %%xmm7,%%xmm11\n\t" \
 /*xmm10=t6'=t1-t6*/ \
 "psubw %%xmm6,%%xmm10\n\t" \
 /*xmm9=t5'=t2-t5*/ \
 "psubw %%xmm5,%%xmm9\n\t" \
 /*xmm8=t4'=t3-t4*/ \
 "psubw %%xmm4,%%xmm8\n\t" \
 /*xmm0=t0'=t0+t7*/ \
 "paddw %%xmm7,%%xmm0\n\t" \
 /*xmm1=t1'=t1+t6*/ \
 "paddw %%xmm6,%%xmm1\n\t" \
 /*xmm5=t2'=t2+t5*/ \
 "paddw %%xmm2,%%xmm5\n\t" \
 /*xmm4=t3'=t3+t4*/ \
 "paddw %%xmm3,%%xmm4\n\t" \
 /*xmm2,3,6,7 are now free.*/ \
 /*Stage 2:*/ \
 "movdqa %%xmm0,%%xmm3\n\t" \
 "mov $0x5A806A0A,%[a]\n\t" \
 "movdqa %%xmm1,%%xmm2\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "movdqa %%xmm10,%%xmm6\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 /*xmm2=t2''=t1'-t2'*/ \
 "psubw %%xmm5,%%xmm2\n\t" \
 /*The interleaved pxor/psubw/paddw on xmm12 below build xmm12={2}x8; \
    pairing each data word with 2 makes pmaddwd against {0x6A0A,0x5A80} \
    compute t*27146+2*0x5A80=t*27146+0xB500.*/ \
 "pxor %%xmm12,%%xmm12\n\t" \
 /*xmm3=t3''=t0'-t3'*/ \
 "psubw %%xmm4,%%xmm3\n\t" \
 "psubw %%xmm14,%%xmm12\n\t" \
 /*xmm10=t5''=t6'-t5'*/ \
 "psubw %%xmm9,%%xmm10\n\t" \
 "paddw %%xmm12,%%xmm12\n\t" \
 /*xmm4=t0''=t0'+t3'*/ \
 "paddw %%xmm0,%%xmm4\n\t" \
 /*xmm1=t1''=t1'+t2'*/ \
 "paddw %%xmm5,%%xmm1\n\t" \
 /*xmm6=t6''=t6'+t5'*/ \
 "paddw %%xmm9,%%xmm6\n\t" \
 /*xmm0,xmm5,xmm9 are now free.*/ \
 /*Stage 3:*/ \
 /*xmm10:xmm5=t5''*27146+0xB500 \
   xmm0=t5''*/ \
 "movdqa %%xmm10,%%xmm5\n\t" \
 "movdqa %%xmm10,%%xmm0\n\t" \
 "punpckhwd %%xmm12,%%xmm10\n\t" \
 "pmaddwd %%xmm13,%%xmm10\n\t" \
 "punpcklwd %%xmm12,%%xmm5\n\t" \
 "pmaddwd %%xmm13,%%xmm5\n\t" \
 /*xmm5=(t5''*27146+0xB500>>16)+t5''*/ \
 "psrad $16,%%xmm10\n\t" \
 "psrad $16,%%xmm5\n\t" \
 "packssdw %%xmm10,%%xmm5\n\t" \
 "paddw %%xmm0,%%xmm5\n\t" \
 /*xmm0=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
 "pcmpeqw %%xmm15,%%xmm0\n\t" \
 "psubw %%xmm14,%%xmm0\n\t" \
 "paddw %%xmm5,%%xmm0\n\t" \
 "movdqa %%xmm8,%%xmm5\n\t" \
 "psraw $1,%%xmm0\n\t" \
 /*xmm5=t5'''=t4'-s*/ \
 "psubw %%xmm0,%%xmm5\n\t" \
 /*xmm8=t4''=t4'+s*/ \
 "paddw %%xmm0,%%xmm8\n\t" \
 /*xmm0,xmm7,xmm9,xmm10 are free.*/ \
 /*xmm7:xmm9=t6''*27146+0xB500*/ \
 "movdqa %%xmm6,%%xmm7\n\t" \
 "movdqa %%xmm6,%%xmm9\n\t" \
 "punpckhwd %%xmm12,%%xmm7\n\t" \
 "pmaddwd %%xmm13,%%xmm7\n\t" \
 "punpcklwd %%xmm12,%%xmm9\n\t" \
 "pmaddwd %%xmm13,%%xmm9\n\t" \
 /*xmm9=(t6''*27146+0xB500>>16)+t6''*/ \
 "psrad $16,%%xmm7\n\t" \
 "psrad $16,%%xmm9\n\t" \
 "packssdw %%xmm7,%%xmm9\n\t" \
 "paddw %%xmm6,%%xmm9\n\t" \
 /*xmm9=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
 "pcmpeqw %%xmm15,%%xmm6\n\t" \
 "psubw %%xmm14,%%xmm6\n\t" \
 "paddw %%xmm6,%%xmm9\n\t" \
 "movdqa %%xmm11,%%xmm7\n\t" \
 "psraw $1,%%xmm9\n\t" \
 /*xmm7=t6'''=t7'-s*/ \
 "psubw %%xmm9,%%xmm7\n\t" \
 /*xmm9=t7''=t7'+s*/ \
 "paddw %%xmm11,%%xmm9\n\t" \
 /*xmm0,xmm6,xmm10,xmm11 are free.*/ \
 /*Stage 4:*/ \
 /*xmm10:xmm0=t1''*27146+0xB500*/ \
 "movdqa %%xmm1,%%xmm0\n\t" \
 "movdqa %%xmm1,%%xmm10\n\t" \
 "punpcklwd %%xmm12,%%xmm0\n\t" \
 "pmaddwd %%xmm13,%%xmm0\n\t" \
 "punpckhwd %%xmm12,%%xmm10\n\t" \
 "pmaddwd %%xmm13,%%xmm10\n\t" \
 /*xmm0=(t1''*27146+0xB500>>16)+t1''*/ \
 "psrad $16,%%xmm0\n\t" \
 "psrad $16,%%xmm10\n\t" \
 "mov $0x20006A0A,%[a]\n\t" \
 "packssdw %%xmm10,%%xmm0\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "paddw %%xmm1,%%xmm0\n\t" \
 /*xmm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
 "pcmpeqw %%xmm15,%%xmm1\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "psubw %%xmm14,%%xmm1\n\t" \
 "paddw %%xmm1,%%xmm0\n\t" \
 /*xmm10:xmm4=t0''*27146+0x4000*/ \
 "movdqa %%xmm4,%%xmm1\n\t" \
 "movdqa %%xmm4,%%xmm10\n\t" \
 "punpcklwd %%xmm12,%%xmm4\n\t" \
 "pmaddwd %%xmm13,%%xmm4\n\t" \
 "punpckhwd %%xmm12,%%xmm10\n\t" \
 "pmaddwd %%xmm13,%%xmm10\n\t" \
 /*xmm4=(t0''*27146+0x4000>>16)+t0''*/ \
 "psrad $16,%%xmm4\n\t" \
 "psrad $16,%%xmm10\n\t" \
 "mov $0x6CB7,%[a]\n\t" \
 "packssdw %%xmm10,%%xmm4\n\t" \
 "movd %[a],%%xmm12\n\t" \
 "paddw %%xmm1,%%xmm4\n\t" \
 /*xmm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
 "pcmpeqw %%xmm15,%%xmm1\n\t" \
 "pshufd $00,%%xmm12,%%xmm12\n\t" \
 "psubw %%xmm14,%%xmm1\n\t" \
 "mov $0x7FFF6C84,%[a]\n\t" \
 "paddw %%xmm1,%%xmm4\n\t" \
 /*xmm0=_y[0]=u=r+s>>1 \
   The naive implementation could cause overflow, so we use \
    u=(r&s)+((r^s)>>1).*/ \
 "movdqa %%xmm0,%%xmm6\n\t" \
 "pxor %%xmm4,%%xmm0\n\t" \
 "pand %%xmm4,%%xmm6\n\t" \
 "psraw $1,%%xmm0\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "paddw %%xmm6,%%xmm0\n\t" \
 /*xmm4=_y[4]=v=r-u*/ \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "psubw %%xmm0,%%xmm4\n\t" \
 /*xmm1,xmm6,xmm10,xmm11 are free.*/ \
 /*xmm6:xmm10=60547*t3''+0x6CB7*/ \
 "movdqa %%xmm3,%%xmm10\n\t" \
 "movdqa %%xmm3,%%xmm6\n\t" \
 "punpcklwd %%xmm3,%%xmm10\n\t" \
 "pmaddwd %%xmm13,%%xmm10\n\t" \
 "mov $0x61F861F8,%[a]\n\t" \
 "punpckhwd %%xmm3,%%xmm6\n\t" \
 "pmaddwd %%xmm13,%%xmm6\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "paddd %%xmm12,%%xmm10\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "paddd %%xmm12,%%xmm6\n\t" \
 /*xmm1:xmm2=25080*t2'' \
   xmm12=t2''*/ \
 "movdqa %%xmm2,%%xmm11\n\t" \
 "movdqa %%xmm2,%%xmm12\n\t" \
 "pmullw %%xmm13,%%xmm2\n\t" \
 "pmulhw %%xmm13,%%xmm11\n\t" \
 "movdqa %%xmm2,%%xmm1\n\t" \
 "punpcklwd %%xmm11,%%xmm2\n\t" \
 "punpckhwd %%xmm11,%%xmm1\n\t" \
 /*xmm10=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
 "paddd %%xmm2,%%xmm10\n\t" \
 "paddd %%xmm1,%%xmm6\n\t" \
 "psrad $16,%%xmm10\n\t" \
 "pcmpeqw %%xmm15,%%xmm3\n\t" \
 "psrad $16,%%xmm6\n\t" \
 "psubw %%xmm14,%%xmm3\n\t" \
 "packssdw %%xmm6,%%xmm10\n\t" \
 "paddw %%xmm3,%%xmm10\n\t" \
 /*xmm2=_y[2]=u \
   xmm10=s=(25080*u>>16)-t2''*/ \
 "movdqa %%xmm10,%%xmm2\n\t" \
 "pmulhw %%xmm13,%%xmm10\n\t" \
 "psubw %%xmm12,%%xmm10\n\t" \
 /*xmm1:xmm6=s*21600+0x2800*/ \
 "pxor %%xmm12,%%xmm12\n\t" \
 "psubw %%xmm14,%%xmm12\n\t" \
 "mov $0x28005460,%[a]\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "movdqa %%xmm10,%%xmm6\n\t" \
 "movdqa %%xmm10,%%xmm1\n\t" \
 "punpcklwd %%xmm12,%%xmm6\n\t" \
 "pmaddwd %%xmm13,%%xmm6\n\t" \
 "mov $0x0E3D,%[a]\n\t" \
 "punpckhwd %%xmm12,%%xmm1\n\t" \
 "pmaddwd %%xmm13,%%xmm1\n\t" \
 /*xmm6=(s*21600+0x2800>>18)+s*/ \
 "psrad $18,%%xmm6\n\t" \
 "psrad $18,%%xmm1\n\t" \
 "movd %[a],%%xmm12\n\t" \
 "packssdw %%xmm1,%%xmm6\n\t" \
 "pshufd $00,%%xmm12,%%xmm12\n\t" \
 "paddw %%xmm10,%%xmm6\n\t" \
 /*xmm6=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
 "mov $0x7FFF54DC,%[a]\n\t" \
 "pcmpeqw %%xmm15,%%xmm10\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "psubw %%xmm14,%%xmm10\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "paddw %%xmm10,%%xmm6\n\t " \
 /*xmm1,xmm3,xmm10,xmm11 are free.*/ \
 /*xmm11:xmm10=54491*t5'''+0x0E3D*/ \
 "movdqa %%xmm5,%%xmm10\n\t" \
 "movdqa %%xmm5,%%xmm11\n\t" \
 "punpcklwd %%xmm5,%%xmm10\n\t" \
 "pmaddwd %%xmm13,%%xmm10\n\t" \
 "mov $0x8E3A8E3A,%[a]\n\t" \
 "punpckhwd %%xmm5,%%xmm11\n\t" \
 "pmaddwd %%xmm13,%%xmm11\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "paddd %%xmm12,%%xmm10\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "paddd %%xmm12,%%xmm11\n\t" \
 /*xmm7:xmm12=36410*t6''' \
   xmm1=t6'''*/ \
 "movdqa %%xmm7,%%xmm3\n\t" \
 "movdqa %%xmm7,%%xmm1\n\t" \
 "pmulhw %%xmm13,%%xmm3\n\t" \
 "pmullw %%xmm13,%%xmm7\n\t" \
 "paddw %%xmm1,%%xmm3\n\t" \
 "movdqa %%xmm7,%%xmm12\n\t" \
 "punpckhwd %%xmm3,%%xmm7\n\t" \
 "punpcklwd %%xmm3,%%xmm12\n\t" \
 /*xmm10=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
 "paddd %%xmm12,%%xmm10\n\t" \
 "paddd %%xmm7,%%xmm11\n\t" \
 "psrad $16,%%xmm10\n\t" \
 "pcmpeqw %%xmm15,%%xmm5\n\t" \
 "psrad $16,%%xmm11\n\t" \
 "psubw %%xmm14,%%xmm5\n\t" \
 "packssdw %%xmm11,%%xmm10\n\t" \
 "pxor %%xmm12,%%xmm12\n\t" \
 "paddw %%xmm5,%%xmm10\n\t" \
 /*xmm5=_y[5]=u \
   xmm1=s=t6'''-(36410*u>>16)*/ \
 "psubw %%xmm14,%%xmm12\n\t" \
 "movdqa %%xmm10,%%xmm5\n\t" \
 "mov $0x340067C8,%[a]\n\t" \
 "pmulhw %%xmm13,%%xmm10\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "paddw %%xmm5,%%xmm10\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "psubw %%xmm10,%%xmm1\n\t" \
 /*xmm11:xmm3=s*26568+0x3400*/ \
 "movdqa %%xmm1,%%xmm3\n\t" \
 "movdqa %%xmm1,%%xmm11\n\t" \
 "punpcklwd %%xmm12,%%xmm3\n\t" \
 "pmaddwd %%xmm13,%%xmm3\n\t" \
 "mov $0x7B1B,%[a]\n\t" \
 "punpckhwd %%xmm12,%%xmm11\n\t" \
 "pmaddwd %%xmm13,%%xmm11\n\t" \
 /*xmm3=(s*26568+0x3400>>17)+s*/ \
 "psrad $17,%%xmm3\n\t" \
 "psrad $17,%%xmm11\n\t" \
 "movd %[a],%%xmm12\n\t" \
 "packssdw %%xmm11,%%xmm3\n\t" \
 "pshufd $00,%%xmm12,%%xmm12\n\t" \
 "paddw %%xmm1,%%xmm3\n\t" \
 /*xmm3=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
 "mov $0x7FFF7B16,%[a]\n\t" \
 "pcmpeqw %%xmm15,%%xmm1\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "psubw %%xmm14,%%xmm1\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "paddw %%xmm1,%%xmm3\n\t " \
 /*xmm1,xmm7,xmm10,xmm11 are free.*/ \
 /*xmm11:xmm10=64277*t7''+0x7B1B*/ \
 "movdqa %%xmm9,%%xmm10\n\t" \
 "movdqa %%xmm9,%%xmm11\n\t" \
 "punpcklwd %%xmm9,%%xmm10\n\t" \
 "pmaddwd %%xmm13,%%xmm10\n\t" \
 "mov $0x31F131F1,%[a]\n\t" \
 "punpckhwd %%xmm9,%%xmm11\n\t" \
 "pmaddwd %%xmm13,%%xmm11\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "paddd %%xmm12,%%xmm10\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "paddd %%xmm12,%%xmm11\n\t" \
 /*xmm12:xmm7=12785*t4''*/ \
 "movdqa %%xmm8,%%xmm7\n\t" \
 "movdqa %%xmm8,%%xmm1\n\t" \
 "pmullw %%xmm13,%%xmm7\n\t" \
 "pmulhw %%xmm13,%%xmm1\n\t" \
 "movdqa %%xmm7,%%xmm12\n\t" \
 "punpcklwd %%xmm1,%%xmm7\n\t" \
 "punpckhwd %%xmm1,%%xmm12\n\t" \
 /*xmm10=u=(12785*t4''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
 "paddd %%xmm7,%%xmm10\n\t" \
 "paddd %%xmm12,%%xmm11\n\t" \
 "psrad $16,%%xmm10\n\t" \
 "pcmpeqw %%xmm15,%%xmm9\n\t" \
 "psrad $16,%%xmm11\n\t" \
 "psubw %%xmm14,%%xmm9\n\t" \
 "packssdw %%xmm11,%%xmm10\n\t" \
 "pxor %%xmm12,%%xmm12\n\t" \
 "paddw %%xmm9,%%xmm10\n\t" \
 /*xmm1=_y[1]=u \
   xmm10=s=(12785*u>>16)-t4''*/ \
 "psubw %%xmm14,%%xmm12\n\t" \
 "movdqa %%xmm10,%%xmm1\n\t" \
 "mov $0x3000503B,%[a]\n\t" \
 "pmulhw %%xmm13,%%xmm10\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "psubw %%xmm8,%%xmm10\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 /*xmm8:xmm7=s*20539+0x3000*/ \
 "movdqa %%xmm10,%%xmm7\n\t" \
 "movdqa %%xmm10,%%xmm8\n\t" \
 "punpcklwd %%xmm12,%%xmm7\n\t" \
 "pmaddwd %%xmm13,%%xmm7\n\t" \
 "punpckhwd %%xmm12,%%xmm8\n\t" \
 "pmaddwd %%xmm13,%%xmm8\n\t" \
 /*xmm7=(s*20539+0x3000>>20)+s*/ \
 "psrad $20,%%xmm7\n\t" \
 "psrad $20,%%xmm8\n\t" \
 "packssdw %%xmm8,%%xmm7\n\t" \
 "paddw %%xmm10,%%xmm7\n\t" \
 /*xmm7=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
 "pcmpeqw %%xmm15,%%xmm10\n\t" \
 "psubw %%xmm14,%%xmm10\n\t" \
 "paddw %%xmm10,%%xmm7\n\t " \

354 /*SSE2 implementation of the fDCT for x86-64 only.
355   Because of the 8 extra XMM registers on x86-64, this version can operate
356    without any temporary stack access at all.*/
357 void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
358   ptrdiff_t a;
359   __asm__ __volatile__(
360     /*Load the input.*/
361     "movdqa 0x00(%[x]),%%xmm0\n\t"
362     "movdqa 0x10(%[x]),%%xmm1\n\t"
363     "movdqa 0x20(%[x]),%%xmm2\n\t"
364     "movdqa 0x30(%[x]),%%xmm3\n\t"
365     "movdqa 0x40(%[x]),%%xmm4\n\t"
366     "movdqa 0x50(%[x]),%%xmm5\n\t"
367     "movdqa 0x60(%[x]),%%xmm6\n\t"
368     "movdqa 0x70(%[x]),%%xmm7\n\t"
369     /*Add two extra bits of working precision to improve accuracy; any more and
370        we could overflow.*/
371     /*We also add a few biases to correct for some systematic error that
372        remains in the full fDCT->iDCT round trip.*/
373     /*xmm15={0}x8*/
374     "pxor %%xmm15,%%xmm15\n\t"
375     /*xmm14={-1}x8*/
376     "pcmpeqb %%xmm14,%%xmm14\n\t"
377     "psllw $2,%%xmm0\n\t"
378     /*xmm8=xmm0*/
379     "movdqa %%xmm0,%%xmm8\n\t"
380     "psllw $2,%%xmm1\n\t"
381     /*xmm8={_x[7...0]==0}*/
382     "pcmpeqw %%xmm15,%%xmm8\n\t"
383     "psllw $2,%%xmm2\n\t"
384     /*xmm8={_x[7...0]!=0}*/
385     "psubw %%xmm14,%%xmm8\n\t"
386     "psllw $2,%%xmm3\n\t"
387     /*%[a]=1*/
388     "mov $1,%[a]\n\t"
389     /*xmm8={_x[6]!=0,0,_x[4]!=0,0,_x[2]!=0,0,_x[0]!=0,0}*/
390     "pslld $16,%%xmm8\n\t"
391     "psllw $2,%%xmm4\n\t"
392     /*xmm9={0,0,0,0,0,0,0,1}*/
393     "movd %[a],%%xmm9\n\t"
394     /*xmm8={0,0,_x[2]!=0,0,_x[0]!=0,0}*/
395     "pshufhw $0x00,%%xmm8,%%xmm8\n\t"
396     "psllw $2,%%xmm5\n\t"
397     /*%[a]={1}x2*/
398     "mov $0x10001,%[a]\n\t"
399     /*xmm8={0,0,0,0,0,0,0,_x[0]!=0}*/
400     "pshuflw $0x01,%%xmm8,%%xmm8\n\t"
401     "psllw $2,%%xmm6\n\t"
402     /*xmm10={0,0,0,0,0,0,1,1}*/
403     "movd %[a],%%xmm10\n\t"
404     /*xmm0=_x[7...0]+{0,0,0,0,0,0,0,_x[0]!=0}*/
405     "paddw %%xmm8,%%xmm0\n\t"
406     "psllw $2,%%xmm7\n\t"
407     /*xmm0=_x[7...0]+{0,0,0,0,0,0,1,(_x[0]!=0)+1}*/
408     "paddw %%xmm10,%%xmm0\n\t"
409     /*xmm1=_x[15...8]-{0,0,0,0,0,0,0,1}*/
410     "psubw %%xmm9,%%xmm1\n\t"
411     /*Transform columns.*/
412     OC_FDCT_8x8
413     /*Transform rows.*/
414     OC_TRANSPOSE_8x8
415     OC_FDCT_8x8
416     /*xmm14={-2,-2,-2,-2,-2,-2,-2,-2}*/
417     "paddw %%xmm14,%%xmm14\n\t"
418     "psubw %%xmm14,%%xmm0\n\t"
419     "psubw %%xmm14,%%xmm1\n\t"
420     "psraw $2,%%xmm0\n\t"
421     "psubw %%xmm14,%%xmm2\n\t"
422     "psraw $2,%%xmm1\n\t"
423     "psubw %%xmm14,%%xmm3\n\t"
424     "psraw $2,%%xmm2\n\t"
425     "psubw %%xmm14,%%xmm4\n\t"
426     "psraw $2,%%xmm3\n\t"
427     "psubw %%xmm14,%%xmm5\n\t"
428     "psraw $2,%%xmm4\n\t"
429     "psubw %%xmm14,%%xmm6\n\t"
430     "psraw $2,%%xmm5\n\t"
431     "psubw %%xmm14,%%xmm7\n\t"
432     "psraw $2,%%xmm6\n\t"
433     "psraw $2,%%xmm7\n\t"
434     /*Transpose, zig-zag, and store the result.*/
435     /*We could probably do better using SSSE3's palignr, but re-using MMXEXT
436        version will do for now.*/
437 #define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
438     "movdq2q %%xmm"_row","_reg"\n\t" \
439
440 #define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
441     "punpckhqdq %%xmm"_row",%%xmm"_row"\n\t" \
442     "movdq2q %%xmm"_row","_reg"\n\t" \
443
444     OC_TRANSPOSE_ZIG_ZAG_MMXEXT
445 #undef OC_ZZ_LOAD_ROW_LO
446 #undef OC_ZZ_LOAD_ROW_HI
447     :[a]"=&r"(a)
448     :[y]"r"(_y),[x]"r"(_x)
449     :"memory"
450   );
451 }
452 #endif