/*theora.git: lib/x86/sse2fdct.c (commit: "Add a new libtheora_info example
   program.")*/
1 /********************************************************************
2  *                                                                  *
3  * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4  * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7  *                                                                  *
8  * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006                *
9  * by the Xiph.Org Foundation http://www.xiph.org/                  *
10  *                                                                  *
11  ********************************************************************/
12 /*SSE2 fDCT implementation for x86_64.*/
13 /*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
14 #include <stddef.h>
15 #include "x86enc.h"
16 #include "sse2trans.h"
17
18 #if defined(OC_X86_64_ASM)
19
20 # define OC_FDCT_8x8 \
21  /*Note: xmm15={0}x8 and xmm14={-1}x8.*/ \
22  "#OC_FDCT_8x8\n\t" \
23  /*Stage 1:*/ \
24  "movdqa %%xmm0,%%xmm11\n\t" \
25  "movdqa %%xmm1,%%xmm10\n\t" \
26  "movdqa %%xmm2,%%xmm9\n\t" \
27  "movdqa %%xmm3,%%xmm8\n\t" \
28  /*xmm11=t7'=t0-t7*/ \
29  "psubw %%xmm7,%%xmm11\n\t" \
30  /*xmm10=t6'=t1-t6*/ \
31  "psubw %%xmm6,%%xmm10\n\t" \
32  /*xmm9=t5'=t2-t5*/ \
33  "psubw %%xmm5,%%xmm9\n\t" \
34  /*xmm8=t4'=t3-t4*/ \
35  "psubw %%xmm4,%%xmm8\n\t" \
36  /*xmm0=t0'=t0+t7*/ \
37  "paddw %%xmm7,%%xmm0\n\t" \
38  /*xmm1=t1'=t1+t6*/ \
39  "paddw %%xmm6,%%xmm1\n\t" \
40  /*xmm5=t2'=t2+t5*/ \
41  "paddw %%xmm2,%%xmm5\n\t" \
42  /*xmm4=t3'=t3+t4*/ \
43  "paddw %%xmm3,%%xmm4\n\t" \
44  /*xmm2,3,6,7 are now free.*/ \
45  /*Stage 2:*/ \
46  "movdqa %%xmm0,%%xmm3\n\t" \
47  "mov $0x5A806A0A,%[a]\n\t" \
48  "movdqa %%xmm1,%%xmm2\n\t" \
49  "movd %[a],%%xmm13\n\t" \
50  "movdqa %%xmm10,%%xmm6\n\t" \
51  "pshufd $00,%%xmm13,%%xmm13\n\t" \
52  /*xmm2=t2''=t1'-t2'*/ \
53  "psubw %%xmm5,%%xmm2\n\t" \
54  "pxor %%xmm12,%%xmm12\n\t" \
55  /*xmm3=t3''=t0'-t3'*/ \
56  "psubw %%xmm4,%%xmm3\n\t" \
57  "psubw %%xmm14,%%xmm12\n\t" \
58  /*xmm10=t5''=t6'-t5'*/ \
59  "psubw %%xmm9,%%xmm10\n\t" \
60  "paddw %%xmm12,%%xmm12\n\t" \
61  /*xmm4=t0''=t0'+t3'*/ \
62  "paddw %%xmm0,%%xmm4\n\t" \
63  /*xmm1=t1''=t1'+t2'*/ \
64  "paddw %%xmm5,%%xmm1\n\t" \
65  /*xmm6=t6''=t6'+t5'*/ \
66  "paddw %%xmm9,%%xmm6\n\t" \
67  /*xmm0,xmm5,xmm9 are now free.*/ \
68  /*Stage 3:*/ \
69  /*xmm10:xmm5=t5''*27146+0xB500 \
70    xmm0=t5''*/ \
71  "movdqa %%xmm10,%%xmm5\n\t" \
72  "movdqa %%xmm10,%%xmm0\n\t" \
73  "punpckhwd %%xmm12,%%xmm10\n\t" \
74  "pmaddwd %%xmm13,%%xmm10\n\t" \
75  "punpcklwd %%xmm12,%%xmm5\n\t" \
76  "pmaddwd %%xmm13,%%xmm5\n\t" \
77  /*xmm5=(t5''*27146+0xB500>>16)+t5''*/ \
78  "psrad $16,%%xmm10\n\t" \
79  "psrad $16,%%xmm5\n\t" \
80  "packssdw %%xmm10,%%xmm5\n\t" \
81  "paddw %%xmm0,%%xmm5\n\t" \
82  /*xmm0=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
83  "pcmpeqw %%xmm15,%%xmm0\n\t" \
84  "psubw %%xmm14,%%xmm0\n\t" \
85  "paddw %%xmm5,%%xmm0\n\t" \
86  "movdqa %%xmm8,%%xmm5\n\t" \
87  "psraw $1,%%xmm0\n\t" \
88  /*xmm5=t5'''=t4'-s*/ \
89  "psubw %%xmm0,%%xmm5\n\t" \
90  /*xmm8=t4''=t4'+s*/ \
91  "paddw %%xmm0,%%xmm8\n\t" \
92  /*xmm0,xmm7,xmm9,xmm10 are free.*/ \
93  /*xmm7:xmm9=t6''*27146+0xB500*/ \
94  "movdqa %%xmm6,%%xmm7\n\t" \
95  "movdqa %%xmm6,%%xmm9\n\t" \
96  "punpckhwd %%xmm12,%%xmm7\n\t" \
97  "pmaddwd %%xmm13,%%xmm7\n\t" \
98  "punpcklwd %%xmm12,%%xmm9\n\t" \
99  "pmaddwd %%xmm13,%%xmm9\n\t" \
100  /*xmm9=(t6''*27146+0xB500>>16)+t6''*/ \
101  "psrad $16,%%xmm7\n\t" \
102  "psrad $16,%%xmm9\n\t" \
103  "packssdw %%xmm7,%%xmm9\n\t" \
104  "paddw %%xmm6,%%xmm9\n\t" \
105  /*xmm9=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
106  "pcmpeqw %%xmm15,%%xmm6\n\t" \
107  "psubw %%xmm14,%%xmm6\n\t" \
108  "paddw %%xmm6,%%xmm9\n\t" \
109  "movdqa %%xmm11,%%xmm7\n\t" \
110  "psraw $1,%%xmm9\n\t" \
111  /*xmm7=t6'''=t7'-s*/ \
112  "psubw %%xmm9,%%xmm7\n\t" \
113  /*xmm9=t7''=t7'+s*/ \
114  "paddw %%xmm11,%%xmm9\n\t" \
115  /*xmm0,xmm6,xmm10,xmm11 are free.*/ \
116  /*Stage 4:*/ \
117  /*xmm10:xmm0=t1''*27146+0xB500*/ \
118  "movdqa %%xmm1,%%xmm0\n\t" \
119  "movdqa %%xmm1,%%xmm10\n\t" \
120  "punpcklwd %%xmm12,%%xmm0\n\t" \
121  "pmaddwd %%xmm13,%%xmm0\n\t" \
122  "punpckhwd %%xmm12,%%xmm10\n\t" \
123  "pmaddwd %%xmm13,%%xmm10\n\t" \
124  /*xmm0=(t1''*27146+0xB500>>16)+t1''*/ \
125  "psrad $16,%%xmm0\n\t" \
126  "psrad $16,%%xmm10\n\t" \
127  "mov $0x20006A0A,%[a]\n\t" \
128  "packssdw %%xmm10,%%xmm0\n\t" \
129  "movd %[a],%%xmm13\n\t" \
130  "paddw %%xmm1,%%xmm0\n\t" \
131  /*xmm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
132  "pcmpeqw %%xmm15,%%xmm1\n\t" \
133  "pshufd $00,%%xmm13,%%xmm13\n\t" \
134  "psubw %%xmm14,%%xmm1\n\t" \
135  "paddw %%xmm1,%%xmm0\n\t" \
136  /*xmm10:xmm4=t0''*27146+0x4000*/ \
137  "movdqa %%xmm4,%%xmm1\n\t" \
138  "movdqa %%xmm4,%%xmm10\n\t" \
139  "punpcklwd %%xmm12,%%xmm4\n\t" \
140  "pmaddwd %%xmm13,%%xmm4\n\t" \
141  "punpckhwd %%xmm12,%%xmm10\n\t" \
142  "pmaddwd %%xmm13,%%xmm10\n\t" \
143  /*xmm4=(t0''*27146+0x4000>>16)+t0''*/ \
144  "psrad $16,%%xmm4\n\t" \
145  "psrad $16,%%xmm10\n\t" \
146  "mov $0x6CB7,%[a]\n\t" \
147  "packssdw %%xmm10,%%xmm4\n\t" \
148  "movd %[a],%%xmm12\n\t" \
149  "paddw %%xmm1,%%xmm4\n\t" \
150  /*xmm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
151  "pcmpeqw %%xmm15,%%xmm1\n\t" \
152  "pshufd $00,%%xmm12,%%xmm12\n\t" \
153  "psubw %%xmm14,%%xmm1\n\t" \
154  "mov $0x7FFF6C84,%[a]\n\t" \
155  "paddw %%xmm1,%%xmm4\n\t" \
156  /*xmm0=_y[0]=u=r+s>>1 \
157    The naive implementation could cause overflow, so we use \
158     u=(r&s)+((r^s)>>1).*/ \
159  "movdqa %%xmm0,%%xmm6\n\t" \
160  "pxor %%xmm4,%%xmm0\n\t" \
161  "pand %%xmm4,%%xmm6\n\t" \
162  "psraw $1,%%xmm0\n\t" \
163  "movd %[a],%%xmm13\n\t" \
164  "paddw %%xmm6,%%xmm0\n\t" \
165  /*xmm4=_y[4]=v=r-u*/ \
166  "pshufd $00,%%xmm13,%%xmm13\n\t" \
167  "psubw %%xmm0,%%xmm4\n\t" \
168  /*xmm1,xmm6,xmm10,xmm11 are free.*/ \
169  /*xmm6:xmm10=60547*t3''+0x6CB7*/ \
170  "movdqa %%xmm3,%%xmm10\n\t" \
171  "movdqa %%xmm3,%%xmm6\n\t" \
172  "punpcklwd %%xmm3,%%xmm10\n\t" \
173  "pmaddwd %%xmm13,%%xmm10\n\t" \
174  "mov $0x61F861F8,%[a]\n\t" \
175  "punpckhwd %%xmm3,%%xmm6\n\t" \
176  "pmaddwd %%xmm13,%%xmm6\n\t" \
177  "movd %[a],%%xmm13\n\t" \
178  "paddd %%xmm12,%%xmm10\n\t" \
179  "pshufd $00,%%xmm13,%%xmm13\n\t" \
180  "paddd %%xmm12,%%xmm6\n\t" \
181  /*xmm1:xmm2=25080*t2'' \
182    xmm12=t2''*/ \
183  "movdqa %%xmm2,%%xmm11\n\t" \
184  "movdqa %%xmm2,%%xmm12\n\t" \
185  "pmullw %%xmm13,%%xmm2\n\t" \
186  "pmulhw %%xmm13,%%xmm11\n\t" \
187  "movdqa %%xmm2,%%xmm1\n\t" \
188  "punpcklwd %%xmm11,%%xmm2\n\t" \
189  "punpckhwd %%xmm11,%%xmm1\n\t" \
190  /*xmm10=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
191  "paddd %%xmm2,%%xmm10\n\t" \
192  "paddd %%xmm1,%%xmm6\n\t" \
193  "psrad $16,%%xmm10\n\t" \
194  "pcmpeqw %%xmm15,%%xmm3\n\t" \
195  "psrad $16,%%xmm6\n\t" \
196  "psubw %%xmm14,%%xmm3\n\t" \
197  "packssdw %%xmm6,%%xmm10\n\t" \
198  "paddw %%xmm3,%%xmm10\n\t" \
199  /*xmm2=_y[2]=u \
200    xmm10=s=(25080*u>>16)-t2''*/ \
201  "movdqa %%xmm10,%%xmm2\n\t" \
202  "pmulhw %%xmm13,%%xmm10\n\t" \
203  "psubw %%xmm12,%%xmm10\n\t" \
204  /*xmm1:xmm6=s*21600+0x2800*/ \
205  "pxor %%xmm12,%%xmm12\n\t" \
206  "psubw %%xmm14,%%xmm12\n\t" \
207  "mov $0x28005460,%[a]\n\t" \
208  "movd %[a],%%xmm13\n\t" \
209  "pshufd $00,%%xmm13,%%xmm13\n\t" \
210  "movdqa %%xmm10,%%xmm6\n\t" \
211  "movdqa %%xmm10,%%xmm1\n\t" \
212  "punpcklwd %%xmm12,%%xmm6\n\t" \
213  "pmaddwd %%xmm13,%%xmm6\n\t" \
214  "mov $0x0E3D,%[a]\n\t" \
215  "punpckhwd %%xmm12,%%xmm1\n\t" \
216  "pmaddwd %%xmm13,%%xmm1\n\t" \
217  /*xmm6=(s*21600+0x2800>>18)+s*/ \
218  "psrad $18,%%xmm6\n\t" \
219  "psrad $18,%%xmm1\n\t" \
220  "movd %[a],%%xmm12\n\t" \
221  "packssdw %%xmm1,%%xmm6\n\t" \
222  "pshufd $00,%%xmm12,%%xmm12\n\t" \
223  "paddw %%xmm10,%%xmm6\n\t" \
224  /*xmm6=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
225  "mov $0x7FFF54DC,%[a]\n\t" \
226  "pcmpeqw %%xmm15,%%xmm10\n\t" \
227  "movd %[a],%%xmm13\n\t" \
228  "psubw %%xmm14,%%xmm10\n\t" \
229  "pshufd $00,%%xmm13,%%xmm13\n\t" \
230  "paddw %%xmm10,%%xmm6\n\t " \
231  /*xmm1,xmm3,xmm10,xmm11 are free.*/ \
232  /*xmm11:xmm10=54491*t5'''+0x0E3D*/ \
233  "movdqa %%xmm5,%%xmm10\n\t" \
234  "movdqa %%xmm5,%%xmm11\n\t" \
235  "punpcklwd %%xmm5,%%xmm10\n\t" \
236  "pmaddwd %%xmm13,%%xmm10\n\t" \
237  "mov $0x8E3A8E3A,%[a]\n\t" \
238  "punpckhwd %%xmm5,%%xmm11\n\t" \
239  "pmaddwd %%xmm13,%%xmm11\n\t" \
240  "movd %[a],%%xmm13\n\t" \
241  "paddd %%xmm12,%%xmm10\n\t" \
242  "pshufd $00,%%xmm13,%%xmm13\n\t" \
243  "paddd %%xmm12,%%xmm11\n\t" \
244  /*xmm7:xmm12=36410*t6''' \
245    xmm1=t6'''*/ \
246  "movdqa %%xmm7,%%xmm3\n\t" \
247  "movdqa %%xmm7,%%xmm1\n\t" \
248  "pmulhw %%xmm13,%%xmm3\n\t" \
249  "pmullw %%xmm13,%%xmm7\n\t" \
250  "paddw %%xmm1,%%xmm3\n\t" \
251  "movdqa %%xmm7,%%xmm12\n\t" \
252  "punpckhwd %%xmm3,%%xmm7\n\t" \
253  "punpcklwd %%xmm3,%%xmm12\n\t" \
254  /*xmm10=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
255  "paddd %%xmm12,%%xmm10\n\t" \
256  "paddd %%xmm7,%%xmm11\n\t" \
257  "psrad $16,%%xmm10\n\t" \
258  "pcmpeqw %%xmm15,%%xmm5\n\t" \
259  "psrad $16,%%xmm11\n\t" \
260  "psubw %%xmm14,%%xmm5\n\t" \
261  "packssdw %%xmm11,%%xmm10\n\t" \
262  "pxor %%xmm12,%%xmm12\n\t" \
263  "paddw %%xmm5,%%xmm10\n\t" \
264  /*xmm5=_y[5]=u \
265    xmm1=s=t6'''-(36410*u>>16)*/ \
266  "psubw %%xmm14,%%xmm12\n\t" \
267  "movdqa %%xmm10,%%xmm5\n\t" \
268  "mov $0x340067C8,%[a]\n\t" \
269  "pmulhw %%xmm13,%%xmm10\n\t" \
270  "movd %[a],%%xmm13\n\t" \
271  "paddw %%xmm5,%%xmm10\n\t" \
272  "pshufd $00,%%xmm13,%%xmm13\n\t" \
273  "psubw %%xmm10,%%xmm1\n\t" \
274  /*xmm11:xmm3=s*26568+0x3400*/ \
275  "movdqa %%xmm1,%%xmm3\n\t" \
276  "movdqa %%xmm1,%%xmm11\n\t" \
277  "punpcklwd %%xmm12,%%xmm3\n\t" \
278  "pmaddwd %%xmm13,%%xmm3\n\t" \
279  "mov $0x7B1B,%[a]\n\t" \
280  "punpckhwd %%xmm12,%%xmm11\n\t" \
281  "pmaddwd %%xmm13,%%xmm11\n\t" \
282  /*xmm3=(s*26568+0x3400>>17)+s*/ \
283  "psrad $17,%%xmm3\n\t" \
284  "psrad $17,%%xmm11\n\t" \
285  "movd %[a],%%xmm12\n\t" \
286  "packssdw %%xmm11,%%xmm3\n\t" \
287  "pshufd $00,%%xmm12,%%xmm12\n\t" \
288  "paddw %%xmm1,%%xmm3\n\t" \
289  /*xmm3=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
290  "mov $0x7FFF7B16,%[a]\n\t" \
291  "pcmpeqw %%xmm15,%%xmm1\n\t" \
292  "movd %[a],%%xmm13\n\t" \
293  "psubw %%xmm14,%%xmm1\n\t" \
294  "pshufd $00,%%xmm13,%%xmm13\n\t" \
295  "paddw %%xmm1,%%xmm3\n\t " \
296  /*xmm1,xmm7,xmm10,xmm11 are free.*/ \
297  /*xmm11:xmm10=64277*t7''+0x7B1B*/ \
298  "movdqa %%xmm9,%%xmm10\n\t" \
299  "movdqa %%xmm9,%%xmm11\n\t" \
300  "punpcklwd %%xmm9,%%xmm10\n\t" \
301  "pmaddwd %%xmm13,%%xmm10\n\t" \
302  "mov $0x31F131F1,%[a]\n\t" \
303  "punpckhwd %%xmm9,%%xmm11\n\t" \
304  "pmaddwd %%xmm13,%%xmm11\n\t" \
305  "movd %[a],%%xmm13\n\t" \
306  "paddd %%xmm12,%%xmm10\n\t" \
307  "pshufd $00,%%xmm13,%%xmm13\n\t" \
308  "paddd %%xmm12,%%xmm11\n\t" \
309  /*xmm12:xmm7=12785*t4''*/ \
310  "movdqa %%xmm8,%%xmm7\n\t" \
311  "movdqa %%xmm8,%%xmm1\n\t" \
312  "pmullw %%xmm13,%%xmm7\n\t" \
313  "pmulhw %%xmm13,%%xmm1\n\t" \
314  "movdqa %%xmm7,%%xmm12\n\t" \
315  "punpcklwd %%xmm1,%%xmm7\n\t" \
316  "punpckhwd %%xmm1,%%xmm12\n\t" \
317  /*xmm10=u=(12785*t4''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
318  "paddd %%xmm7,%%xmm10\n\t" \
319  "paddd %%xmm12,%%xmm11\n\t" \
320  "psrad $16,%%xmm10\n\t" \
321  "pcmpeqw %%xmm15,%%xmm9\n\t" \
322  "psrad $16,%%xmm11\n\t" \
323  "psubw %%xmm14,%%xmm9\n\t" \
324  "packssdw %%xmm11,%%xmm10\n\t" \
325  "pxor %%xmm12,%%xmm12\n\t" \
326  "paddw %%xmm9,%%xmm10\n\t" \
327  /*xmm1=_y[1]=u \
328    xmm10=s=(12785*u>>16)-t4''*/ \
329  "psubw %%xmm14,%%xmm12\n\t" \
330  "movdqa %%xmm10,%%xmm1\n\t" \
331  "mov $0x3000503B,%[a]\n\t" \
332  "pmulhw %%xmm13,%%xmm10\n\t" \
333  "movd %[a],%%xmm13\n\t" \
334  "psubw %%xmm8,%%xmm10\n\t" \
335  "pshufd $00,%%xmm13,%%xmm13\n\t" \
336  /*xmm8:xmm7=s*20539+0x3000*/ \
337  "movdqa %%xmm10,%%xmm7\n\t" \
338  "movdqa %%xmm10,%%xmm8\n\t" \
339  "punpcklwd %%xmm12,%%xmm7\n\t" \
340  "pmaddwd %%xmm13,%%xmm7\n\t" \
341  "punpckhwd %%xmm12,%%xmm8\n\t" \
342  "pmaddwd %%xmm13,%%xmm8\n\t" \
343  /*xmm7=(s*20539+0x3000>>20)+s*/ \
344  "psrad $20,%%xmm7\n\t" \
345  "psrad $20,%%xmm8\n\t" \
346  "packssdw %%xmm8,%%xmm7\n\t" \
347  "paddw %%xmm10,%%xmm7\n\t" \
348  /*xmm7=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
349  "pcmpeqw %%xmm15,%%xmm10\n\t" \
350  "psubw %%xmm14,%%xmm10\n\t" \
351  "paddw %%xmm10,%%xmm7\n\t " \
352
353 /*SSE2 implementation of the fDCT for x86-64 only.
354   Because of the 8 extra XMM registers on x86-64, this version can operate
355    without any temporary stack access at all.*/
356 void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
357   ptrdiff_t a;
358   __asm__ __volatile__(
359     /*Load the input.*/
360     "movdqa 0x00(%[x]),%%xmm0\n\t"
361     "movdqa 0x10(%[x]),%%xmm1\n\t"
362     "movdqa 0x20(%[x]),%%xmm2\n\t"
363     "movdqa 0x30(%[x]),%%xmm3\n\t"
364     "movdqa 0x40(%[x]),%%xmm4\n\t"
365     "movdqa 0x50(%[x]),%%xmm5\n\t"
366     "movdqa 0x60(%[x]),%%xmm6\n\t"
367     "movdqa 0x70(%[x]),%%xmm7\n\t"
368     /*Add two extra bits of working precision to improve accuracy; any more and
369        we could overflow.*/
370     /*We also add a few biases to correct for some systematic error that
371        remains in the full fDCT->iDCT round trip.*/
372     /*xmm15={0}x8*/
373     "pxor %%xmm15,%%xmm15\n\t"
374     /*xmm14={-1}x8*/
375     "pcmpeqb %%xmm14,%%xmm14\n\t"
376     "psllw $2,%%xmm0\n\t"
377     /*xmm8=xmm0*/
378     "movdqa %%xmm0,%%xmm8\n\t"
379     "psllw $2,%%xmm1\n\t"
380     /*xmm8={_x[7...0]==0}*/
381     "pcmpeqw %%xmm15,%%xmm8\n\t"
382     "psllw $2,%%xmm2\n\t"
383     /*xmm8={_x[7...0]!=0}*/
384     "psubw %%xmm14,%%xmm8\n\t"
385     "psllw $2,%%xmm3\n\t"
386     /*%[a]=1*/
387     "mov $1,%[a]\n\t"
388     /*xmm8={_x[6]!=0,0,_x[4]!=0,0,_x[2]!=0,0,_x[0]!=0,0}*/
389     "pslld $16,%%xmm8\n\t"
390     "psllw $2,%%xmm4\n\t"
391     /*xmm9={0,0,0,0,0,0,0,1}*/
392     "movd %[a],%%xmm9\n\t"
393     /*xmm8={0,0,_x[2]!=0,0,_x[0]!=0,0}*/
394     "pshufhw $0x00,%%xmm8,%%xmm8\n\t"
395     "psllw $2,%%xmm5\n\t"
396     /*%[a]={1}x2*/
397     "mov $0x10001,%[a]\n\t"
398     /*xmm8={0,0,0,0,0,0,0,_x[0]!=0}*/
399     "pshuflw $0x01,%%xmm8,%%xmm8\n\t"
400     "psllw $2,%%xmm6\n\t"
401     /*xmm10={0,0,0,0,0,0,1,1}*/
402     "movd %[a],%%xmm10\n\t"
403     /*xmm0=_x[7...0]+{0,0,0,0,0,0,0,_x[0]!=0}*/
404     "paddw %%xmm8,%%xmm0\n\t"
405     "psllw $2,%%xmm7\n\t"
406     /*xmm0=_x[7...0]+{0,0,0,0,0,0,1,(_x[0]!=0)+1}*/
407     "paddw %%xmm10,%%xmm0\n\t"
408     /*xmm1=_x[15...8]-{0,0,0,0,0,0,0,1}*/
409     "psubw %%xmm9,%%xmm1\n\t"
410     /*Transform columns.*/
411     OC_FDCT_8x8
412     /*Transform rows.*/
413     OC_TRANSPOSE_8x8
414     OC_FDCT_8x8
415     /*TODO: zig-zag ordering?*/
416     OC_TRANSPOSE_8x8
417     /*xmm14={-2,-2,-2,-2,-2,-2,-2,-2}*/
418     "paddw %%xmm14,%%xmm14\n\t"
419     "psubw %%xmm14,%%xmm0\n\t"
420     "psubw %%xmm14,%%xmm1\n\t"
421     "psraw $2,%%xmm0\n\t"
422     "psubw %%xmm14,%%xmm2\n\t"
423     "psraw $2,%%xmm1\n\t"
424     "psubw %%xmm14,%%xmm3\n\t"
425     "psraw $2,%%xmm2\n\t"
426     "psubw %%xmm14,%%xmm4\n\t"
427     "psraw $2,%%xmm3\n\t"
428     "psubw %%xmm14,%%xmm5\n\t"
429     "psraw $2,%%xmm4\n\t"
430     "psubw %%xmm14,%%xmm6\n\t"
431     "psraw $2,%%xmm5\n\t"
432     "psubw %%xmm14,%%xmm7\n\t"
433     "psraw $2,%%xmm6\n\t"
434     "psraw $2,%%xmm7\n\t"
435     /*Store the result.*/
436     "movdqa %%xmm0,0x00(%[y])\n\t"
437     "movdqa %%xmm1,0x10(%[y])\n\t"
438     "movdqa %%xmm2,0x20(%[y])\n\t"
439     "movdqa %%xmm3,0x30(%[y])\n\t"
440     "movdqa %%xmm4,0x40(%[y])\n\t"
441     "movdqa %%xmm5,0x50(%[y])\n\t"
442     "movdqa %%xmm6,0x60(%[y])\n\t"
443     "movdqa %%xmm7,0x70(%[y])\n\t"
444     :[a]"=&r"(a)
445     :[y]"r"(_y),[x]"r"(_x)
446     :"memory"
447   );
448 }
449 #endif