Add a new libtheora_info example program.
[theora.git] / lib / c64x / c64xidct.c
1 /********************************************************************
2  *                                                                  *
3  * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4  * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7  *                                                                  *
8  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
9  * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
10  *                                                                  *
11  ********************************************************************
12
13   function:
14     last mod: $Id$
15
16  ********************************************************************/
17 #include <string.h>
18 #include "c64xint.h"
19 #include "dct.h"
20
/*Builds a word with _c shifted into the upper 16 bits and the low 16 bits of
   _c in the lower 16 bits, i.e., the same 16-bit pattern in both halves of a
   32-bit word.*/
#define OC_C64X_DUP16(_c) (((_c)<<16)|((_c)&0xFFFF))

/*Doubled copies of the iDCT constants OC_C1S7...OC_C7S1 (which are defined
   outside this file), for use as the constant operand of the packed 2x16-bit
   multiplies below.*/
#define OC_C1S7D OC_C64X_DUP16(OC_C1S7)
#define OC_C2S6D OC_C64X_DUP16(OC_C2S6)
#define OC_C3S5D OC_C64X_DUP16(OC_C3S5)
#define OC_C4S4D OC_C64X_DUP16(OC_C4S4)
#define OC_C5S3D OC_C64X_DUP16(OC_C5S3)
#define OC_C6S2D OC_C64X_DUP16(OC_C6S2)
#define OC_C7S1D OC_C64X_DUP16(OC_C7S1)
28
29 /*Various building blocks for the iDCT implementations.
30   These are done in macros instead of functions so that we can use all local
31    variables, which avoids leaving the compiler to try to sort out memory
32    reference dependencies.*/
33
/*Reads two consecutive 8-element rows, starting at _x, into x0...x7.
  The second row is taken from _x+8.
  Each 32-bit load fetches two adjacent 16-bit coefficients from one row, and
   _dpack2() regroups a load from each row so that, per the TI definition of
   that intrinsic, every xN holds column N of both rows as a pair of 16-bit
   lanes.
  x0...x7 must be int variables in the enclosing scope, and _x must satisfy the
   alignment required by _amem4_const().*/
#define OC_IDCT8x2_LOAD8(_x) \
  do{ \
    long long pk; \
    /*Columns 0 and 1.*/ \
    pk=_dpack2(_amem4_const((_x)+8),_amem4_const((_x)+0)); \
    x0=_loll(pk); \
    x1=_hill(pk); \
    /*Columns 2 and 3.*/ \
    pk=_dpack2(_amem4_const((_x)+10),_amem4_const((_x)+2)); \
    x2=_loll(pk); \
    x3=_hill(pk); \
    /*Columns 4 and 5.*/ \
    pk=_dpack2(_amem4_const((_x)+12),_amem4_const((_x)+4)); \
    x4=_loll(pk); \
    x5=_hill(pk); \
    /*Columns 6 and 7.*/ \
    pk=_dpack2(_amem4_const((_x)+14),_amem4_const((_x)+6)); \
    x6=_loll(pk); \
    x7=_hill(pk); \
  } \
  while(0)
52
/*Reads the first four columns of two consecutive rows, starting at _x, into
   x0...x3.
  The second row is taken from _x+8; columns 4...7 of either row are never
   touched.
  x0...x3 must be int variables in the enclosing scope.*/
#define OC_IDCT8x2_LOAD4(_x) \
  do{ \
    long long pk; \
    /*Columns 0 and 1.*/ \
    pk=_dpack2(_amem4_const((_x)+8),_amem4_const((_x)+0)); \
    x0=_loll(pk); \
    x1=_hill(pk); \
    /*Columns 2 and 3.*/ \
    pk=_dpack2(_amem4_const((_x)+10),_amem4_const((_x)+2)); \
    x2=_loll(pk); \
    x3=_hill(pk); \
  } \
  while(0)
66
/*Reads the first two columns of two consecutive rows, starting at _x, into
   x0 and x1.
  The second row is taken from _x+8.
  x0 and x1 must be int variables in the enclosing scope.*/
#define OC_IDCT8x2_LOAD2(_x) \
  do{ \
    long long pk; \
    pk=_dpack2(_amem4_const((_x)+8),_amem4_const((_x)+0)); \
    x0=_loll(pk); \
    x1=_hill(pk); \
  } \
  while(0)
76
/*Reads two adjacent columns, starting at _x, into x0 and x1.
  x0 gets the 32-bit word at _x (row 0) and x1 the word at _x+8 (row 1); no
   repacking is needed because each word already holds one entry from each of
   the two columns.*/
#define OC_IDCT8x2_LOAD2T(_x) \
  do{ \
    x0=_amem4_const((_x)+(0*8)); \
    x1=_amem4_const((_x)+(1*8)); \
  } \
  while(0)
84
/*Runs the first three stages of the 8-point iDCT on x0...x7, leaving the
   intermediate values in t0...t7.
  Every variable is an int in the enclosing scope that carries two 16-bit
   lanes, so two transforms proceed in parallel.
  A multiply by a doubled constant is done with _mpy2ll() and the upper halves
   of the two products are regrouped with _packh2().
  NOTE(review): the _add2() of the multiplicand that follows some of the
   multiplies presumably compensates for constants that do not fit in a signed
   16-bit lane; confirm against the constant definitions.*/
#define OC_IDCT8x2() \
  do{ \
    long long pk; \
    int       u; \
    int       v; \
    /*Stage 1: the even half (t0...t3) comes from x0, x4, x2 and x6.*/ \
    pk=_addsub2(x0,x4); \
    u=_hill(pk); \
    v=_loll(pk); \
    t0=_packh2(_mpyhus(OC_C4S4D,u),_mpyus(OC_C4S4D,u)); \
    t1=_packh2(_mpyhus(OC_C4S4D,v),_mpyus(OC_C4S4D,v)); \
    pk=_mpy2ll(OC_C6S2D,x2); \
    u=_packh2(_hill(pk),_loll(pk)); \
    pk=_mpy2ll(OC_C2S6D,x6); \
    v=_add2(_packh2(_hill(pk),_loll(pk)),x6); \
    t2=_sub2(u,v); \
    pk=_mpy2ll(OC_C2S6D,x2); \
    u=_add2(_packh2(_hill(pk),_loll(pk)),x2); \
    pk=_mpy2ll(OC_C6S2D,x6); \
    v=_packh2(_hill(pk),_loll(pk)); \
    t3=_add2(u,v); \
    /*The odd half (t4...t7) comes from x1, x7, x5 and x3.*/ \
    pk=_mpy2ll(OC_C7S1D,x1); \
    u=_packh2(_hill(pk),_loll(pk)); \
    pk=_mpy2ll(OC_C1S7D,x7); \
    v=_add2(_packh2(_hill(pk),_loll(pk)),x7); \
    t4=_sub2(u,v); \
    pk=_mpy2ll(OC_C3S5D,x5); \
    u=_add2(_packh2(_hill(pk),_loll(pk)),x5); \
    pk=_mpy2ll(OC_C5S3D,x3); \
    v=_add2(_packh2(_hill(pk),_loll(pk)),x3); \
    t5=_sub2(u,v); \
    pk=_mpy2ll(OC_C5S3D,x5); \
    u=_add2(_packh2(_hill(pk),_loll(pk)),x5); \
    pk=_mpy2ll(OC_C3S5D,x3); \
    v=_add2(_packh2(_hill(pk),_loll(pk)),x3); \
    t6=_add2(u,v); \
    pk=_mpy2ll(OC_C1S7D,x1); \
    u=_add2(_packh2(_hill(pk),_loll(pk)),x1); \
    pk=_mpy2ll(OC_C7S1D,x7); \
    v=_packh2(_hill(pk),_loll(pk)); \
    t7=_add2(u,v); \
    /*Stage 2: butterflies on the odd half; the low word of each _addsub2() \
       result is then scaled by C4S4.*/ \
    pk=_addsub2(t4,t5); \
    t4=_hill(pk); \
    v=_loll(pk); \
    pk=_mpy2ll(OC_C4S4D,v); \
    t5=_add2(_packh2(_hill(pk),_loll(pk)),v); \
    pk=_addsub2(t7,t6); \
    t7=_hill(pk); \
    v=_loll(pk); \
    pk=_mpy2ll(OC_C4S4D,v); \
    t6=_add2(_packh2(_hill(pk),_loll(pk)),v); \
    /*Stage 3: butterflies on (t0,t3), (t1,t2) and (t6,t5).*/ \
    pk=_addsub2(t0,t3); \
    t0=_hill(pk); \
    t3=_loll(pk); \
    pk=_addsub2(t1,t2); \
    t1=_hill(pk); \
    t2=_loll(pk); \
    pk=_addsub2(t6,t5); \
    t6=_hill(pk); \
    t5=_loll(pk); \
  } \
  while(0)
150
/*Runs the first three stages of the 8-point iDCT on x0...x3 only, leaving the
   intermediate values in t0...t7.
  This is the reduced form of OC_IDCT8x2() for inputs whose x4...x7 are all
   zero; those variables are never read.
  Every variable is an int in the enclosing scope that carries two 16-bit
   lanes.*/
#define OC_IDCT8x2_4() \
  do{ \
    long long pk; \
    int       u; \
    /*Stage 1: each term involving x4...x7 has been dropped, so every output \
       is a single scaled input.*/ \
    pk=_mpy2ll(OC_C4S4D,x0); \
    t0=_add2(_packh2(_hill(pk),_loll(pk)),x0); \
    t1=t0; \
    pk=_mpy2ll(OC_C6S2D,x2); \
    t2=_packh2(_hill(pk),_loll(pk)); \
    pk=_mpy2ll(OC_C2S6D,x2); \
    t3=_add2(_packh2(_hill(pk),_loll(pk)),x2); \
    pk=_mpy2ll(OC_C7S1D,x1); \
    t4=_packh2(_hill(pk),_loll(pk)); \
    pk=_mpy2ll(OC_C5S3D,x3); \
    t5=_add2(_packh2(_hill(pk),_loll(pk)),x3); \
    pk=_mpy2ll(OC_C3S5D,x3); \
    t6=_add2(_packh2(_hill(pk),_loll(pk)),x3); \
    pk=_mpy2ll(OC_C1S7D,x1); \
    t7=_add2(_packh2(_hill(pk),_loll(pk)),x1); \
    /*Stage 2: t5 above holds the x3 term with the opposite sign from \
       OC_IDCT8x2(), so the two words of the first _addsub2() result are \
       used the other way around here.*/ \
    pk=_addsub2(t4,t5); \
    t4=_loll(pk); \
    u=_hill(pk); \
    pk=_mpy2ll(OC_C4S4D,u); \
    t5=_add2(_packh2(_hill(pk),_loll(pk)),u); \
    pk=_addsub2(t7,t6); \
    t7=_hill(pk); \
    u=_loll(pk); \
    pk=_mpy2ll(OC_C4S4D,u); \
    t6=_add2(_packh2(_hill(pk),_loll(pk)),u); \
    /*Stage 3: butterflies on (t0,t3), (t1,t2) and (t6,t5).*/ \
    pk=_addsub2(t0,t3); \
    t0=_hill(pk); \
    t3=_loll(pk); \
    pk=_addsub2(t1,t2); \
    t1=_hill(pk); \
    t2=_loll(pk); \
    pk=_addsub2(t6,t5); \
    t6=_hill(pk); \
    t5=_loll(pk); \
  } \
  while(0)
195
/*Runs the first three stages of the 8-point iDCT on x0 and x1 only, leaving
   the intermediate values in t0...t7.
  This is the reduced form of OC_IDCT8x2() for inputs whose x2...x7 are all
   zero; those variables are never read.
  Every variable is an int in the enclosing scope that carries two 16-bit
   lanes.*/
#define OC_IDCT8x2_2() \
  do{ \
    long long pk; \
    /*Stage 1: only the x0 and x1 terms remain.*/ \
    pk=_mpy2ll(OC_C4S4D,x0); \
    t0=_add2(_packh2(_hill(pk),_loll(pk)),x0); \
    t1=t0; \
    pk=_mpy2ll(OC_C7S1D,x1); \
    t4=_packh2(_hill(pk),_loll(pk)); \
    pk=_mpy2ll(OC_C1S7D,x1); \
    t7=_add2(_packh2(_hill(pk),_loll(pk)),x1); \
    /*Stage 2: with the other odd terms absent the butterflies collapse, \
       leaving just the C4S4 scalings of t4 and t7.*/ \
    pk=_mpy2ll(OC_C4S4D,t4); \
    t5=_add2(_packh2(_hill(pk),_loll(pk)),t4); \
    pk=_mpy2ll(OC_C4S4D,t7); \
    t6=_add2(_packh2(_hill(pk),_loll(pk)),t7); \
    /*Stage 3: t2 and t3 are plain copies of t1 and t0; only (t6,t5) still \
       needs a butterfly.*/ \
    t3=t0; \
    t2=t1; \
    pk=_addsub2(t6,t5); \
    t6=_hill(pk); \
    t5=_loll(pk); \
  } \
  while(0)
221
/*Applies the final butterfly stage to t0...t7 and writes the results out as
   two consecutive 8-element rows starting at _y (the second row at _y+8).
  _dpack2() regroups the paired lanes so that each 32-bit store holds two
   adjacent columns of a single row.
  No rounding or scaling is applied.*/
#define OC_IDCT8x2_STORE(_y) \
  do{ \
    long long pk; \
    int       s0; \
    int       s1; \
    int       d0; \
    int       d1; \
    /*Stage 4: the high word of each _addsub2() result is stored towards \
       the start of the row and the low word towards the end.*/ \
    pk=_addsub2(t0,t7); \
    s0=_hill(pk); \
    d0=_loll(pk); \
    pk=_addsub2(t1,t6); \
    s1=_hill(pk); \
    d1=_loll(pk); \
    /*Columns 0 and 1 of both rows.*/ \
    pk=_dpack2(s1,s0); \
    _amem4((_y)+0)=_loll(pk); \
    _amem4((_y)+8)=_hill(pk); \
    /*Columns 6 and 7 of both rows.*/ \
    pk=_dpack2(d0,d1); \
    _amem4((_y)+6)=_loll(pk); \
    _amem4((_y)+14)=_hill(pk); \
    pk=_addsub2(t2,t5); \
    s0=_hill(pk); \
    d0=_loll(pk); \
    pk=_addsub2(t3,t4); \
    s1=_hill(pk); \
    d1=_loll(pk); \
    /*Columns 2 and 3 of both rows.*/ \
    pk=_dpack2(s1,s0); \
    _amem4((_y)+2)=_loll(pk); \
    _amem4((_y)+10)=_hill(pk); \
    /*Columns 4 and 5 of both rows.*/ \
    pk=_dpack2(d0,d1); \
    _amem4((_y)+4)=_loll(pk); \
    _amem4((_y)+12)=_hill(pk); \
  } \
  while(0)
257
/*Applies the final butterfly stage to t0...t7 and writes the results out
   transposed, as two adjacent columns starting at _y.
  Output k of the transform goes to _y+k*8 as one 32-bit word holding both
   lanes, so no repacking is required.
  No rounding or scaling is applied.*/
#define OC_IDCT8x2_STORET(_y) \
  do{ \
    long long pk; \
    /*Stage 4: the high word of each _addsub2() result goes to row k and \
       the low word to row 7-k.*/ \
    pk=_addsub2(t0,t7); \
    _amem4((_y)+(0*8))=_hill(pk); \
    _amem4((_y)+(7*8))=_loll(pk); \
    pk=_addsub2(t1,t6); \
    _amem4((_y)+(1*8))=_hill(pk); \
    _amem4((_y)+(6*8))=_loll(pk); \
    pk=_addsub2(t2,t5); \
    _amem4((_y)+(2*8))=_hill(pk); \
    _amem4((_y)+(5*8))=_loll(pk); \
    pk=_addsub2(t3,t4); \
    _amem4((_y)+(3*8))=_hill(pk); \
    _amem4((_y)+(4*8))=_loll(pk); \
  } \
  while(0)
277
/*Applies the final butterfly stage to t0...t7, removes the transform's scale
   factor, and writes the results out transposed, as two adjacent columns
   starting at _y.
  Each result has 8 added to both 16-bit lanes and is then shifted right by 4
   with _shr2() before being stored at _y+k*8.*/
#define OC_IDCT8x2_ROUND_STORET(_y) \
  do{ \
    long long pk; \
    /*The rounding offset, 8, replicated in both 16-bit lanes.*/ \
    const int bias=0x00080008; \
    /*Stage 4: the high word of each _addsub2() result goes to row k and \
       the low word to row 7-k.*/ \
    pk=_addsub2(t0,t7); \
    _amem4((_y)+(0*8))=_shr2(_add2(_hill(pk),bias),4); \
    _amem4((_y)+(7*8))=_shr2(_add2(_loll(pk),bias),4); \
    pk=_addsub2(t1,t6); \
    _amem4((_y)+(1*8))=_shr2(_add2(_hill(pk),bias),4); \
    _amem4((_y)+(6*8))=_shr2(_add2(_loll(pk),bias),4); \
    pk=_addsub2(t2,t5); \
    _amem4((_y)+(2*8))=_shr2(_add2(_hill(pk),bias),4); \
    _amem4((_y)+(5*8))=_shr2(_add2(_loll(pk),bias),4); \
    pk=_addsub2(t3,t4); \
    _amem4((_y)+(3*8))=_shr2(_add2(_hill(pk),bias),4); \
    _amem4((_y)+(4*8))=_shr2(_add2(_loll(pk),bias),4); \
  } \
  while(0)
298
299 /*196 cycles.*/
300 static void oc_idct8x8_slow_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64]){
301   ogg_int16_t w[64];
302   int         x0;
303   int         x1;
304   int         x2;
305   int         x3;
306   int         x4;
307   int         x5;
308   int         x6;
309   int         x7;
310   int         t0;
311   int         t1;
312   int         t2;
313   int         t3;
314   int         t4;
315   int         t5;
316   int         t6;
317   int         t7;
318   int         i;
319   /*Transform rows of x into columns of w.*/
320   for(i=0;i<8;i+=2){
321     OC_IDCT8x2_LOAD8(_x+i*8);
322     if(_x!=_y){
323       _amem8(_x+i*8)=0LL;
324       _amem8(_x+i*8+4)=0LL;
325       _amem8(_x+i*8+8)=0LL;
326       _amem8(_x+i*8+12)=0LL;
327     }
328     OC_IDCT8x2();
329     OC_IDCT8x2_STORET(w+i);
330   }
331   /*Transform rows of w into columns of y.*/
332   for(i=0;i<8;i+=2){
333     OC_IDCT8x2_LOAD8(w+i*8);
334     OC_IDCT8x2();
335     OC_IDCT8x2_ROUND_STORET(_y+i);
336   }
337 }
338
339 /*106 cycles.*/
340 static void oc_idct8x8_10_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64]){
341   ogg_int16_t w[64];
342   int         t0;
343   int         t1;
344   int         t2;
345   int         t3;
346   int         t4;
347   int         t5;
348   int         t6;
349   int         t7;
350   int         x0;
351   int         x1;
352   int         x2;
353   int         x3;
354   int         i;
355   /*Transform rows of x into columns of w.*/
356   OC_IDCT8x2_LOAD4(_x);
357   OC_IDCT8x2_4();
358   OC_IDCT8x2_STORET(w);
359   OC_IDCT8x2_LOAD2(_x+16);
360   if(_x!=_y){
361     _amem8(_x)=0LL;
362     _amem8(_x+8)=0LL;
363     _amem4(_x+16)=0;
364     _amem4(_x+24)=0;
365   }
366   OC_IDCT8x2_2();
367   OC_IDCT8x2_STORET(w+2);
368   /*Transform rows of w into columns of y.*/
369   for(i=0;i<8;i+=2){
370     OC_IDCT8x2_LOAD4(w+i*8);
371     OC_IDCT8x2_4();
372     OC_IDCT8x2_ROUND_STORET(_y+i);
373   }
374 }
375
#if 0
/*Disabled: per the original author, this once compiled to something faster
   (88 cycles) but no longer does, for reasons that were never identified.
  It is barely an advantage over the 10-coefficient version, and is now hardly
   worth the icache space.*/
/*Reduced 8x8 inverse DCT that reads only the first two columns of rows 0 and
   1 of _x.
  _y: The buffer that receives the 64 output values.
  _x: The 64 input coefficients.
      Unless _x and _y are the same buffer, the coefficients that were read are
       cleared to zero.
  The original author's measurement for this routine was 95 cycles.*/
static inline void oc_idct8x8_3_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64]){
  ogg_int16_t w[64];
  int         t0,t1,t2,t3,t4,t5,t6,t7;
  int         x0,x1;
  int         j;
  /*First pass: rows 0 and 1 of x become rows 0 and 1 of w (not transposed).*/
  OC_IDCT8x2_LOAD2(_x);
  OC_IDCT8x2_2();
  OC_IDCT8x2_STORE(w);
  if(_x!=_y){
    _amem4(_x)=0;
    _amem4(_x+8)=0;
  }
  /*Second pass: each pair of columns of w becomes a pair of columns of y.*/
  for(j=0;j<4;j++){
    OC_IDCT8x2_LOAD2T(w+j*2);
    OC_IDCT8x2_2();
    OC_IDCT8x2_ROUND_STORET(_y+j*2);
  }
}
#endif
413
414 /*Performs an inverse 8x8 Type-II DCT transform.
415   The input is assumed to be scaled by a factor of 4 relative to orthonormal
416    version of the transform.*/
417 void oc_idct8x8_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
418   /*if(_last_zzi<=3)oc_idct8x8_3_c64x(_y,_x);
419   else*/ if(_last_zzi<=10)oc_idct8x8_10_c64x(_y,_x);
420   else oc_idct8x8_slow_c64x(_y,_x);
421 }