ce7e7800b2e0259d3a887a3a035a89cf15f63f72
[theora.git] / lib / fdct.c
1 /********************************************************************
2  *                                                                  *
3  * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4  * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7  *                                                                  *
8  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
9  * by the Xiph.Org Foundation http://www.xiph.org/                  *
10  *                                                                  *
11  ********************************************************************
12
13   function:
14   last mod: $Id$
15
16  ********************************************************************/
17 #include "encint.h"
18 #include "dct.h"
19
20
21
22 /*Performs a forward 8 point Type-II DCT transform.
23   The output is scaled by a factor of 2 from the orthonormal version of the
24    transform.
25   _y: The buffer to store the result in.
26       Data will be placed the first 8 entries (e.g., in a row of an 8x8 block).
27   _x: The input coefficients.
28       Every 8th entry is used (e.g., from a column of an 8x8 block).*/
29 static void oc_fdct8(ogg_int16_t _y[8],const ogg_int16_t *_x){
30   int t0;
31   int t1;
32   int t2;
33   int t3;
34   int t4;
35   int t5;
36   int t6;
37   int t7;
38   int r;
39   int s;
40   int u;
41   int v;
42   /*Stage 1:*/
43   /*0-7 butterfly.*/
44   t0=_x[0<<3]+(int)_x[7<<3];
45   t7=_x[0<<3]-(int)_x[7<<3];
46   /*1-6 butterfly.*/
47   t1=_x[1<<3]+(int)_x[6<<3];
48   t6=_x[1<<3]-(int)_x[6<<3];
49   /*2-5 butterfly.*/
50   t2=_x[2<<3]+(int)_x[5<<3];
51   t5=_x[2<<3]-(int)_x[5<<3];
52   /*3-4 butterfly.*/
53   t3=_x[3<<3]+(int)_x[4<<3];
54   t4=_x[3<<3]-(int)_x[4<<3];
55   /*Stage 2:*/
56   /*0-3 butterfly.*/
57   r=t0+t3;
58   t3=t0-t3;
59   t0=r;
60   /*1-2 butterfly.*/
61   r=t1+t2;
62   t2=t1-t2;
63   t1=r;
64   /*6-5 butterfly.*/
65   r=t6+t5;
66   t5=t6-t5;
67   t6=r;
68   /*Stages 3 and 4 are where all the approximation occurs.
69     These are chosen to be as close to an exact inverse of the approximations
70      made in the iDCT as possible, while still using mostly 16-bit arithmetic.
71     We use some 16x16->32 signed MACs, but those still commonly execute in 1
72      cycle on a 16-bit DSP.
73     For example, s=(27146*t5+0x4000>>16)+t5+(t5!=0) is an exact inverse of
74      t5=(OC_C4S4*s>>16).
75     That is, applying the latter to the output of the former will recover t5
76      exactly (over the valid input range of t5, -23171...23169).
77     We increase the rounding bias to 0xB500 in this particular case so that
78      errors inverting the subsequent butterfly are not one-sided (e.g., the
79      mean error is very close to zero).
80     The (t5!=0) term could be replaced simply by 1, but we want to send 0 to 0.
81     The fDCT of an all-zeros block will still not be zero, because of the
82      biases we added at the very beginning of the process, but it will be close
83      enough that it is guaranteed to round to zero.*/
84   /*Stage 3:*/
85   /*4-5 butterfly.*/
86   s=(27146*t5+0xB500>>16)+t5+(t5!=0)>>1;
87   r=t4+s;
88   t5=t4-s;
89   t4=r;
90   /*7-6 butterfly.*/
91   s=(27146*t6+0xB500>>16)+t6+(t6!=0)>>1;
92   r=t7+s;
93   t6=t7-s;
94   t7=r;
95   /*Stage 4:*/
96   /*0-1 butterfly.*/
97   r=(27146*t0+0x4000>>16)+t0+(t0!=0);
98   s=(27146*t1+0xB500>>16)+t1+(t1!=0);
99   u=r+s>>1;
100   v=r-u;
101   _y[0]=u;
102   _y[4]=v;
103   /*3-2 rotation by 6pi/16*/
104   u=(OC_C6S2*t2+OC_C2S6*t3+0x6CB7>>16)+(t3!=0);
105   s=(OC_C6S2*u>>16)-t2;
106   v=(s*21600+0x2800>>18)+s+(s!=0);
107   _y[2]=u;
108   _y[6]=v;
109   /*6-5 rotation by 3pi/16*/
110   u=(OC_C5S3*t6+OC_C3S5*t5+0x0E3D>>16)+(t5!=0);
111   s=t6-(OC_C5S3*u>>16);
112   v=(s*26568+0x3400>>17)+s+(s!=0);
113   _y[5]=u;
114   _y[3]=v;
115   /*7-4 rotation by 7pi/16*/
116   u=(OC_C7S1*t4+OC_C1S7*t7+0x7B1B>>16)+(t7!=0);
117   s=(OC_C7S1*u>>16)-t4;
118   v=(s*20539+0x3000>>20)+s+(s!=0);
119   _y[1]=u;
120   _y[7]=v;
121 }
122
123 /*Performs a forward 8x8 Type-II DCT transform.
124   The output is scaled by a factor of 4 relative to the orthonormal version
125    of the transform.
126   _y: The buffer to store the result in.
127       This may be the same as _x.
128   _x: The input coefficients. */
129 void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
130   const ogg_int16_t *in;
131   ogg_int16_t       *end;
132   ogg_int16_t       *out;
133   ogg_int16_t        w[64];
134   int                i;
135   /*Add two extra bits of working precision to improve accuracy; any more and
136      we could overflow.*/
137   for(i=0;i<64;i++)w[i]=_x[i]<<2;
138   /*These biases correct for some systematic error that remains in the full
139      fDCT->iDCT round trip.*/
140   w[0]+=(w[0]!=0)+1;
141   w[1]++;
142   w[8]--;
143   /*Transform columns of w into rows of _y.*/
144   for(in=w,out=_y,end=out+64;out<end;in++,out+=8)oc_fdct8(out,in);
145   /*Transform columns of _y into rows of w.*/
146   for(in=_y,out=w,end=out+64;out<end;in++,out+=8)oc_fdct8(out,in);
147   /*Round the result back to the external working precision (which is still
148      scaled by four relative to the orthogonal result).
149     TODO: We should just update the external working precision.*/
150   for(i=0;i<64;i++)_y[i]=w[i]+2>>2;
151 }
152
153
154
155 /*This does not seem to outperform simple LFE border padding before MC.
156   It yields higher PSNR, but much higher bitrate usage.*/
157 #if 0
158 typedef struct oc_extension_info oc_extension_info;
159
160
161
162 /*Information needed to pad boundary blocks.
163   We multiply each row/column by an extension matrix that fills in the padding
164    values as a linear combination of the active values, so that an equivalent
165    number of coefficients are forced to zero.
166   This costs at most 16 multiplies, the same as a 1-D fDCT itself, and as
167    little as 7 multiplies.
168   We compute the extension matrices for every possible shape in advance, as
169    there are only 35.
170   The coefficients for all matrices are stored in a single array to take
171    advantage of the overlap and repetitiveness of many of the shapes.
172   A similar technique is applied to the offsets into this array.
173   This reduces the required table storage by about 48%.
174   See tools/extgen.c for details.
175   We could conceivably do the same for all 256 possible shapes.*/
176 struct oc_extension_info{
177   /*The mask of the active pixels in the shape.*/
178   short                     mask;
179   /*The number of active pixels in the shape.*/
180   short                     na;
181   /*The extension matrix.
182     This is (8-na)xna*/
183   const ogg_int16_t *const *ext;
184   /*The pixel indices: na active pixels followed by 8-na padding pixels.*/
185   unsigned char             pi[8];
186   /*The coefficient indices: na unconstrained coefficients followed by 8-na
187      coefficients to be forced to zero.*/
188   unsigned char             ci[8];
189 };
190
191
192 /*The number of shapes we need.*/
193 #define OC_NSHAPES   (35)
194
195 static const ogg_int16_t OC_EXT_COEFFS[229]={
196   0x7FFF,0xE1F8,0x6903,0xAA79,0x5587,0x7FFF,0x1E08,0x7FFF,
197   0x5587,0xAA79,0x6903,0xE1F8,0x7FFF,0x0000,0x0000,0x0000,
198   0x7FFF,0x0000,0x0000,0x7FFF,0x8000,0x7FFF,0x0000,0x0000,
199   0x7FFF,0xE1F8,0x1E08,0xB0A7,0xAA1D,0x337C,0x7FFF,0x4345,
200   0x2267,0x4345,0x7FFF,0x337C,0xAA1D,0xB0A7,0x8A8C,0x4F59,
201   0x03B4,0xE2D6,0x7FFF,0x2CF3,0x7FFF,0xE2D6,0x03B4,0x4F59,
202   0x8A8C,0x1103,0x7AEF,0x5225,0xDF60,0xC288,0xDF60,0x5225,
203   0x7AEF,0x1103,0x668A,0xD6EE,0x3A16,0x0E6C,0xFA07,0x0E6C,
204   0x3A16,0xD6EE,0x668A,0x2A79,0x2402,0x980F,0x50F5,0x4882,
205   0x50F5,0x980F,0x2402,0x2A79,0xF976,0x2768,0x5F22,0x2768,
206   0xF976,0x1F91,0x76C1,0xE9AE,0x76C1,0x1F91,0x7FFF,0xD185,
207   0x0FC8,0xD185,0x7FFF,0x4F59,0x4345,0xED62,0x4345,0x4F59,
208   0xF574,0x5D99,0x2CF3,0x5D99,0xF574,0x5587,0x3505,0x30FC,
209   0xF482,0x953C,0xEAC4,0x7FFF,0x4F04,0x7FFF,0xEAC4,0x953C,
210   0xF482,0x30FC,0x4F04,0x273D,0xD8C3,0x273D,0x1E09,0x61F7,
211   0x1E09,0x273D,0xD8C3,0x273D,0x4F04,0x30FC,0xA57E,0x153C,
212   0x6AC4,0x3C7A,0x1E08,0x3C7A,0x6AC4,0x153C,0xA57E,0x7FFF,
213   0xA57E,0x5A82,0x6AC4,0x153C,0xC386,0xE1F8,0xC386,0x153C,
214   0x6AC4,0x5A82,0xD8C3,0x273D,0x7FFF,0xE1F7,0x7FFF,0x273D,
215   0xD8C3,0x4F04,0x30FC,0xD8C3,0x273D,0xD8C3,0x30FC,0x4F04,
216   0x1FC8,0x67AD,0x1853,0xE038,0x1853,0x67AD,0x1FC8,0x4546,
217   0xE038,0x1FC8,0x3ABA,0x1FC8,0xE038,0x4546,0x3505,0x5587,
218   0xF574,0xBC11,0x78F4,0x4AFB,0xE6F3,0x4E12,0x3C11,0xF8F4,
219   0x4AFB,0x3C7A,0xF88B,0x3C11,0x78F4,0xCAFB,0x7FFF,0x08CC,
220   0x070C,0x236D,0x5587,0x236D,0x070C,0xF88B,0x3C7A,0x4AFB,
221   0xF8F4,0x3C11,0x7FFF,0x153C,0xCAFB,0x153C,0x7FFF,0x1E08,
222   0xE1F8,0x7FFF,0x08CC,0x7FFF,0xCAFB,0x78F4,0x3C11,0x4E12,
223   0xE6F3,0x4AFB,0x78F4,0xBC11,0xFE3D,0x7FFF,0xFE3D,0x2F3A,
224   0x7FFF,0x2F3A,0x89BC,0x7FFF,0x89BC
225 };
226
227 static const ogg_int16_t *const OC_EXT_ROWS[96]={
228   OC_EXT_COEFFS+   0,OC_EXT_COEFFS+   0,OC_EXT_COEFFS+   0,OC_EXT_COEFFS+   0,
229   OC_EXT_COEFFS+   0,OC_EXT_COEFFS+   0,OC_EXT_COEFFS+   0,OC_EXT_COEFFS+   6,
230   OC_EXT_COEFFS+  27,OC_EXT_COEFFS+  38,OC_EXT_COEFFS+  43,OC_EXT_COEFFS+  32,
231   OC_EXT_COEFFS+  49,OC_EXT_COEFFS+  58,OC_EXT_COEFFS+  67,OC_EXT_COEFFS+  71,
232   OC_EXT_COEFFS+  62,OC_EXT_COEFFS+  53,OC_EXT_COEFFS+  12,OC_EXT_COEFFS+  15,
233   OC_EXT_COEFFS+  14,OC_EXT_COEFFS+  13,OC_EXT_COEFFS+  76,OC_EXT_COEFFS+  81,
234   OC_EXT_COEFFS+  86,OC_EXT_COEFFS+  91,OC_EXT_COEFFS+  96,OC_EXT_COEFFS+  98,
235   OC_EXT_COEFFS+  93,OC_EXT_COEFFS+  88,OC_EXT_COEFFS+  83,OC_EXT_COEFFS+  78,
236   OC_EXT_COEFFS+  12,OC_EXT_COEFFS+  15,OC_EXT_COEFFS+  15,OC_EXT_COEFFS+  12,
237   OC_EXT_COEFFS+  12,OC_EXT_COEFFS+  15,OC_EXT_COEFFS+  12,OC_EXT_COEFFS+  15,
238   OC_EXT_COEFFS+  15,OC_EXT_COEFFS+  12,OC_EXT_COEFFS+ 103,OC_EXT_COEFFS+ 108,
239   OC_EXT_COEFFS+ 126,OC_EXT_COEFFS+  16,OC_EXT_COEFFS+ 137,OC_EXT_COEFFS+ 141,
240   OC_EXT_COEFFS+  20,OC_EXT_COEFFS+ 130,OC_EXT_COEFFS+ 113,OC_EXT_COEFFS+ 116,
241   OC_EXT_COEFFS+ 146,OC_EXT_COEFFS+ 153,OC_EXT_COEFFS+ 160,OC_EXT_COEFFS+ 167,
242   OC_EXT_COEFFS+ 170,OC_EXT_COEFFS+ 163,OC_EXT_COEFFS+ 156,OC_EXT_COEFFS+ 149,
243   OC_EXT_COEFFS+ 119,OC_EXT_COEFFS+ 122,OC_EXT_COEFFS+ 174,OC_EXT_COEFFS+ 177,
244   OC_EXT_COEFFS+ 182,OC_EXT_COEFFS+ 187,OC_EXT_COEFFS+ 192,OC_EXT_COEFFS+ 197,
245   OC_EXT_COEFFS+ 202,OC_EXT_COEFFS+ 207,OC_EXT_COEFFS+ 210,OC_EXT_COEFFS+ 215,
246   OC_EXT_COEFFS+ 179,OC_EXT_COEFFS+ 189,OC_EXT_COEFFS+  24,OC_EXT_COEFFS+ 204,
247   OC_EXT_COEFFS+ 184,OC_EXT_COEFFS+ 194,OC_EXT_COEFFS+ 212,OC_EXT_COEFFS+ 199,
248   OC_EXT_COEFFS+ 217,OC_EXT_COEFFS+ 100,OC_EXT_COEFFS+ 134,OC_EXT_COEFFS+ 135,
249   OC_EXT_COEFFS+ 135,OC_EXT_COEFFS+  12,OC_EXT_COEFFS+  15,OC_EXT_COEFFS+ 134,
250   OC_EXT_COEFFS+ 134,OC_EXT_COEFFS+ 135,OC_EXT_COEFFS+ 220,OC_EXT_COEFFS+ 223,
251   OC_EXT_COEFFS+ 226,OC_EXT_COEFFS+ 227,OC_EXT_COEFFS+ 224,OC_EXT_COEFFS+ 221
252 };
253
254 static const oc_extension_info OC_EXTENSION_INFO[OC_NSHAPES]={
255   {0x7F,7,OC_EXT_ROWS+  0,{0,1,2,3,4,5,6,7},{0,1,2,4,5,6,7,3}},
256   {0xFE,7,OC_EXT_ROWS+  7,{1,2,3,4,5,6,7,0},{0,1,2,4,5,6,7,3}},
257   {0x3F,6,OC_EXT_ROWS+  8,{0,1,2,3,4,5,7,6},{0,1,3,4,6,7,5,2}},
258   {0xFC,6,OC_EXT_ROWS+ 10,{2,3,4,5,6,7,1,0},{0,1,3,4,6,7,5,2}},
259   {0x1F,5,OC_EXT_ROWS+ 12,{0,1,2,3,4,7,6,5},{0,2,3,5,7,6,4,1}},
260   {0xF8,5,OC_EXT_ROWS+ 15,{3,4,5,6,7,2,1,0},{0,2,3,5,7,6,4,1}},
261   {0x0F,4,OC_EXT_ROWS+ 18,{0,1,2,3,7,6,5,4},{0,2,4,6,7,5,3,1}},
262   {0xF0,4,OC_EXT_ROWS+ 18,{4,5,6,7,3,2,1,0},{0,2,4,6,7,5,3,1}},
263   {0x07,3,OC_EXT_ROWS+ 22,{0,1,2,7,6,5,4,3},{0,3,6,7,5,4,2,1}},
264   {0xE0,3,OC_EXT_ROWS+ 27,{5,6,7,4,3,2,1,0},{0,3,6,7,5,4,2,1}},
265   {0x03,2,OC_EXT_ROWS+ 32,{0,1,7,6,5,4,3,2},{0,4,7,6,5,3,2,1}},
266   {0xC0,2,OC_EXT_ROWS+ 32,{6,7,5,4,3,2,1,0},{0,4,7,6,5,3,2,1}},
267   {0x01,1,OC_EXT_ROWS+  0,{0,7,6,5,4,3,2,1},{0,7,6,5,4,3,2,1}},
268   {0x80,1,OC_EXT_ROWS+  0,{7,6,5,4,3,2,1,0},{0,7,6,5,4,3,2,1}},
269   {0x7E,6,OC_EXT_ROWS+ 42,{1,2,3,4,5,6,7,0},{0,1,2,5,6,7,4,3}},
270   {0x7C,5,OC_EXT_ROWS+ 44,{2,3,4,5,6,7,1,0},{0,1,4,5,7,6,3,2}},
271   {0x3E,5,OC_EXT_ROWS+ 47,{1,2,3,4,5,7,6,0},{0,1,4,5,7,6,3,2}},
272   {0x78,4,OC_EXT_ROWS+ 50,{3,4,5,6,7,2,1,0},{0,4,5,7,6,3,2,1}},
273   {0x3C,4,OC_EXT_ROWS+ 54,{2,3,4,5,7,6,1,0},{0,3,4,7,6,5,2,1}},
274   {0x1E,4,OC_EXT_ROWS+ 58,{1,2,3,4,7,6,5,0},{0,4,5,7,6,3,2,1}},
275   {0x70,3,OC_EXT_ROWS+ 62,{4,5,6,7,3,2,1,0},{0,5,7,6,4,3,2,1}},
276   {0x38,3,OC_EXT_ROWS+ 67,{3,4,5,7,6,2,1,0},{0,5,6,7,4,3,2,1}},
277   {0x1C,3,OC_EXT_ROWS+ 72,{2,3,4,7,6,5,1,0},{0,5,6,7,4,3,2,1}},
278   {0x0E,3,OC_EXT_ROWS+ 77,{1,2,3,7,6,5,4,0},{0,5,7,6,4,3,2,1}},
279   {0x60,2,OC_EXT_ROWS+ 82,{5,6,7,4,3,2,1,0},{0,2,7,6,5,4,3,1}},
280   {0x30,2,OC_EXT_ROWS+ 36,{4,5,7,6,3,2,1,0},{0,4,7,6,5,3,2,1}},
281   {0x18,2,OC_EXT_ROWS+ 90,{3,4,7,6,5,2,1,0},{0,1,7,6,5,4,3,2}},
282   {0x0C,2,OC_EXT_ROWS+ 34,{2,3,7,6,5,4,1,0},{0,4,7,6,5,3,2,1}},
283   {0x06,2,OC_EXT_ROWS+ 84,{1,2,7,6,5,4,3,0},{0,2,7,6,5,4,3,1}},
284   {0x40,1,OC_EXT_ROWS+  0,{6,7,5,4,3,2,1,0},{0,7,6,5,4,3,2,1}},
285   {0x20,1,OC_EXT_ROWS+  0,{5,7,6,4,3,2,1,0},{0,7,6,5,4,3,2,1}},
286   {0x10,1,OC_EXT_ROWS+  0,{4,7,6,5,3,2,1,0},{0,7,6,5,4,3,2,1}},
287   {0x08,1,OC_EXT_ROWS+  0,{3,7,6,5,4,2,1,0},{0,7,6,5,4,3,2,1}},
288   {0x04,1,OC_EXT_ROWS+  0,{2,7,6,5,4,3,1,0},{0,7,6,5,4,3,2,1}},
289   {0x02,1,OC_EXT_ROWS+  0,{1,7,6,5,4,3,2,0},{0,7,6,5,4,3,2,1}}
290 };
291
292
293
294 /*Pads a single column of a partial block and then performs a forward Type-II
295    DCT on the result.
296   The input is scaled by a factor of 4 and biased appropriately for the current
297    fDCT implementation.
298   The output is scaled by an additional factor of 2 from the orthonormal
299    version of the transform.
300   _y: The buffer to store the result in.
301       Data will be placed the first 8 entries (e.g., in a row of an 8x8 block).
302   _x: The input coefficients.
303       Every 8th entry is used (e.g., from a column of an 8x8 block).
304   _e: The extension information for the shape.*/
305 static void oc_fdct8_ext(ogg_int16_t _y[8],ogg_int16_t *_x,
306  const oc_extension_info *_e){
307   const unsigned char *pi;
308   int                  na;
309   na=_e->na;
310   pi=_e->pi;
311   if(na==1){
312     int ci;
313     /*While the branch below is still correct for shapes with na==1, we can
314        perform the entire transform with just 1 multiply in this case instead
315        of 23.*/
316     _y[0]=(ogg_int16_t)(OC_DIV2_16(OC_C4S4*(_x[pi[0]])));
317     for(ci=1;ci<8;ci++)_y[ci]=0;
318   }
319   else{
320     const ogg_int16_t *const *ext;
321     int                       zpi;
322     int                       api;
323     int                       nz;
324     /*First multiply by the extension matrix to compute the padding values.*/
325     nz=8-na;
326     ext=_e->ext;
327     for(zpi=0;zpi<nz;zpi++){
328       ogg_int32_t v;
329       v=0;
330       for(api=0;api<na;api++){
331         v+=ext[zpi][api]*(ogg_int32_t)(_x[pi[api]<<3]<<1);
332       }
333       _x[pi[na+zpi]<<3]=(ogg_int16_t)(v+0x8000>>16)+1>>1;
334     }
335     oc_fdct8(_y,_x);
336   }
337 }
338
339 /*Performs a forward 8x8 Type-II DCT transform on blocks which overlap the
340    border of the picture region.
341   This method ONLY works with rectangular regions.
342   _border: A description of which pixels are inside the border.
343   _y:      The buffer to store the result in.
344            This may be the same as _x.
345   _x:      The input pixel values.
346            Pixel values outside the border will be ignored.*/
347 void oc_fdct8x8_border(const oc_border_info *_border,
348  ogg_int16_t _y[64],const ogg_int16_t _x[64]){
349   ogg_int16_t             *in;
350   ogg_int16_t             *out;
351   ogg_int16_t              w[64];
352   ogg_int64_t              mask;
353   const oc_extension_info *cext;
354   const oc_extension_info *rext;
355   int                      cmask;
356   int                      rmask;
357   int                      ri;
358   int                      ci;
359   /*Identify the shapes of the non-zero rows and columns.*/
360   rmask=cmask=0;
361   mask=_border->mask;
362   for(ri=0;ri<8;ri++){
363     /*This aggregation is _only_ correct for rectangular masks.*/
364     cmask|=((mask&0xFF)!=0)<<ri;
365     rmask|=mask&0xFF;
366     mask>>=8;
367   }
368   /*Find the associated extension info for these shapes.*/
369   if(cmask==0xFF)cext=NULL;
370   else for(cext=OC_EXTENSION_INFO;cext->mask!=cmask;){
371     /*If we somehow can't find the shape, then just do an unpadded fDCT.
372       It won't be efficient, but it should still be correct.*/
373     if(++cext>=OC_EXTENSION_INFO+OC_NSHAPES){
374       oc_enc_fdct8x8_c(_y,_x);
375       return;
376     }
377   }
378   if(rmask==0xFF)rext=NULL;
379   else for(rext=OC_EXTENSION_INFO;rext->mask!=rmask;){
380     /*If we somehow can't find the shape, then just do an unpadded fDCT.
381       It won't be efficient, but it should still be correct.*/
382     if(++rext>=OC_EXTENSION_INFO+OC_NSHAPES){
383       oc_enc_fdct8x8_c(_y,_x);
384       return;
385     }
386   }
387   /*Add two extra bits of working precision to improve accuracy; any more and
388      we could overflow.*/
389   for(ci=0;ci<64;ci++)w[ci]=_x[ci]<<2;
390   /*These biases correct for some systematic error that remains in the full
391      fDCT->iDCT round trip.
392     We can safely add them before padding, since if these pixel values are
393      overwritten, we didn't care what they were anyway (and the unbiased values
394      will usually yield smaller DCT coefficient magnitudes).*/
395   w[0]+=(w[0]!=0)+1;
396   w[1]++;
397   w[8]--;
398   /*Transform the columns.
399     We can ignore zero columns without a problem.*/
400   in=w;
401   out=_y;
402   if(cext==NULL)for(ci=0;ci<8;ci++)oc_fdct8(out+(ci<<3),in+ci);
403   else for(ci=0;ci<8;ci++)if(rmask&(1<<ci))oc_fdct8_ext(out+(ci<<3),in+ci,cext);
404   /*Transform the rows.
405     We transform even rows that are supposedly zero, because rounding errors
406      may make them slightly non-zero, and this will give a more precise
407      reconstruction with very small quantizers.*/
408   in=_y;
409   out=w;
410   if(rext==NULL)for(ri=0;ri<8;ri++)oc_fdct8(out+(ri<<3),in+ri);
411   else for(ri=0;ri<8;ri++)oc_fdct8_ext(out+(ri<<3),in+ri,rext);
412   /*Round the result back to the external working precision (which is still
413      scaled by four relative to the orthogonal result).
414     TODO: We should just update the external working precision.*/
415   for(ci=0;ci<64;ci++)_y[ci]=w[ci]+2>>2;
416 }
417 #endif