1 


/* 
2 


* This file is part of the Independent JPEG Group's software. 
3 


* 
4 


* The authors make NO WARRANTY or representation, either express or implied, 
5 


* with respect to this software, its quality, accuracy, merchantability, or 
6 


* fitness for a particular purpose. This software is provided "AS IS", and 
7 


* you, its user, assume the entire risk as to its quality and accuracy. 
8 


* 
9 


* This software is copyright (C) 19941996, Thomas G. Lane. 
10 


* All Rights Reserved except as specified below. 
11 


* 
12 


* Permission is hereby granted to use, copy, modify, and distribute this 
13 


* software (or portions thereof) for any purpose, without fee, subject to 
14 


* these conditions: 
15 


* (1) If any part of the source code for this software is distributed, then 
16 


* this README file must be included, with this copyright and nowarranty 
17 


* notice unaltered; and any additions, deletions, or changes to the original 
18 


* files must be clearly indicated in accompanying documentation. 
19 


* (2) If only executable code is distributed, then the accompanying 
20 


* documentation must state that "this software is based in part on the work 
21 


* of the Independent JPEG Group". 
22 


* (3) Permission for use of this software is granted only if the user accepts 
23 


* full responsibility for any undesirable consequences; the authors accept 
24 


* NO LIABILITY for damages of any kind. 
25 


* 
26 


* These conditions apply to any software derived from or based on the IJG 
27 


* code, not just to the unmodified library. If you use our work, you ought 
28 


* to acknowledge us. 
29 


* 
30 


* Permission is NOT granted for the use of any IJG author's name or company 
31 


* name in advertising or publicity relating to this software or products 
32 


* derived from it. This software may be referred to only as "the Independent 
33 


* JPEG Group's software". 
34 


* 
35 


* We specifically permit and encourage the use of this software as the basis 
36 


* of commercial products, provided that all warranty or liability claims are 
37 


* assumed by the product vendor. 
38 


* 
39 


* This file contains a fast, not so accurate integer implementation of the 
40 


* forward DCT (Discrete Cosine Transform). 
41 


* 
42 


* A 2D DCT can be done by 1D DCT on each row followed by 1D DCT 
43 


* on each column. Direct algorithms are also available, but they are 
44 


* much more complex and seem not to be any faster when reduced to code. 
45 


* 
46 


* This implementation is based on Arai, Agui, and Nakajima's algorithm for 
47 


* scaled DCT. Their original paper (Trans. IEICE E71(11):1095) is in 
48 


* Japanese, but the algorithm is described in the Pennebaker & Mitchell 
49 


* JPEG textbook (see REFERENCES section in file README). The following code 
50 


* is based directly on figure 48 in P&M. 
51 


* While an 8point DCT cannot be done in less than 11 multiplies, it is 
52 


* possible to arrange the computation so that many of the multiplies are 
53 


* simple scalings of the final outputs. These multiplies can then be 
54 


* folded into the multiplications or divisions by the JPEG quantization 
55 


* table entries. The AA&N method leaves only 5 multiplies and 29 adds 
56 


* to be done in the DCT itself. 
57 


* The primary disadvantage of this method is that with fixedpoint math, 
58 


* accuracy is lost due to imprecise representation of the scaled 
59 


* quantization values. The smaller the quantization table entry, the less 
60 


* precise the scaled value, so this implementation does worse with high 
61 


* qualitysetting files than with lowquality ones. 
62 


*/ 
63 



64 


/** 
65 


* @file 
66 


* Independent JPEG Group's fast AAN dct. 
67 


*/ 
68 



69 


#include <stdlib.h> 
70 


#include <stdio.h> 
71 


#include "libavutil/common.h" 
72 


#include "dct.h" 
73 



74 


#define DCTSIZE 8 
75 


#define GLOBAL(x) x 
76 


#define RIGHT_SHIFT(x, n) ((x) >> (n)) 
77 



78 


/* 
79 


* This module is specialized to the case DCTSIZE = 8. 
80 


*/ 
81 



82 


#if DCTSIZE != 8 
83 


Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ 
84 


#endif 
85 



86 



87 


/* Scaling decisions are generally the same as in the LL&M algorithm; 
88 


* see jfdctint.c for more details. However, we choose to descale 
89 


* (right shift) multiplication products as soon as they are formed, 
90 


* rather than carrying additional fractional bits into subsequent additions. 
91 


* This compromises accuracy slightly, but it lets us save a few shifts. 
92 


* More importantly, 16bit arithmetic is then adequate (for 8bit samples) 
93 


* everywhere except in the multiplications proper; this saves a good deal 
94 


* of work on 16bitint machines. 
95 


* 
96 


* Again to save a few shifts, the intermediate results between pass 1 and 
97 


* pass 2 are not upscaled, but are represented only to integral precision. 
98 


* 
99 


* A final compromise is to represent the multiplicative constants to only 
100 


* 8 fractional bits, rather than 13. This saves some shifting work on some 
101 


* machines, and may also reduce the cost of multiplication (since there 
102 


* are fewer onebits in the constants). 
103 


*/ 
104 



105 


#define CONST_BITS 8 
106 



107 



108 


/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus 
109 


* causing a lot of useless floatingpoint operations at run time. 
110 


* To get around this we use the following precalculated constants. 
111 


* If you change CONST_BITS you may want to add appropriate values. 
112 


* (With a reasonable C compiler, you can just rely on the FIX() macro...) 
113 


*/ 
114 



115 


#if CONST_BITS == 8 
116 


#define FIX_0_382683433 ((int32_t) 98) /* FIX(0.382683433) */ 
117 


#define FIX_0_541196100 ((int32_t) 139) /* FIX(0.541196100) */ 
118 


#define FIX_0_707106781 ((int32_t) 181) /* FIX(0.707106781) */ 
119 


#define FIX_1_306562965 ((int32_t) 334) /* FIX(1.306562965) */ 
120 


#else 
121 


#define FIX_0_382683433 FIX(0.382683433) 
122 


#define FIX_0_541196100 FIX(0.541196100) 
123 


#define FIX_0_707106781 FIX(0.707106781) 
124 


#define FIX_1_306562965 FIX(1.306562965) 
125 


#endif 
126 



127 



128 


/* We can gain a little more speed, with a further compromise in accuracy, 
129 


* by omitting the addition in a descaling shift. This yields an incorrectly 
130 


* rounded result half the time... 
131 


*/ 
132 



133 


#ifndef USE_ACCURATE_ROUNDING 
134 


#undef DESCALE 
135 


#define DESCALE(x,n) RIGHT_SHIFT(x, n) 
136 


#endif 
137 



138 



139 


/* Multiply a int16_t variable by an int32_t constant, and immediately 
140 


* descale to yield a int16_t result. 
141 


*/ 
142 



143 


#define MULTIPLY(var,const) ((int16_t) DESCALE((var) * (const), CONST_BITS)) 
144 



145 

99700014 
static av_always_inline void row_fdct(int16_t * data){ 
146 


int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 
147 


int tmp10, tmp11, tmp12, tmp13; 
148 


int z1, z2, z3, z4, z5, z11, z13; 
149 


int16_t *dataptr; 
150 


int ctr; 
151 



152 


/* Pass 1: process rows. */ 
153 



154 

99700014 
dataptr = data; 
155 
✓✓ 
897300126 
for (ctr = DCTSIZE1; ctr >= 0; ctr) { 
156 

797600112 
tmp0 = dataptr[0] + dataptr[7]; 
157 

797600112 
tmp7 = dataptr[0]  dataptr[7]; 
158 

797600112 
tmp1 = dataptr[1] + dataptr[6]; 
159 

797600112 
tmp6 = dataptr[1]  dataptr[6]; 
160 

797600112 
tmp2 = dataptr[2] + dataptr[5]; 
161 

797600112 
tmp5 = dataptr[2]  dataptr[5]; 
162 

797600112 
tmp3 = dataptr[3] + dataptr[4]; 
163 

797600112 
tmp4 = dataptr[3]  dataptr[4]; 
164 



165 


/* Even part */ 
166 



167 

797600112 
tmp10 = tmp0 + tmp3; /* phase 2 */ 
168 

797600112 
tmp13 = tmp0  tmp3; 
169 

797600112 
tmp11 = tmp1 + tmp2; 
170 

797600112 
tmp12 = tmp1  tmp2; 
171 



172 

797600112 
dataptr[0] = tmp10 + tmp11; /* phase 3 */ 
173 

797600112 
dataptr[4] = tmp10  tmp11; 
174 



175 

797600112 
z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ 
176 

797600112 
dataptr[2] = tmp13 + z1; /* phase 5 */ 
177 

797600112 
dataptr[6] = tmp13  z1; 
178 



179 


/* Odd part */ 
180 



181 

797600112 
tmp10 = tmp4 + tmp5; /* phase 2 */ 
182 

797600112 
tmp11 = tmp5 + tmp6; 
183 

797600112 
tmp12 = tmp6 + tmp7; 
184 



185 


/* The rotator is modified from fig 48 to avoid extra negations. */ 
186 

797600112 
z5 = MULTIPLY(tmp10  tmp12, FIX_0_382683433); /* c6 */ 
187 

797600112 
z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2c6 */ 
188 

797600112 
z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */ 
189 

797600112 
z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */ 
190 



191 

797600112 
z11 = tmp7 + z3; /* phase 5 */ 
192 

797600112 
z13 = tmp7  z3; 
193 



194 

797600112 
dataptr[5] = z13 + z2; /* phase 6 */ 
195 

797600112 
dataptr[3] = z13  z2; 
196 

797600112 
dataptr[1] = z11 + z4; 
197 

797600112 
dataptr[7] = z11  z4; 
198 



199 

797600112 
dataptr += DCTSIZE; /* advance pointer to next row */ 
200 


} 
201 

99700014 
} 
202 



203 


/* 
204 


* Perform the forward DCT on one block of samples. 
205 


*/ 
206 



207 


GLOBAL(void) 
208 

99700014 
ff_fdct_ifast (int16_t * data) 
209 


{ 
210 


int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 
211 


int tmp10, tmp11, tmp12, tmp13; 
212 


int z1, z2, z3, z4, z5, z11, z13; 
213 


int16_t *dataptr; 
214 


int ctr; 
215 



216 

99700014 
row_fdct(data); 
217 



218 


/* Pass 2: process columns. */ 
219 



220 

99700014 
dataptr = data; 
221 
✓✓ 
897300126 
for (ctr = DCTSIZE1; ctr >= 0; ctr) { 
222 

797600112 
tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7]; 
223 

797600112 
tmp7 = dataptr[DCTSIZE*0]  dataptr[DCTSIZE*7]; 
224 

797600112 
tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6]; 
225 

797600112 
tmp6 = dataptr[DCTSIZE*1]  dataptr[DCTSIZE*6]; 
226 

797600112 
tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5]; 
227 

797600112 
tmp5 = dataptr[DCTSIZE*2]  dataptr[DCTSIZE*5]; 
228 

797600112 
tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4]; 
229 

797600112 
tmp4 = dataptr[DCTSIZE*3]  dataptr[DCTSIZE*4]; 
230 



231 


/* Even part */ 
232 



233 

797600112 
tmp10 = tmp0 + tmp3; /* phase 2 */ 
234 

797600112 
tmp13 = tmp0  tmp3; 
235 

797600112 
tmp11 = tmp1 + tmp2; 
236 

797600112 
tmp12 = tmp1  tmp2; 
237 



238 

797600112 
dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */ 
239 

797600112 
dataptr[DCTSIZE*4] = tmp10  tmp11; 
240 



241 

797600112 
z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ 
242 

797600112 
dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */ 
243 

797600112 
dataptr[DCTSIZE*6] = tmp13  z1; 
244 



245 


/* Odd part */ 
246 



247 

797600112 
tmp10 = tmp4 + tmp5; /* phase 2 */ 
248 

797600112 
tmp11 = tmp5 + tmp6; 
249 

797600112 
tmp12 = tmp6 + tmp7; 
250 



251 


/* The rotator is modified from fig 48 to avoid extra negations. */ 
252 

797600112 
z5 = MULTIPLY(tmp10  tmp12, FIX_0_382683433); /* c6 */ 
253 

797600112 
z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2c6 */ 
254 

797600112 
z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */ 
255 

797600112 
z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */ 
256 



257 

797600112 
z11 = tmp7 + z3; /* phase 5 */ 
258 

797600112 
z13 = tmp7  z3; 
259 



260 

797600112 
dataptr[DCTSIZE*5] = z13 + z2; /* phase 6 */ 
261 

797600112 
dataptr[DCTSIZE*3] = z13  z2; 
262 

797600112 
dataptr[DCTSIZE*1] = z11 + z4; 
263 

797600112 
dataptr[DCTSIZE*7] = z11  z4; 
264 



265 

797600112 
dataptr++; /* advance pointer to next column */ 
266 


} 
267 

99700014 
} 
268 



269 


/* 
270 


* Perform the forward 248 DCT on one block of samples. 
271 


*/ 
272 



273 


GLOBAL(void) 
274 


ff_fdct_ifast248 (int16_t * data) 
275 


{ 
276 


int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 
277 


int tmp10, tmp11, tmp12, tmp13; 
278 


int z1; 
279 


int16_t *dataptr; 
280 


int ctr; 
281 



282 


row_fdct(data); 
283 



284 


/* Pass 2: process columns. */ 
285 



286 


dataptr = data; 
287 


for (ctr = DCTSIZE1; ctr >= 0; ctr) { 
288 


tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*1]; 
289 


tmp1 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3]; 
290 


tmp2 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5]; 
291 


tmp3 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7]; 
292 


tmp4 = dataptr[DCTSIZE*0]  dataptr[DCTSIZE*1]; 
293 


tmp5 = dataptr[DCTSIZE*2]  dataptr[DCTSIZE*3]; 
294 


tmp6 = dataptr[DCTSIZE*4]  dataptr[DCTSIZE*5]; 
295 


tmp7 = dataptr[DCTSIZE*6]  dataptr[DCTSIZE*7]; 
296 



297 


/* Even part */ 
298 



299 


tmp10 = tmp0 + tmp3; 
300 


tmp11 = tmp1 + tmp2; 
301 


tmp12 = tmp1  tmp2; 
302 


tmp13 = tmp0  tmp3; 
303 



304 


dataptr[DCTSIZE*0] = tmp10 + tmp11; 
305 


dataptr[DCTSIZE*4] = tmp10  tmp11; 
306 



307 


z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); 
308 


dataptr[DCTSIZE*2] = tmp13 + z1; 
309 


dataptr[DCTSIZE*6] = tmp13  z1; 
310 



311 


tmp10 = tmp4 + tmp7; 
312 


tmp11 = tmp5 + tmp6; 
313 


tmp12 = tmp5  tmp6; 
314 


tmp13 = tmp4  tmp7; 
315 



316 


dataptr[DCTSIZE*1] = tmp10 + tmp11; 
317 


dataptr[DCTSIZE*5] = tmp10  tmp11; 
318 



319 


z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); 
320 


dataptr[DCTSIZE*3] = tmp13 + z1; 
321 


dataptr[DCTSIZE*7] = tmp13  z1; 
322 



323 


dataptr++; /* advance pointer to next column */ 
324 


} 
325 


} 
326 



327 



328 


#undef GLOBAL 
329 


#undef CONST_BITS 
330 


#undef DESCALE 
331 


#undef FIX_0_541196100 
332 


#undef FIX_1_306562965 