GCC Code Coverage Report
Directory: ../../../ffmpeg/ Exec Total Coverage
File: src/libavcodec/fft_template.c Lines: 257 277 92.8 %
Date: 2021-01-21 21:11:50 Branches: 60 78 76.9 %

Line Branch Exec Source
1
/*
2
 * FFT/IFFT transforms
3
 * Copyright (c) 2008 Loren Merritt
4
 * Copyright (c) 2002 Fabrice Bellard
5
 * Partly based on libdjbfft by D. J. Bernstein
6
 *
7
 * This file is part of FFmpeg.
8
 *
9
 * FFmpeg is free software; you can redistribute it and/or
10
 * modify it under the terms of the GNU Lesser General Public
11
 * License as published by the Free Software Foundation; either
12
 * version 2.1 of the License, or (at your option) any later version.
13
 *
14
 * FFmpeg is distributed in the hope that it will be useful,
15
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
 * Lesser General Public License for more details.
18
 *
19
 * You should have received a copy of the GNU Lesser General Public
20
 * License along with FFmpeg; if not, write to the Free Software
21
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
 */
23
24
/**
25
 * @file
26
 * FFT/IFFT transforms.
27
 */
28
29
#include <stdlib.h>
30
#include <string.h>
31
#include "libavutil/mathematics.h"
32
#include "libavutil/thread.h"
33
#include "fft.h"
34
#include "fft-internal.h"
35
36
#if FFT_FIXED_32
37
#include "fft_table.h"
38
#else /* FFT_FIXED_32 */
39
40
/* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */
41
#if !CONFIG_HARDCODED_TABLES
42
COSTABLE(16);
43
COSTABLE(32);
44
COSTABLE(64);
45
COSTABLE(128);
46
COSTABLE(256);
47
COSTABLE(512);
48
COSTABLE(1024);
49
COSTABLE(2048);
50
COSTABLE(4096);
51
COSTABLE(8192);
52
COSTABLE(16384);
53
COSTABLE(32768);
54
COSTABLE(65536);
55
COSTABLE(131072);
56
57
2426
static av_cold void init_ff_cos_tabs(int index)
58
{
59
    int i;
60
2426
    int m = 1<<index;
61
2426
    double freq = 2*M_PI/m;
62
2426
    FFTSample *tab = FFT_NAME(ff_cos_tabs)[index];
63
180452
    for(i=0; i<=m/4; i++)
64
178026
        tab[i] = FIX15(cos(i*freq));
65
175600
    for(i=1; i<m/4; i++)
66
173174
        tab[m/2-i] = tab[i];
67
2426
}
68
69
typedef struct CosTabsInitOnce {
70
    void (*func)(void);
71
    AVOnce control;
72
} CosTabsInitOnce;
73
74
#define INIT_FF_COS_TABS_FUNC(index, size)          \
75
static av_cold void init_ff_cos_tabs_ ## size (void)\
76
{                                                   \
77
    init_ff_cos_tabs(index);                        \
78
}
79
80
478
INIT_FF_COS_TABS_FUNC(4, 16)
81
461
INIT_FF_COS_TABS_FUNC(5, 32)
82
387
INIT_FF_COS_TABS_FUNC(6, 64)
83
363
INIT_FF_COS_TABS_FUNC(7, 128)
84
310
INIT_FF_COS_TABS_FUNC(8, 256)
85
286
INIT_FF_COS_TABS_FUNC(9, 512)
86
68
INIT_FF_COS_TABS_FUNC(10, 1024)
87
37
INIT_FF_COS_TABS_FUNC(11, 2048)
88
24
INIT_FF_COS_TABS_FUNC(12, 4096)
89
7
INIT_FF_COS_TABS_FUNC(13, 8192)
90
5
INIT_FF_COS_TABS_FUNC(14, 16384)
91
INIT_FF_COS_TABS_FUNC(15, 32768)
92
INIT_FF_COS_TABS_FUNC(16, 65536)
93
INIT_FF_COS_TABS_FUNC(17, 131072)
94
95
static CosTabsInitOnce cos_tabs_init_once[] = {
96
    { NULL },
97
    { NULL },
98
    { NULL },
99
    { NULL },
100
    { init_ff_cos_tabs_16, AV_ONCE_INIT },
101
    { init_ff_cos_tabs_32, AV_ONCE_INIT },
102
    { init_ff_cos_tabs_64, AV_ONCE_INIT },
103
    { init_ff_cos_tabs_128, AV_ONCE_INIT },
104
    { init_ff_cos_tabs_256, AV_ONCE_INIT },
105
    { init_ff_cos_tabs_512, AV_ONCE_INIT },
106
    { init_ff_cos_tabs_1024, AV_ONCE_INIT },
107
    { init_ff_cos_tabs_2048, AV_ONCE_INIT },
108
    { init_ff_cos_tabs_4096, AV_ONCE_INIT },
109
    { init_ff_cos_tabs_8192, AV_ONCE_INIT },
110
    { init_ff_cos_tabs_16384, AV_ONCE_INIT },
111
    { init_ff_cos_tabs_32768, AV_ONCE_INIT },
112
    { init_ff_cos_tabs_65536, AV_ONCE_INIT },
113
    { init_ff_cos_tabs_131072, AV_ONCE_INIT },
114
};
115
116
#endif
117
COSTABLE_CONST FFTSample * const FFT_NAME(ff_cos_tabs)[] = {
118
    NULL, NULL, NULL, NULL,
119
    FFT_NAME(ff_cos_16),
120
    FFT_NAME(ff_cos_32),
121
    FFT_NAME(ff_cos_64),
122
    FFT_NAME(ff_cos_128),
123
    FFT_NAME(ff_cos_256),
124
    FFT_NAME(ff_cos_512),
125
    FFT_NAME(ff_cos_1024),
126
    FFT_NAME(ff_cos_2048),
127
    FFT_NAME(ff_cos_4096),
128
    FFT_NAME(ff_cos_8192),
129
    FFT_NAME(ff_cos_16384),
130
    FFT_NAME(ff_cos_32768),
131
    FFT_NAME(ff_cos_65536),
132
    FFT_NAME(ff_cos_131072),
133
};
134
135
#endif /* FFT_FIXED_32 */
136
137
static void fft_permute_c(FFTContext *s, FFTComplex *z);
138
static void fft_calc_c(FFTContext *s, FFTComplex *z);
139
140
5126024
static int split_radix_permutation(int i, int n, int inverse)
141
{
142
    int m;
143
5126024
    if(n <= 2) return i&1;
144
4369464
    m = n >> 1;
145
4369464
    if(!(i&m))            return split_radix_permutation(i, m, inverse)*2;
146
2184732
    m >>= 1;
147
2184732
    if(inverse == !(i&m)) return split_radix_permutation(i, m, inverse)*4 + 1;
148
1092366
    else                  return split_radix_permutation(i, m, inverse)*4 - 1;
149
}
150
151
10389
av_cold void ff_init_ff_cos_tabs(int index)
152
{
153
#if (!CONFIG_HARDCODED_TABLES) && (!FFT_FIXED_32)
154
10389
    ff_thread_once(&cos_tabs_init_once[index].control, cos_tabs_init_once[index].func);
155
#endif
156
10389
}
157
158
static const int avx_tab[] = {
159
    0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15
160
};
161
162
14530
static int is_second_half_of_fft32(int i, int n)
163
{
164
14530
    if (n <= 32)
165
4090
        return i >= 16;
166
10440
    else if (i < n/2)
167
5220
        return is_second_half_of_fft32(i, n/2);
168
5220
    else if (i < 3*n/4)
169
2610
        return is_second_half_of_fft32(i - n/2, n/4);
170
    else
171
2610
        return is_second_half_of_fft32(i - 3*n/4, n/4);
172
}
173
174
303
static av_cold void fft_perm_avx(FFTContext *s)
175
{
176
    int i;
177
303
    int n = 1 << s->nbits;
178
179
4393
    for (i = 0; i < n; i += 16) {
180
        int k;
181
4090
        if (is_second_half_of_fft32(i, n)) {
182
23783
            for (k = 0; k < 16; k++)
183
44768
                s->revtab[-split_radix_permutation(i + k, n, s->inverse) & (n - 1)] =
184
22384
                    i + avx_tab[k];
185
186
        } else {
187
45747
            for (k = 0; k < 16; k++) {
188
43056
                int j = i + k;
189
43056
                j = (j & ~7) | ((j >> 1) & 3) | ((j << 2) & 4);
190
43056
                s->revtab[-split_radix_permutation(i + k, n, s->inverse) & (n - 1)] = j;
191
            }
192
        }
193
    }
194
303
}
195
196
3566
av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
197
{
198
    int i, j, n;
199
200
3566
    s->revtab = NULL;
201
3566
    s->revtab32 = NULL;
202
203

3566
    if (nbits < 2 || nbits > 17)
204
        goto fail;
205
3566
    s->nbits = nbits;
206
3566
    n = 1 << nbits;
207
208
3566
    if (nbits <= 16) {
209
3566
        s->revtab = av_malloc(n * sizeof(uint16_t));
210
3566
        if (!s->revtab)
211
            goto fail;
212
    } else {
213
        s->revtab32 = av_malloc(n * sizeof(uint32_t));
214
        if (!s->revtab32)
215
            goto fail;
216
    }
217
3566
    s->tmp_buf = av_malloc(n * sizeof(FFTComplex));
218
3566
    if (!s->tmp_buf)
219
        goto fail;
220
3566
    s->inverse = inverse;
221
3566
    s->fft_permutation = FF_FFT_PERM_DEFAULT;
222
223
3566
    s->fft_permute = fft_permute_c;
224
3566
    s->fft_calc    = fft_calc_c;
225
#if CONFIG_MDCT
226
3566
    s->imdct_calc  = ff_imdct_calc_c;
227
3566
    s->imdct_half  = ff_imdct_half_c;
228
3566
    s->mdct_calc   = ff_mdct_calc_c;
229
#endif
230
231
#if FFT_FIXED_32
232
188
    ff_fft_lut_init();
233
#else /* FFT_FIXED_32 */
234
#if FFT_FLOAT
235
    if (ARCH_AARCH64) ff_fft_init_aarch64(s);
236
    if (ARCH_ARM)     ff_fft_init_arm(s);
237
    if (ARCH_PPC)     ff_fft_init_ppc(s);
238
3378
    if (ARCH_X86)     ff_fft_init_x86(s);
239
    if (HAVE_MIPSFPU) ff_fft_init_mips(s);
240
#endif
241
13588
    for(j=4; j<=nbits; j++) {
242
10210
        ff_init_ff_cos_tabs(j);
243
    }
244
#endif /* FFT_FIXED_32 */
245
246
247
3378
    if (ARCH_X86 && FFT_FLOAT && s->fft_permutation == FF_FFT_PERM_AVX) {
248
303
        fft_perm_avx(s);
249
    } else {
250
#define PROCESS_FFT_PERM_SWAP_LSBS(num) do {\
251
    for(i = 0; i < n; i++) {\
252
        int k;\
253
        j = i;\
254
        j = (j & ~3) | ((j >> 1) & 1) | ((j << 1) & 2);\
255
        k = -split_radix_permutation(i, n, s->inverse) & (n - 1);\
256
        s->revtab##num[k] = j;\
257
    } \
258
} while(0);
259
260
#define PROCESS_FFT_PERM_DEFAULT(num) do {\
261
    for(i = 0; i < n; i++) {\
262
        int k;\
263
        j = i;\
264
        k = -split_radix_permutation(i, n, s->inverse) & (n - 1);\
265
        s->revtab##num[k] = j;\
266
    } \
267
} while(0);
268
269
#define SPLIT_RADIX_PERMUTATION(num) do { \
270
    if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS) {\
271
        PROCESS_FFT_PERM_SWAP_LSBS(num) \
272
    } else {\
273
        PROCESS_FFT_PERM_DEFAULT(num) \
274
    }\
275
} while(0);
276
277
3263
    if (s->revtab)
278

694383
        SPLIT_RADIX_PERMUTATION()
279
3263
    if (s->revtab32)
280
        SPLIT_RADIX_PERMUTATION(32)
281
282
#undef PROCESS_FFT_PERM_DEFAULT
283
#undef PROCESS_FFT_PERM_SWAP_LSBS
284
#undef SPLIT_RADIX_PERMUTATION
285
    }
286
287
3566
    return 0;
288
 fail:
289
    av_freep(&s->revtab);
290
    av_freep(&s->revtab32);
291
    av_freep(&s->tmp_buf);
292
    return -1;
293
}
294
295
48062
static void fft_permute_c(FFTContext *s, FFTComplex *z)
296
{
297
    int j, np;
298
48062
    const uint16_t *revtab = s->revtab;
299
48062
    const uint32_t *revtab32 = s->revtab32;
300
48062
    np = 1 << s->nbits;
301
    /* TODO: handle split-radix permute in a more optimal way, probably in-place */
302
48062
    if (revtab) {
303
12905982
        for(j=0;j<np;j++) s->tmp_buf[revtab[j]] = z[j];
304
    } else
305
        for(j=0;j<np;j++) s->tmp_buf[revtab32[j]] = z[j];
306
307
48062
    memcpy(z, s->tmp_buf, np * sizeof(FFTComplex));
308
48062
}
309
310
3660
av_cold void ff_fft_end(FFTContext *s)
311
{
312
3660
    av_freep(&s->revtab);
313
3660
    av_freep(&s->revtab32);
314
3660
    av_freep(&s->tmp_buf);
315
3660
}
316
317
#if FFT_FIXED_32
318
319
770822
static void fft_calc_c(FFTContext *s, FFTComplex *z) {
320
321
    int nbits, i, n, num_transforms, offset, step;
322
    int n4, n2, n34;
323
    unsigned tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
324
    FFTComplex *tmpz;
325
770822
    const int fft_size = (1 << s->nbits);
326
    int64_t accu;
327
328
770822
    num_transforms = (0x2aab >> (16 - s->nbits)) | 1;
329
330
6613534
    for (n=0; n<num_transforms; n++){
331
5842712
        offset = ff_fft_offsets_lut[n] << 2;
332
5842712
        tmpz = z + offset;
333
334
5842712
        tmp1 = tmpz[0].re + (unsigned)tmpz[1].re;
335
5842712
        tmp5 = tmpz[2].re + (unsigned)tmpz[3].re;
336
5842712
        tmp2 = tmpz[0].im + (unsigned)tmpz[1].im;
337
5842712
        tmp6 = tmpz[2].im + (unsigned)tmpz[3].im;
338
5842712
        tmp3 = tmpz[0].re - (unsigned)tmpz[1].re;
339
5842712
        tmp8 = tmpz[2].im - (unsigned)tmpz[3].im;
340
5842712
        tmp4 = tmpz[0].im - (unsigned)tmpz[1].im;
341
5842712
        tmp7 = tmpz[2].re - (unsigned)tmpz[3].re;
342
343
5842712
        tmpz[0].re = tmp1 + tmp5;
344
5842712
        tmpz[2].re = tmp1 - tmp5;
345
5842712
        tmpz[0].im = tmp2 + tmp6;
346
5842712
        tmpz[2].im = tmp2 - tmp6;
347
5842712
        tmpz[1].re = tmp3 + tmp8;
348
5842712
        tmpz[3].re = tmp3 - tmp8;
349
5842712
        tmpz[1].im = tmp4 - tmp7;
350
5842712
        tmpz[3].im = tmp4 + tmp7;
351
    }
352
353
770822
    if (fft_size < 8)
354
1
        return;
355
356
770821
    num_transforms = (num_transforms >> 1) | 1;
357
358
4059972
    for (n=0; n<num_transforms; n++){
359
3289151
        offset = ff_fft_offsets_lut[n] << 3;
360
3289151
        tmpz = z + offset;
361
362
3289151
        tmp1 = tmpz[4].re + (unsigned)tmpz[5].re;
363
3289151
        tmp3 = tmpz[6].re + (unsigned)tmpz[7].re;
364
3289151
        tmp2 = tmpz[4].im + (unsigned)tmpz[5].im;
365
3289151
        tmp4 = tmpz[6].im + (unsigned)tmpz[7].im;
366
3289151
        tmp5 = tmp1 + tmp3;
367
3289151
        tmp7 = tmp1 - tmp3;
368
3289151
        tmp6 = tmp2 + tmp4;
369
3289151
        tmp8 = tmp2 - tmp4;
370
371
3289151
        tmp1 = tmpz[4].re - (unsigned)tmpz[5].re;
372
3289151
        tmp2 = tmpz[4].im - (unsigned)tmpz[5].im;
373
3289151
        tmp3 = tmpz[6].re - (unsigned)tmpz[7].re;
374
3289151
        tmp4 = tmpz[6].im - (unsigned)tmpz[7].im;
375
376
3289151
        tmpz[4].re = tmpz[0].re - tmp5;
377
3289151
        tmpz[0].re = tmpz[0].re + tmp5;
378
3289151
        tmpz[4].im = tmpz[0].im - tmp6;
379
3289151
        tmpz[0].im = tmpz[0].im + tmp6;
380
3289151
        tmpz[6].re = tmpz[2].re - tmp8;
381
3289151
        tmpz[2].re = tmpz[2].re + tmp8;
382
3289151
        tmpz[6].im = tmpz[2].im + tmp7;
383
3289151
        tmpz[2].im = tmpz[2].im - tmp7;
384
385
3289151
        accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp1 + tmp2);
386
3289151
        tmp5 = (int32_t)((accu + 0x40000000) >> 31);
387
3289151
        accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp3 - tmp4);
388
3289151
        tmp7 = (int32_t)((accu + 0x40000000) >> 31);
389
3289151
        accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp2 - tmp1);
390
3289151
        tmp6 = (int32_t)((accu + 0x40000000) >> 31);
391
3289151
        accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp3 + tmp4);
392
3289151
        tmp8 = (int32_t)((accu + 0x40000000) >> 31);
393
3289151
        tmp1 = tmp5 + tmp7;
394
3289151
        tmp3 = tmp5 - tmp7;
395
3289151
        tmp2 = tmp6 + tmp8;
396
3289151
        tmp4 = tmp6 - tmp8;
397
398
3289151
        tmpz[5].re = tmpz[1].re - tmp1;
399
3289151
        tmpz[1].re = tmpz[1].re + tmp1;
400
3289151
        tmpz[5].im = tmpz[1].im - tmp2;
401
3289151
        tmpz[1].im = tmpz[1].im + tmp2;
402
3289151
        tmpz[7].re = tmpz[3].re - tmp4;
403
3289151
        tmpz[3].re = tmpz[3].re + tmp4;
404
3289151
        tmpz[7].im = tmpz[3].im + tmp3;
405
3289151
        tmpz[3].im = tmpz[3].im - tmp3;
406
    }
407
408
770821
    step = 1 << ((MAX_LOG2_NFFT-4) - 4);
409
770821
    n4 = 4;
410
411
2441198
    for (nbits=4; nbits<=s->nbits; nbits++){
412
1670377
        n2  = 2*n4;
413
1670377
        n34 = 3*n4;
414
1670377
        num_transforms = (num_transforms >> 1) | 1;
415
416
4206322
        for (n=0; n<num_transforms; n++){
417
2535945
            const FFTSample *w_re_ptr = ff_w_tab_sr + step;
418
2535945
            const FFTSample *w_im_ptr = ff_w_tab_sr + MAX_FFT_SIZE/(4*16) - step;
419
2535945
            offset = ff_fft_offsets_lut[n] << nbits;
420
2535945
            tmpz = z + offset;
421
422
2535945
            tmp5 = tmpz[ n2].re + (unsigned)tmpz[n34].re;
423
2535945
            tmp1 = tmpz[ n2].re - (unsigned)tmpz[n34].re;
424
2535945
            tmp6 = tmpz[ n2].im + (unsigned)tmpz[n34].im;
425
2535945
            tmp2 = tmpz[ n2].im - (unsigned)tmpz[n34].im;
426
427
2535945
            tmpz[ n2].re = tmpz[ 0].re - tmp5;
428
2535945
            tmpz[  0].re = tmpz[ 0].re + tmp5;
429
2535945
            tmpz[ n2].im = tmpz[ 0].im - tmp6;
430
2535945
            tmpz[  0].im = tmpz[ 0].im + tmp6;
431
2535945
            tmpz[n34].re = tmpz[n4].re - tmp2;
432
2535945
            tmpz[ n4].re = tmpz[n4].re + tmp2;
433
2535945
            tmpz[n34].im = tmpz[n4].im + tmp1;
434
2535945
            tmpz[ n4].im = tmpz[n4].im - tmp1;
435
436
21618072
            for (i=1; i<n4; i++){
437
19082127
                FFTSample w_re = w_re_ptr[0];
438
19082127
                FFTSample w_im = w_im_ptr[0];
439
19082127
                accu  = (int64_t)w_re*tmpz[ n2+i].re;
440
19082127
                accu += (int64_t)w_im*tmpz[ n2+i].im;
441
19082127
                tmp1 = (int32_t)((accu + 0x40000000) >> 31);
442
19082127
                accu  = (int64_t)w_re*tmpz[ n2+i].im;
443
19082127
                accu -= (int64_t)w_im*tmpz[ n2+i].re;
444
19082127
                tmp2 = (int32_t)((accu + 0x40000000) >> 31);
445
19082127
                accu  = (int64_t)w_re*tmpz[n34+i].re;
446
19082127
                accu -= (int64_t)w_im*tmpz[n34+i].im;
447
19082127
                tmp3 = (int32_t)((accu + 0x40000000) >> 31);
448
19082127
                accu  = (int64_t)w_re*tmpz[n34+i].im;
449
19082127
                accu += (int64_t)w_im*tmpz[n34+i].re;
450
19082127
                tmp4 = (int32_t)((accu + 0x40000000) >> 31);
451
452
19082127
                tmp5 = tmp1 + tmp3;
453
19082127
                tmp1 = tmp1 - tmp3;
454
19082127
                tmp6 = tmp2 + tmp4;
455
19082127
                tmp2 = tmp2 - tmp4;
456
457
19082127
                tmpz[ n2+i].re = tmpz[   i].re - tmp5;
458
19082127
                tmpz[    i].re = tmpz[   i].re + tmp5;
459
19082127
                tmpz[ n2+i].im = tmpz[   i].im - tmp6;
460
19082127
                tmpz[    i].im = tmpz[   i].im + tmp6;
461
19082127
                tmpz[n34+i].re = tmpz[n4+i].re - tmp2;
462
19082127
                tmpz[ n4+i].re = tmpz[n4+i].re + tmp2;
463
19082127
                tmpz[n34+i].im = tmpz[n4+i].im + tmp1;
464
19082127
                tmpz[ n4+i].im = tmpz[n4+i].im - tmp1;
465
466
19082127
                w_re_ptr += step;
467
19082127
                w_im_ptr -= step;
468
            }
469
        }
470
1670377
        step >>= 1;
471
1670377
        n4   <<= 1;
472
    }
473
}
474
475
#else /* FFT_FIXED_32 */
476
477
#define BUTTERFLIES(a0,a1,a2,a3) {\
478
    BF(t3, t5, t5, t1);\
479
    BF(a2.re, a0.re, a0.re, t5);\
480
    BF(a3.im, a1.im, a1.im, t3);\
481
    BF(t4, t6, t2, t6);\
482
    BF(a3.re, a1.re, a1.re, t4);\
483
    BF(a2.im, a0.im, a0.im, t6);\
484
}
485
486
// force loading all the inputs before storing any.
487
// this is slightly slower for small data, but avoids store->load aliasing
488
// for addresses separated by large powers of 2.
489
#define BUTTERFLIES_BIG(a0,a1,a2,a3) {\
490
    FFTSample r0=a0.re, i0=a0.im, r1=a1.re, i1=a1.im;\
491
    BF(t3, t5, t5, t1);\
492
    BF(a2.re, a0.re, r0, t5);\
493
    BF(a3.im, a1.im, i1, t3);\
494
    BF(t4, t6, t2, t6);\
495
    BF(a3.re, a1.re, r1, t4);\
496
    BF(a2.im, a0.im, i0, t6);\
497
}
498
499
#define TRANSFORM(a0,a1,a2,a3,wre,wim) {\
500
    CMUL(t1, t2, a2.re, a2.im, wre, -wim);\
501
    CMUL(t5, t6, a3.re, a3.im, wre,  wim);\
502
    BUTTERFLIES(a0,a1,a2,a3)\
503
}
504
505
#define TRANSFORM_ZERO(a0,a1,a2,a3) {\
506
    t1 = a2.re;\
507
    t2 = a2.im;\
508
    t5 = a3.re;\
509
    t6 = a3.im;\
510
    BUTTERFLIES(a0,a1,a2,a3)\
511
}
512
513
/* z[0...8n-1], w[1...2n-1] */
514
#define PASS(name)\
515
static void name(FFTComplex *z, const FFTSample *wre, unsigned int n)\
516
{\
517
    FFTDouble t1, t2, t3, t4, t5, t6;\
518
    int o1 = 2*n;\
519
    int o2 = 4*n;\
520
    int o3 = 6*n;\
521
    const FFTSample *wim = wre+o1;\
522
    n--;\
523
\
524
    TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]);\
525
    TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\
526
    do {\
527
        z += 2;\
528
        wre += 2;\
529
        wim -= 2;\
530
        TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]);\
531
        TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\
532
    } while(--n);\
533
}
534
535
25745265
PASS(pass)
536
#if !CONFIG_SMALL
537
#undef BUTTERFLIES
538
#define BUTTERFLIES BUTTERFLIES_BIG
539
3063261
PASS(pass_big)
540
#endif
541
542
#define DECL_FFT(n,n2,n4)\
543
static void fft##n(FFTComplex *z)\
544
{\
545
    fft##n2(z);\
546
    fft##n4(z+n4*2);\
547
    fft##n4(z+n4*3);\
548
    pass(z,FFT_NAME(ff_cos_##n),n4/2);\
549
}
550
551
18920291
static void fft4(FFTComplex *z)
552
{
553
    FFTDouble t1, t2, t3, t4, t5, t6, t7, t8;
554
555
18920291
    BF(t3, t1, z[0].re, z[1].re);
556
18920291
    BF(t8, t6, z[3].re, z[2].re);
557
18920291
    BF(z[2].re, z[0].re, t1, t6);
558
18920291
    BF(t4, t2, z[0].im, z[1].im);
559
18920291
    BF(t7, t5, z[2].im, z[3].im);
560
18920291
    BF(z[3].im, z[1].im, t4, t8);
561
18920291
    BF(z[3].re, z[1].re, t3, t7);
562
18920291
    BF(z[2].im, z[0].im, t2, t5);
563
18920291
}
564
565
9627643
static void fft8(FFTComplex *z)
566
{
567
    FFTDouble t1, t2, t3, t4, t5, t6;
568
569
9627643
    fft4(z);
570
571
9627643
    BF(t1, z[5].re, z[4].re, -z[5].re);
572
9627643
    BF(t2, z[5].im, z[4].im, -z[5].im);
573
9627643
    BF(t5, z[7].re, z[6].re, -z[7].re);
574
9627643
    BF(t6, z[7].im, z[6].im, -z[7].im);
575
576
9627643
    BUTTERFLIES(z[0],z[2],z[4],z[6]);
577
9627643
    TRANSFORM(z[1],z[3],z[5],z[7],sqrthalf,sqrthalf);
578
9627643
}
579
580
#if !CONFIG_SMALL
581
4134447
static void fft16(FFTComplex *z)
582
{
583
    FFTDouble t1, t2, t3, t4, t5, t6;
584
4134447
    FFTSample cos_16_1 = FFT_NAME(ff_cos_16)[1];
585
4134447
    FFTSample cos_16_3 = FFT_NAME(ff_cos_16)[3];
586
587
4134447
    fft8(z);
588
4134447
    fft4(z+8);
589
4134447
    fft4(z+12);
590
591
4134447
    TRANSFORM_ZERO(z[0],z[4],z[8],z[12]);
592
4134447
    TRANSFORM(z[2],z[6],z[10],z[14],sqrthalf,sqrthalf);
593
4134447
    TRANSFORM(z[1],z[5],z[9],z[13],cos_16_1,cos_16_3);
594
4134447
    TRANSFORM(z[3],z[7],z[11],z[15],cos_16_3,cos_16_1);
595
4134447
}
596
#else
597
DECL_FFT(16,8,4)
598
#endif
599
2592773
DECL_FFT(32,16,8)
600
631259
DECL_FFT(64,32,16)
601
324172
DECL_FFT(128,64,32)
602
114796
DECL_FFT(256,128,64)
603
81379
DECL_FFT(512,256,128)
604
#if !CONFIG_SMALL
605
#define pass pass_big
606
#endif
607
8553
DECL_FFT(1024,512,256)
608
2969
DECL_FFT(2048,1024,512)
609
1008
DECL_FFT(4096,2048,1024)
610
689
DECL_FFT(8192,4096,2048)
611
DECL_FFT(16384,8192,4096)
612
DECL_FFT(32768,16384,8192)
613
DECL_FFT(65536,32768,16384)
614
DECL_FFT(131072,65536,32768)
615
616
static void (* const fft_dispatch[])(FFTComplex*) = {
617
    fft4, fft8, fft16, fft32, fft64, fft128, fft256, fft512, fft1024,
618
    fft2048, fft4096, fft8192, fft16384, fft32768, fft65536, fft131072
619
};
620
621
3136201
static void fft_calc_c(FFTContext *s, FFTComplex *z)
622
{
623
3136201
    fft_dispatch[s->nbits-2](z);
624
3136201
}
625
#endif /* FFT_FIXED_32 */