FFmpeg coverage


Directory: ../../../ffmpeg/
File: src/libavutil/x86/tx_float_init.c
Date: 2024-11-20 23:03:26
Exec Total Coverage
Lines: 49 79 62.0%
Functions: 4 5 80.0%
Branches: 27 46 58.7%

Line Branch Exec Source
1 /*
2 * This file is part of FFmpeg.
3 *
4 * FFmpeg is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
8 *
9 * FFmpeg is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with FFmpeg; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18
19 #define TX_FLOAT
20 #include "libavutil/tx_priv.h"
21 #include "libavutil/attributes.h"
22 #include "libavutil/mem.h"
23 #include "libavutil/x86/cpu.h"
24
25 #include "config.h"
26
/*
 * Per-function declarations generated by TX_DECL_FN(name, cpu_suffix).
 * The macro is defined outside this file (presumably in tx_priv.h, and
 * presumably producing prototypes for externally implemented, per-ISA
 * transform functions -- TODO confirm against the macro definition).
 * Every (name, suffix) pair declared here is referenced by a TX_DEF entry
 * in ff_tx_codelet_list_float_x86 below, except for the *_asm variants of
 * names that also have a plain variant, which are listed in the second group.
 */
27 TX_DECL_FN(fft2, sse3)
28 TX_DECL_FN(fft4_fwd, sse2)
29 TX_DECL_FN(fft4_inv, sse2)
30 TX_DECL_FN(fft8, sse3)
31 TX_DECL_FN(fft8_ns, sse3)
32 TX_DECL_FN(fft8, avx)
33 TX_DECL_FN(fft8_ns, avx)
34 TX_DECL_FN(fft15, avx2)
35 TX_DECL_FN(fft15_ns, avx2)
36 TX_DECL_FN(fft16, avx)
37 TX_DECL_FN(fft16_ns, avx)
38 TX_DECL_FN(fft16, fma3)
39 TX_DECL_FN(fft16_ns, fma3)
40 TX_DECL_FN(fft32, avx)
41 TX_DECL_FN(fft32_ns, avx)
42 TX_DECL_FN(fft32, fma3)
43 TX_DECL_FN(fft32_ns, fma3)
44 TX_DECL_FN(fft_sr, avx)
45 TX_DECL_FN(fft_sr_ns, avx)
46 TX_DECL_FN(fft_sr, fma3)
47 TX_DECL_FN(fft_sr_ns, fma3)
48 TX_DECL_FN(fft_sr, avx2)
49 TX_DECL_FN(fft_sr_ns, avx2)
50
51 TX_DECL_FN(fft_pfa_15xM, avx2)
52 TX_DECL_FN(fft_pfa_15xM_ns, avx2)
53
54 TX_DECL_FN(mdct_inv, avx2)
55
/* Variants registered below with the FF_TX_ASM_CALL flag. */
56 TX_DECL_FN(fft2_asm, sse3)
57 TX_DECL_FN(fft4_fwd_asm, sse2)
58 TX_DECL_FN(fft4_inv_asm, sse2)
59 TX_DECL_FN(fft8_asm, sse3)
60 TX_DECL_FN(fft8_asm, avx)
61 TX_DECL_FN(fft16_asm, avx)
62 TX_DECL_FN(fft16_asm, fma3)
63 TX_DECL_FN(fft32_asm, avx)
64 TX_DECL_FN(fft32_asm, fma3)
65 TX_DECL_FN(fft_sr_asm, avx)
66 TX_DECL_FN(fft_sr_asm, fma3)
67 TX_DECL_FN(fft_sr_asm, avx2)
68
69 TX_DECL_FN(fft_pfa_15xM_asm, avx2)
70
/*
 * DECL_INIT_FN(basis, interleave) defines a static, av_cold init callback
 * named b<basis>_i<interleave> (the name is built by token pasting).
 *
 * The generated function:
 *   - calls ff_tx_init_tabs_float(len);
 *   - if cd->max_len == 2, returns the result of
 *     ff_tx_gen_ptwo_revtab(s, opts);
 *   - otherwise returns the result of
 *     ff_tx_gen_split_radix_parity_revtab(s, len, inv, opts, basis,
 *     interleave).
 *
 * The flags and scale parameters are accepted but not used by the body.
 * The return value is whatever the selected revtab generator returns.
 *
 * The two instantiations after the macro create b8_i0 and b8_i2, which are
 * the init callbacks referenced by the power-of-two FFT entries in
 * ff_tx_codelet_list_float_x86.
 */
71 #define DECL_INIT_FN(basis, interleave) \
72 static av_cold int b ##basis## _i ##interleave(AVTXContext *s, \
73 const FFTXCodelet *cd, \
74 uint64_t flags, \
75 FFTXCodeletOptions *opts, \
76 int len, int inv, \
77 const void *scale) \
78 { \
79 ff_tx_init_tabs_float(len); \
80 if (cd->max_len == 2) \
81 return ff_tx_gen_ptwo_revtab(s, opts); \
82 else \
83 return ff_tx_gen_split_radix_parity_revtab(s, len, inv, opts, \
84 basis, interleave); \
85 }
86
87
2/2
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 167 times.
175 DECL_INIT_FN(8, 0)
88
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 661 times.
661 DECL_INIT_FN(8, 2)
89
/*
 * Init callback referenced by the fft15 and fft15_ns entries of
 * ff_tx_codelet_list_float_x86.
 *
 * Initializes the tables for len, builds s->map in the gather direction
 * (via ff_tx_gen_pfa_input_map(…, 3, 5) when len == 15, otherwise via
 * ff_tx_gen_default_map()), and for len == 15 applies an extra fixed
 * reordering of the 15 map entries.
 *
 * The caller-supplied opts, as well as cd, flags, inv and scale, are not
 * read here; a local sub_opts with map_dir = FF_TX_MAP_GATHER is used instead.
 *
 * Returns 0 on success, or the negative value returned by the map generator.
 */
90 static av_cold int factor_init(AVTXContext *s, const FFTXCodelet *cd,
91 uint64_t flags, FFTXCodeletOptions *opts,
92 int len, int inv, const void *scale)
93 {
94 int ret;
95
96 /* The transformations below are performed in the gather domain,
97 * so override the option and let the infrastructure convert the map
98 * to SCATTER if needed. */
99 FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER };
100
101 TX_TAB(ff_tx_init_tabs)(len);
102
103 if (len == 15)
104 ret = ff_tx_gen_pfa_input_map(s, &sub_opts, 3, 5);
105 else
106 ret = ff_tx_gen_default_map(s, &sub_opts);
107
108 if (ret < 0)
109 return ret;
110
111 if (len == 15) {
112 int cnt = 0, tmp[15];
113
114 /* Special permutation to simplify loads in the pre-permuted version */
/* tmp keeps the unpermuted map; the copy size assumes s->map elements
 * have the same size as int -- NOTE(review): confirm against AVTXContext. */
115 memcpy(tmp, s->map, 15*sizeof(*tmp));
/* Three stride-3 passes: entries 1,4,7,10,13 then 2,5,8,11,14 then
 * 0,3,6,9,12 of tmp are written to map[0..14] in that order. */
116 for (int i = 1; i < 15; i += 3) {
117 s->map[cnt] = tmp[i];
118 cnt++;
119 }
120 for (int i = 2; i < 15; i += 3) {
121 s->map[cnt] = tmp[i];
122 cnt++;
123 }
124 for (int i = 0; i < 15; i += 3) {
125 s->map[cnt] = tmp[i];
126 cnt++;
127 }
/* Shift map[6..9] up by one and map[1..4] up by two, then fill the two
 * freed slots. The resulting order, as indices into tmp, is:
 * 1, 2, 0, 4, 7, 10, 13, 5, 8, 11, 14, 3, 6, 9, 12. */
128 memmove(&s->map[7], &s->map[6], 4*sizeof(int));
129 memmove(&s->map[3], &s->map[1], 4*sizeof(int));
130 s->map[1] = tmp[2];
131 s->map[2] = tmp[0];
132 }
133
134 return 0;
135 }
136
/*
 * Init callback referenced by the mdct_inv entry of
 * ff_tx_codelet_list_float_x86.
 *
 * Steps, in order:
 *   1. Store the scale (read through a SCALE_TYPE pointer) in s->scale_d
 *      and s->scale_f.
 *   2. Initialize an FFT sub-transform of length len/2 with adjusted flags
 *      and a gather-direction map.
 *   3. Allocate s->map with len entries: the first half is a copy of the
 *      sub-transform's map, the second half is the inverse of that map.
 *   4. Call ff_tx_mdct_gen_exp_float(s, s->map).
 *
 * The caller-supplied opts and cd are not read.
 *
 * Returns 0 on success, the non-zero value from ff_tx_init_subtx() or
 * ff_tx_mdct_gen_exp_float() on their failure, or AVERROR(ENOMEM) if the
 * map allocation fails.
 */
137 700 static av_cold int m_inv_init(AVTXContext *s, const FFTXCodelet *cd,
138 uint64_t flags, FFTXCodeletOptions *opts,
139 int len, int inv, const void *scale)
140 {
141 int ret;
142 700 FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER };
143
144 700 s->scale_d = *((SCALE_TYPE *)scale);
145 700 s->scale_f = s->scale_d;
146
147 700 flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
148 700 flags |= AV_TX_INPLACE; /* in-place */
149 700 flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */
150 700 flags |= FF_TX_ASM_CALL; /* We want an assembly function, not C */
151
/* The sub-transform is an FFT of half the requested length. */
152
2/2
✓ Branch 1 taken 94 times.
✓ Branch 2 taken 606 times.
700 if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
153 inv, scale)))
154 94 return ret;
155
/* len entries: [0, len/2) forward map, [len/2, len) inverse map. */
156 606 s->map = av_malloc(len*sizeof(*s->map));
157
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 606 times.
606 if (!s->map)
158 return AVERROR(ENOMEM);
159
160 606 memcpy(s->map, s->sub->map, (len >> 1)*sizeof(*s->map));
161 /* Invert lookup table for unstrided path */
162
2/2
✓ Branch 0 taken 138336 times.
✓ Branch 1 taken 606 times.
138942 for (int i = 0; i < (len >> 1); i++)
163 138336 s->map[(len >> 1) + s->map[i]] = i;
164
165
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 606 times.
606 if ((ret = ff_tx_mdct_gen_exp_float(s, s->map)))
166 return ret;
167
168 606 return 0;
169 }
170
/*
 * Init callback referenced by the fft_pfa_15xM, fft_pfa_15xM_asm and
 * fft_pfa_15xM_ns entries of ff_tx_codelet_list_float_x86.
 *
 * Steps, in order:
 *   1. Initialize an FFT sub-transform of length len / cd->factors[0]
 *      with adjusted flags and a scatter-direction map.
 *   2. Build the compound map with ff_tx_gen_compound_mapping().
 *   3. If cd->factors[0] == 15, embed the 3x5 input map and then reorder
 *      every group of 15 map entries.
 *   4. Allocate s->tmp with len elements.
 *   5. Initialize the tables for len / sub_len.
 *
 * Returns 0 on success, the non-zero value from ff_tx_init_subtx() or
 * ff_tx_gen_compound_mapping() on their failure, or AVERROR(ENOMEM) if the
 * s->tmp allocation fails.
 */
171 283 static av_cold int fft_pfa_init(AVTXContext *s,
172 const FFTXCodelet *cd,
173 uint64_t flags,
174 FFTXCodeletOptions *opts,
175 int len, int inv,
176 const void *scale)
177 {
178 int ret;
/* Length handled by the sub-transform: total length divided by the
 * codelet's first factor. */
179 283 int sub_len = len / cd->factors[0];
180 283 FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER };
181
182 283 flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
183 283 flags |= AV_TX_INPLACE; /* in-place */
184 283 flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */
185 283 flags |= FF_TX_ASM_CALL; /* We want an assembly function, not C */
186
187
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 283 times.
283 if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
188 sub_len, inv, scale)))
189 return ret;
190
/* Unlike the sub-transform, this call receives the caller's opts, and it
 * uses s->inv rather than the inv parameter. */
191
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 283 times.
283 if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len)))
192 return ret;
193
194
1/2
✓ Branch 0 taken 283 times.
✗ Branch 1 not taken.
283 if (cd->factors[0] == 15) {
195 int tmp[15];
196
197 /* Our 15-point transform is also a compound one, so embed its input map */
198
6/6
✓ Branch 0 taken 71760 times.
✓ Branch 1 taken 23920 times.
✓ Branch 2 taken 23920 times.
✓ Branch 3 taken 4784 times.
✓ Branch 4 taken 4784 times.
✓ Branch 5 taken 283 times.
100747 TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5);
199
200 /* Special permutation to simplify loads in the pre-permuted version */
/* One iteration per group of 15 consecutive map entries; the group count
 * is the sub-transform length. Within each group the resulting order, as
 * indices into tmp, is: 1, 2, 0, 4, 7, 10, 13, 5, 8, 11, 14, 3, 6, 9, 12. */
201
2/2
✓ Branch 0 taken 4784 times.
✓ Branch 1 taken 283 times.
5067 for (int k = 0; k < s->sub[0].len; k++) {
202 4784 int cnt = 0;
203 4784 memcpy(tmp, &s->map[k*15], 15*sizeof(*tmp));
204
2/2
✓ Branch 0 taken 23920 times.
✓ Branch 1 taken 4784 times.
28704 for (int i = 1; i < 15; i += 3) {
205 23920 s->map[k*15 + cnt] = tmp[i];
206 23920 cnt++;
207 }
208
2/2
✓ Branch 0 taken 23920 times.
✓ Branch 1 taken 4784 times.
28704 for (int i = 2; i < 15; i += 3) {
209 23920 s->map[k*15 + cnt] = tmp[i];
210 23920 cnt++;
211 }
212
2/2
✓ Branch 0 taken 23920 times.
✓ Branch 1 taken 4784 times.
28704 for (int i = 0; i < 15; i += 3) {
213 23920 s->map[k*15 + cnt] = tmp[i];
214 23920 cnt++;
215 }
216 4784 memmove(&s->map[k*15 + 7], &s->map[k*15 + 6], 4*sizeof(int));
217 4784 memmove(&s->map[k*15 + 3], &s->map[k*15 + 1], 4*sizeof(int));
218 4784 s->map[k*15 + 1] = tmp[2];
219 4784 s->map[k*15 + 2] = tmp[0];
220 }
221 }
222
223
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 283 times.
283 if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
224 return AVERROR(ENOMEM);
225
/* len / sub_len is cd->factors[0] whenever len is an exact multiple of it. */
226 283 TX_TAB(ff_tx_init_tabs)(len / sub_len);
227
228 283 return 0;
229 }
230
/*
 * NULL-terminated table of pointers to the float codelets registered by
 * this file.
 *
 * Each entry is produced by TX_DEF(), which is defined outside this file.
 * NOTE(review): judging from the values used here, the arguments appear to
 * be (function name, transform type, minimum length, maximum length,
 * first factor, second factor, priority, init callback, name suffix,
 * CPU feature, codelet flags, extra CPU flags) -- confirm against the
 * TX_DEF definition before relying on this.
 *
 * Init callbacks used: NULL (none), b8_i0 / b8_i2 (generated by
 * DECL_INIT_FN), factor_init, fft_pfa_init and m_inv_init.
 *
 * Entries between #if ARCH_X86_64 and #endif (32-point and larger FFTs,
 * the 15-point and 15xM transforms, and the inverse MDCT) are compiled in
 * only when ARCH_X86_64 is non-zero.
 */
231 const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
232 TX_DEF(fft2, FFT, 2, 2, 2, 0, 128, NULL, sse3, SSE3, AV_TX_INPLACE, 0),
233 TX_DEF(fft2_asm, FFT, 2, 2, 2, 0, 192, b8_i0, sse3, SSE3,
234 AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, 0),
235 TX_DEF(fft2, FFT, 2, 2, 2, 0, 192, b8_i0, sse3, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
236 TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, AV_TX_INPLACE | FF_TX_FORWARD_ONLY, 0),
237 TX_DEF(fft4_fwd_asm, FFT, 4, 4, 2, 0, 192, b8_i0, sse2, SSE2,
238 AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, 0),
239 TX_DEF(fft4_inv_asm, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2,
240 AV_TX_INPLACE | FF_TX_INVERSE_ONLY | FF_TX_ASM_CALL, 0),
241 TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 192, b8_i0, sse2, SSE2, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
242 TX_DEF(fft4_inv, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, AV_TX_INPLACE | FF_TX_INVERSE_ONLY, 0),
243 TX_DEF(fft8, FFT, 8, 8, 2, 0, 128, b8_i0, sse3, SSE3, AV_TX_INPLACE, 0),
244 TX_DEF(fft8_asm, FFT, 8, 8, 2, 0, 192, b8_i0, sse3, SSE3,
245 AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, 0),
246 TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 192, b8_i0, sse3, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
247 TX_DEF(fft8, FFT, 8, 8, 2, 0, 256, b8_i0, avx, AVX, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
248 TX_DEF(fft8_asm, FFT, 8, 8, 2, 0, 320, b8_i0, avx, AVX,
249 AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
250 TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 320, b8_i0, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
251 AV_CPU_FLAG_AVXSLOW),
252 TX_DEF(fft16, FFT, 16, 16, 2, 0, 256, b8_i2, avx, AVX, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
253 TX_DEF(fft16_asm, FFT, 16, 16, 2, 0, 320, b8_i2, avx, AVX,
254 AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
255 TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
256 AV_CPU_FLAG_AVXSLOW),
257 TX_DEF(fft16, FFT, 16, 16, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
258 TX_DEF(fft16_asm, FFT, 16, 16, 2, 0, 352, b8_i2, fma3, FMA3,
259 AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
260 TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
261 AV_CPU_FLAG_AVXSLOW),
262
263 #if ARCH_X86_64
264 TX_DEF(fft32, FFT, 32, 32, 2, 0, 256, b8_i2, avx, AVX, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
265 TX_DEF(fft32_asm, FFT, 32, 32, 2, 0, 320, b8_i2, avx, AVX,
266 AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
267 TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
268 AV_CPU_FLAG_AVXSLOW),
269 TX_DEF(fft32, FFT, 32, 32, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
270 TX_DEF(fft32_asm, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3,
271 AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
272 TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
273 AV_CPU_FLAG_AVXSLOW),
274 TX_DEF(fft_sr, FFT, 64, 2097152, 2, 0, 256, b8_i2, avx, AVX, 0, AV_CPU_FLAG_AVXSLOW),
275 TX_DEF(fft_sr_asm, FFT, 64, 2097152, 2, 0, 320, b8_i2, avx, AVX,
276 AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
277 TX_DEF(fft_sr_ns, FFT, 64, 2097152, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
278 AV_CPU_FLAG_AVXSLOW),
279 TX_DEF(fft_sr, FFT, 64, 2097152, 2, 0, 288, b8_i2, fma3, FMA3, 0, AV_CPU_FLAG_AVXSLOW),
280 TX_DEF(fft_sr_asm, FFT, 64, 2097152, 2, 0, 352, b8_i2, fma3, FMA3,
281 AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
282 TX_DEF(fft_sr_ns, FFT, 64, 2097152, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
283 AV_CPU_FLAG_AVXSLOW),
284
285 TX_DEF(fft15, FFT, 15, 15, 15, 0, 320, factor_init, avx2, AVX2,
286 AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
287 TX_DEF(fft15_ns, FFT, 15, 15, 15, 0, 384, factor_init, avx2, AVX2,
288 AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW),
289
290 TX_DEF(fft_sr, FFT, 64, 2097152, 2, 0, 320, b8_i2, avx2, AVX2, 0,
291 AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
292 TX_DEF(fft_sr_asm, FFT, 64, 2097152, 2, 0, 384, b8_i2, avx2, AVX2,
293 AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
294 TX_DEF(fft_sr_ns, FFT, 64, 2097152, 2, 0, 384, b8_i2, avx2, AVX2, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
295 AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
296
297 TX_DEF(fft_pfa_15xM, FFT, 60, TX_LEN_UNLIMITED, 15, 2, 320, fft_pfa_init, avx2, AVX2,
298 AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
299 TX_DEF(fft_pfa_15xM_asm, FFT, 60, TX_LEN_UNLIMITED, 15, 2, 384, fft_pfa_init, avx2, AVX2,
300 AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
301 TX_DEF(fft_pfa_15xM_ns, FFT, 60, TX_LEN_UNLIMITED, 15, 2, 384, fft_pfa_init, avx2, AVX2,
302 AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
303
304 TX_DEF(mdct_inv, MDCT, 16, TX_LEN_UNLIMITED, 2, TX_FACTOR_ANY, 384, m_inv_init, avx2, AVX2,
305 FF_TX_INVERSE_ONLY, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
306 #endif
307
308 NULL,
309 };
310