| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /* | ||
| 2 | * Copyright (c) Lynne | ||
| 3 | * | ||
| 4 | * Power of two FFT: | ||
| 5 | * Copyright (c) Lynne | ||
| 6 | * Copyright (c) 2008 Loren Merritt | ||
| 7 | * Copyright (c) 2002 Fabrice Bellard | ||
| 8 | * Partly based on libdjbfft by D. J. Bernstein | ||
| 9 | * | ||
| 10 | * This file is part of FFmpeg. | ||
| 11 | * | ||
| 12 | * FFmpeg is free software; you can redistribute it and/or | ||
| 13 | * modify it under the terms of the GNU Lesser General Public | ||
| 14 | * License as published by the Free Software Foundation; either | ||
| 15 | * version 2.1 of the License, or (at your option) any later version. | ||
| 16 | * | ||
| 17 | * FFmpeg is distributed in the hope that it will be useful, | ||
| 18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 20 | * Lesser General Public License for more details. | ||
| 21 | * | ||
| 22 | * You should have received a copy of the GNU Lesser General Public | ||
| 23 | * License along with FFmpeg; if not, write to the Free Software | ||
| 24 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 25 | */ | ||
| 26 | |||
| 27 | #include "mem.h" | ||
| 28 | |||
| 29 | #define TABLE_DEF(name, size) \ | ||
| 30 | DECLARE_ALIGNED(32, TXSample, TX_TAB(ff_tx_tab_ ##name))[size] | ||
| 31 | |||
| 32 | #define SR_POW2_TABLES \ | ||
| 33 | SR_TABLE(8) \ | ||
| 34 | SR_TABLE(16) \ | ||
| 35 | SR_TABLE(32) \ | ||
| 36 | SR_TABLE(64) \ | ||
| 37 | SR_TABLE(128) \ | ||
| 38 | SR_TABLE(256) \ | ||
| 39 | SR_TABLE(512) \ | ||
| 40 | SR_TABLE(1024) \ | ||
| 41 | SR_TABLE(2048) \ | ||
| 42 | SR_TABLE(4096) \ | ||
| 43 | SR_TABLE(8192) \ | ||
| 44 | SR_TABLE(16384) \ | ||
| 45 | SR_TABLE(32768) \ | ||
| 46 | SR_TABLE(65536) \ | ||
| 47 | SR_TABLE(131072) \ | ||
| 48 | |||
| 49 | #define SR_TABLE(len) \ | ||
| 50 | TABLE_DEF(len, len/4 + 1); | ||
| 51 | /* Power of two tables */ | ||
| 52 | SR_POW2_TABLES | ||
| 53 | #undef SR_TABLE | ||
| 54 | |||
| 55 | /* Other factors' tables */ | ||
| 56 | TABLE_DEF(53, 12); | ||
| 57 | TABLE_DEF( 7, 6); | ||
| 58 | TABLE_DEF( 9, 8); | ||
| 59 | |||
| 60 | typedef struct FFTabInitData { | ||
| 61 | void (*func)(void); | ||
| 62 | int factors[TX_MAX_SUB]; /* Must be sorted high -> low */ | ||
| 63 | } FFTabInitData; | ||
| 64 | |||
| 65 | #define SR_TABLE(len) \ | ||
| 66 | static av_cold void TX_TAB(ff_tx_init_tab_ ##len)(void) \ | ||
| 67 | { \ | ||
| 68 | double freq = 2*M_PI/len; \ | ||
| 69 | TXSample *tab = TX_TAB(ff_tx_tab_ ##len); \ | ||
| 70 | \ | ||
| 71 | for (int i = 0; i < len/4; i++) \ | ||
| 72 | *tab++ = RESCALE(cos(i*freq)); \ | ||
| 73 | \ | ||
| 74 | *tab = 0; \ | ||
| 75 | } | ||
| 76 |
2/2✓ Branch 0 taken 112320 times.
✓ Branch 1 taken 2798 times.
|
230236 | SR_POW2_TABLES |
| 77 | #undef SR_TABLE | ||
| 78 | |||
| 79 | static void (*const sr_tabs_init_funcs[])(void) = { | ||
| 80 | #define SR_TABLE(len) TX_TAB(ff_tx_init_tab_ ##len), | ||
| 81 | SR_POW2_TABLES | ||
| 82 | #undef SR_TABLE | ||
| 83 | }; | ||
| 84 | |||
| 85 | static AVOnce sr_tabs_init_once[] = { | ||
| 86 | #define SR_TABLE(len) AV_ONCE_INIT, | ||
| 87 | SR_POW2_TABLES | ||
| 88 | #undef SR_TABLE | ||
| 89 | }; | ||
| 90 | |||
| 91 | 235 | static av_cold void TX_TAB(ff_tx_init_tab_53)(void) | |
| 92 | { | ||
| 93 | /* 5pt, doubled to eliminate AVX lane shuffles */ | ||
| 94 | 235 | TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI / 5)); | |
| 95 | 235 | TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI / 5)); | |
| 96 | 235 | TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 10)); | |
| 97 | 235 | TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(2 * M_PI / 10)); | |
| 98 | 235 | TX_TAB(ff_tx_tab_53)[4] = RESCALE(sin(2 * M_PI / 5)); | |
| 99 | 235 | TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(2 * M_PI / 5)); | |
| 100 | 235 | TX_TAB(ff_tx_tab_53)[6] = RESCALE(sin(2 * M_PI / 10)); | |
| 101 | 235 | TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(2 * M_PI / 10)); | |
| 102 | |||
| 103 | /* 3pt */ | ||
| 104 | 235 | TX_TAB(ff_tx_tab_53)[ 8] = RESCALE(cos(2 * M_PI / 12)); | |
| 105 | 235 | TX_TAB(ff_tx_tab_53)[ 9] = RESCALE(cos(2 * M_PI / 12)); | |
| 106 | 235 | TX_TAB(ff_tx_tab_53)[10] = RESCALE(cos(2 * M_PI / 6)); | |
| 107 | 235 | TX_TAB(ff_tx_tab_53)[11] = RESCALE(cos(8 * M_PI / 6)); | |
| 108 | 235 | } | |
| 109 | |||
| 110 | 5 | static av_cold void TX_TAB(ff_tx_init_tab_7)(void) | |
| 111 | { | ||
| 112 | 5 | TX_TAB(ff_tx_tab_7)[0] = RESCALE(cos(2 * M_PI / 7)); | |
| 113 | 5 | TX_TAB(ff_tx_tab_7)[1] = RESCALE(sin(2 * M_PI / 7)); | |
| 114 | 5 | TX_TAB(ff_tx_tab_7)[2] = RESCALE(sin(2 * M_PI / 28)); | |
| 115 | 5 | TX_TAB(ff_tx_tab_7)[3] = RESCALE(cos(2 * M_PI / 28)); | |
| 116 | 5 | TX_TAB(ff_tx_tab_7)[4] = RESCALE(cos(2 * M_PI / 14)); | |
| 117 | 5 | TX_TAB(ff_tx_tab_7)[5] = RESCALE(sin(2 * M_PI / 14)); | |
| 118 | 5 | } | |
| 119 | |||
| 120 | 5 | static av_cold void TX_TAB(ff_tx_init_tab_9)(void) | |
| 121 | { | ||
| 122 | 5 | TX_TAB(ff_tx_tab_9)[0] = RESCALE(cos(2 * M_PI / 3)); | |
| 123 | 5 | TX_TAB(ff_tx_tab_9)[1] = RESCALE(sin(2 * M_PI / 3)); | |
| 124 | 5 | TX_TAB(ff_tx_tab_9)[2] = RESCALE(cos(2 * M_PI / 9)); | |
| 125 | 5 | TX_TAB(ff_tx_tab_9)[3] = RESCALE(sin(2 * M_PI / 9)); | |
| 126 | 5 | TX_TAB(ff_tx_tab_9)[4] = RESCALE(cos(2 * M_PI / 36)); | |
| 127 | 5 | TX_TAB(ff_tx_tab_9)[5] = RESCALE(sin(2 * M_PI / 36)); | |
| 128 | 5 | TX_TAB(ff_tx_tab_9)[6] = TX_TAB(ff_tx_tab_9)[2] + TX_TAB(ff_tx_tab_9)[5]; | |
| 129 | 5 | TX_TAB(ff_tx_tab_9)[7] = TX_TAB(ff_tx_tab_9)[3] - TX_TAB(ff_tx_tab_9)[4]; | |
| 130 | 5 | } | |
| 131 | |||
| 132 | static const FFTabInitData nptwo_tabs_init_data[] = { | ||
| 133 | { TX_TAB(ff_tx_init_tab_53), { 15, 5, 3 } }, | ||
| 134 | { TX_TAB(ff_tx_init_tab_9), { 9 } }, | ||
| 135 | { TX_TAB(ff_tx_init_tab_7), { 7 } }, | ||
| 136 | }; | ||
| 137 | |||
| 138 | static AVOnce nptwo_tabs_init_once[] = { | ||
| 139 | AV_ONCE_INIT, | ||
| 140 | AV_ONCE_INIT, | ||
| 141 | AV_ONCE_INIT, | ||
| 142 | }; | ||
| 143 | |||
| 144 | 7781 | av_cold void TX_TAB(ff_tx_init_tabs)(int len) | |
| 145 | { | ||
| 146 | 7781 | int factor_2 = ff_ctz(len); | |
| 147 |
2/2✓ Branch 0 taken 5544 times.
✓ Branch 1 taken 2237 times.
|
7781 | if (factor_2) { |
| 148 | 5544 | int idx = factor_2 - 3; | |
| 149 |
2/2✓ Branch 0 taken 20181 times.
✓ Branch 1 taken 5544 times.
|
25725 | for (int i = 0; i <= idx; i++) |
| 150 | 20181 | ff_thread_once(&sr_tabs_init_once[i], | |
| 151 | sr_tabs_init_funcs[i]); | ||
| 152 | 5544 | len >>= factor_2; | |
| 153 | } | ||
| 154 | |||
| 155 |
2/2✓ Branch 0 taken 10034 times.
✓ Branch 1 taken 8 times.
|
10042 | for (int i = 0; i < FF_ARRAY_ELEMS(nptwo_tabs_init_data); i++) { |
| 156 | 10034 | int f, f_idx = 0; | |
| 157 | |||
| 158 |
2/2✓ Branch 0 taken 7773 times.
✓ Branch 1 taken 2261 times.
|
10034 | if (len <= 1) |
| 159 | 7773 | return; | |
| 160 | |||
| 161 |
2/2✓ Branch 0 taken 2293 times.
✓ Branch 1 taken 24 times.
|
2317 | while ((f = nptwo_tabs_init_data[i].factors[f_idx++])) { |
| 162 |
2/2✓ Branch 0 taken 56 times.
✓ Branch 1 taken 2237 times.
|
2293 | if (f % len) |
| 163 | 56 | continue; | |
| 164 | |||
| 165 | 2237 | ff_thread_once(&nptwo_tabs_init_once[i], | |
| 166 | nptwo_tabs_init_data[i].func); | ||
| 167 | 2237 | len /= f; | |
| 168 | 2237 | break; | |
| 169 | } | ||
| 170 | } | ||
| 171 | } | ||
| 172 | |||
| 173 | 4466460 | static av_always_inline void fft3(TXComplex *out, TXComplex *in, | |
| 174 | ptrdiff_t stride) | ||
| 175 | { | ||
| 176 | TXComplex tmp[3]; | ||
| 177 | 4466460 | const TXSample *tab = TX_TAB(ff_tx_tab_53); | |
| 178 | #ifdef TX_INT32 | ||
| 179 | int64_t mtmp[4]; | ||
| 180 | #endif | ||
| 181 | |||
| 182 | 4466460 | tmp[0] = in[0]; | |
| 183 | 4466460 | BF(tmp[1].re, tmp[2].im, in[1].im, in[2].im); | |
| 184 | 4466460 | BF(tmp[1].im, tmp[2].re, in[1].re, in[2].re); | |
| 185 | |||
| 186 | #ifdef TX_INT32 | ||
| 187 | ✗ | out[0*stride].re = (int64_t)tmp[0].re + tmp[2].re; | |
| 188 | ✗ | out[0*stride].im = (int64_t)tmp[0].im + tmp[2].im; | |
| 189 | ✗ | mtmp[0] = (int64_t)tab[ 8] * tmp[1].re; | |
| 190 | ✗ | mtmp[1] = (int64_t)tab[ 9] * tmp[1].im; | |
| 191 | ✗ | mtmp[2] = (int64_t)tab[10] * tmp[2].re; | |
| 192 | ✗ | mtmp[3] = (int64_t)tab[10] * tmp[2].im; | |
| 193 | ✗ | out[1*stride].re = tmp[0].re - (mtmp[2] + mtmp[0] + 0x40000000 >> 31); | |
| 194 | ✗ | out[1*stride].im = tmp[0].im - (mtmp[3] - mtmp[1] + 0x40000000 >> 31); | |
| 195 | ✗ | out[2*stride].re = tmp[0].re - (mtmp[2] - mtmp[0] + 0x40000000 >> 31); | |
| 196 | ✗ | out[2*stride].im = tmp[0].im - (mtmp[3] + mtmp[1] + 0x40000000 >> 31); | |
| 197 | #else | ||
| 198 | 4466460 | out[0*stride].re = tmp[0].re + tmp[2].re; | |
| 199 | 4466460 | out[0*stride].im = tmp[0].im + tmp[2].im; | |
| 200 | 4466460 | tmp[1].re = tab[ 8] * tmp[1].re; | |
| 201 | 4466460 | tmp[1].im = tab[ 9] * tmp[1].im; | |
| 202 | 4466460 | tmp[2].re = tab[10] * tmp[2].re; | |
| 203 | 4466460 | tmp[2].im = tab[10] * tmp[2].im; | |
| 204 | 4466460 | out[1*stride].re = tmp[0].re - tmp[2].re + tmp[1].re; | |
| 205 | 4466460 | out[1*stride].im = tmp[0].im - tmp[2].im - tmp[1].im; | |
| 206 | 4466460 | out[2*stride].re = tmp[0].re - tmp[2].re - tmp[1].re; | |
| 207 | 4466460 | out[2*stride].im = tmp[0].im - tmp[2].im + tmp[1].im; | |
| 208 | #endif | ||
| 209 | 4466460 | } | |
| 210 | |||
| 211 | #define DECL_FFT5(NAME, D0, D1, D2, D3, D4) \ | ||
| 212 | static av_always_inline void NAME(TXComplex *out, TXComplex *in, \ | ||
| 213 | ptrdiff_t stride) \ | ||
| 214 | { \ | ||
| 215 | TXComplex dc, z0[4], t[6]; \ | ||
| 216 | const TXSample *tab = TX_TAB(ff_tx_tab_53); \ | ||
| 217 | \ | ||
| 218 | dc = in[0]; \ | ||
| 219 | BF(t[1].im, t[0].re, in[1].re, in[4].re); \ | ||
| 220 | BF(t[1].re, t[0].im, in[1].im, in[4].im); \ | ||
| 221 | BF(t[3].im, t[2].re, in[2].re, in[3].re); \ | ||
| 222 | BF(t[3].re, t[2].im, in[2].im, in[3].im); \ | ||
| 223 | \ | ||
| 224 | out[D0*stride].re = dc.re + (TXUSample)t[0].re + t[2].re; \ | ||
| 225 | out[D0*stride].im = dc.im + (TXUSample)t[0].im + t[2].im; \ | ||
| 226 | \ | ||
| 227 | SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re); \ | ||
| 228 | SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im); \ | ||
| 229 | CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re); \ | ||
| 230 | CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im); \ | ||
| 231 | \ | ||
| 232 | BF(z0[0].re, z0[3].re, t[0].re, t[1].re); \ | ||
| 233 | BF(z0[0].im, z0[3].im, t[0].im, t[1].im); \ | ||
| 234 | BF(z0[2].re, z0[1].re, t[4].re, t[5].re); \ | ||
| 235 | BF(z0[2].im, z0[1].im, t[4].im, t[5].im); \ | ||
| 236 | \ | ||
| 237 | out[D1*stride].re = dc.re + (TXUSample)z0[3].re; \ | ||
| 238 | out[D1*stride].im = dc.im + (TXUSample)z0[0].im; \ | ||
| 239 | out[D2*stride].re = dc.re + (TXUSample)z0[2].re; \ | ||
| 240 | out[D2*stride].im = dc.im + (TXUSample)z0[1].im; \ | ||
| 241 | out[D3*stride].re = dc.re + (TXUSample)z0[1].re; \ | ||
| 242 | out[D3*stride].im = dc.im + (TXUSample)z0[2].im; \ | ||
| 243 | out[D4*stride].re = dc.re + (TXUSample)z0[0].re; \ | ||
| 244 | out[D4*stride].im = dc.im + (TXUSample)z0[3].im; \ | ||
| 245 | } | ||
| 246 | |||
| 247 | 79542 | DECL_FFT5(fft5, 0, 1, 2, 3, 4) | |
| 248 | 893292 | DECL_FFT5(fft5_m1, 0, 6, 12, 3, 9) | |
| 249 | 893292 | DECL_FFT5(fft5_m2, 10, 1, 7, 13, 4) | |
| 250 | 893292 | DECL_FFT5(fft5_m3, 5, 11, 2, 8, 14) | |
| 251 | |||
| 252 | 50526 | static av_always_inline void fft7(TXComplex *out, TXComplex *in, | |
| 253 | ptrdiff_t stride) | ||
| 254 | { | ||
| 255 | TXComplex dc, t[6], z[3]; | ||
| 256 | 50526 | const TXComplex *tab = (const TXComplex *)TX_TAB(ff_tx_tab_7); | |
| 257 | #ifdef TX_INT32 | ||
| 258 | int64_t mtmp[12]; | ||
| 259 | #endif | ||
| 260 | |||
| 261 | 50526 | dc = in[0]; | |
| 262 | 50526 | BF(t[1].re, t[0].re, in[1].re, in[6].re); | |
| 263 | 50526 | BF(t[1].im, t[0].im, in[1].im, in[6].im); | |
| 264 | 50526 | BF(t[3].re, t[2].re, in[2].re, in[5].re); | |
| 265 | 50526 | BF(t[3].im, t[2].im, in[2].im, in[5].im); | |
| 266 | 50526 | BF(t[5].re, t[4].re, in[3].re, in[4].re); | |
| 267 | 50526 | BF(t[5].im, t[4].im, in[3].im, in[4].im); | |
| 268 | |||
| 269 | 50526 | out[0*stride].re = dc.re + t[0].re + t[2].re + t[4].re; | |
| 270 | 50526 | out[0*stride].im = dc.im + t[0].im + t[2].im + t[4].im; | |
| 271 | |||
| 272 | #ifdef TX_INT32 /* NOTE: it's possible to do this with 16 mults but 72 adds */ | ||
| 273 | ✗ | mtmp[ 0] = ((int64_t)tab[0].re)*t[0].re - ((int64_t)tab[2].re)*t[4].re; | |
| 274 | ✗ | mtmp[ 1] = ((int64_t)tab[0].re)*t[4].re - ((int64_t)tab[1].re)*t[0].re; | |
| 275 | ✗ | mtmp[ 2] = ((int64_t)tab[0].re)*t[2].re - ((int64_t)tab[2].re)*t[0].re; | |
| 276 | ✗ | mtmp[ 3] = ((int64_t)tab[0].re)*t[0].im - ((int64_t)tab[1].re)*t[2].im; | |
| 277 | ✗ | mtmp[ 4] = ((int64_t)tab[0].re)*t[4].im - ((int64_t)tab[1].re)*t[0].im; | |
| 278 | ✗ | mtmp[ 5] = ((int64_t)tab[0].re)*t[2].im - ((int64_t)tab[2].re)*t[0].im; | |
| 279 | |||
| 280 | ✗ | mtmp[ 6] = ((int64_t)tab[2].im)*t[1].im + ((int64_t)tab[1].im)*t[5].im; | |
| 281 | ✗ | mtmp[ 7] = ((int64_t)tab[0].im)*t[5].im + ((int64_t)tab[2].im)*t[3].im; | |
| 282 | ✗ | mtmp[ 8] = ((int64_t)tab[2].im)*t[5].im + ((int64_t)tab[1].im)*t[3].im; | |
| 283 | ✗ | mtmp[ 9] = ((int64_t)tab[0].im)*t[1].re + ((int64_t)tab[1].im)*t[3].re; | |
| 284 | ✗ | mtmp[10] = ((int64_t)tab[2].im)*t[3].re + ((int64_t)tab[0].im)*t[5].re; | |
| 285 | ✗ | mtmp[11] = ((int64_t)tab[2].im)*t[1].re + ((int64_t)tab[1].im)*t[5].re; | |
| 286 | |||
| 287 | ✗ | z[0].re = (int32_t)(mtmp[ 0] - ((int64_t)tab[1].re)*t[2].re + 0x40000000 >> 31); | |
| 288 | ✗ | z[1].re = (int32_t)(mtmp[ 1] - ((int64_t)tab[2].re)*t[2].re + 0x40000000 >> 31); | |
| 289 | ✗ | z[2].re = (int32_t)(mtmp[ 2] - ((int64_t)tab[1].re)*t[4].re + 0x40000000 >> 31); | |
| 290 | ✗ | z[0].im = (int32_t)(mtmp[ 3] - ((int64_t)tab[2].re)*t[4].im + 0x40000000 >> 31); | |
| 291 | ✗ | z[1].im = (int32_t)(mtmp[ 4] - ((int64_t)tab[2].re)*t[2].im + 0x40000000 >> 31); | |
| 292 | ✗ | z[2].im = (int32_t)(mtmp[ 5] - ((int64_t)tab[1].re)*t[4].im + 0x40000000 >> 31); | |
| 293 | |||
| 294 | ✗ | t[0].re = (int32_t)(mtmp[ 6] - ((int64_t)tab[0].im)*t[3].im + 0x40000000 >> 31); | |
| 295 | ✗ | t[2].re = (int32_t)(mtmp[ 7] - ((int64_t)tab[1].im)*t[1].im + 0x40000000 >> 31); | |
| 296 | ✗ | t[4].re = (int32_t)(mtmp[ 8] + ((int64_t)tab[0].im)*t[1].im + 0x40000000 >> 31); | |
| 297 | ✗ | t[0].im = (int32_t)(mtmp[ 9] + ((int64_t)tab[2].im)*t[5].re + 0x40000000 >> 31); | |
| 298 | ✗ | t[2].im = (int32_t)(mtmp[10] - ((int64_t)tab[1].im)*t[1].re + 0x40000000 >> 31); | |
| 299 | ✗ | t[4].im = (int32_t)(mtmp[11] - ((int64_t)tab[0].im)*t[3].re + 0x40000000 >> 31); | |
| 300 | #else | ||
| 301 | 50526 | z[0].re = tab[0].re*t[0].re - tab[2].re*t[4].re - tab[1].re*t[2].re; | |
| 302 | 50526 | z[1].re = tab[0].re*t[4].re - tab[1].re*t[0].re - tab[2].re*t[2].re; | |
| 303 | 50526 | z[2].re = tab[0].re*t[2].re - tab[2].re*t[0].re - tab[1].re*t[4].re; | |
| 304 | 50526 | z[0].im = tab[0].re*t[0].im - tab[1].re*t[2].im - tab[2].re*t[4].im; | |
| 305 | 50526 | z[1].im = tab[0].re*t[4].im - tab[1].re*t[0].im - tab[2].re*t[2].im; | |
| 306 | 50526 | z[2].im = tab[0].re*t[2].im - tab[2].re*t[0].im - tab[1].re*t[4].im; | |
| 307 | |||
| 308 | /* It's possible to do t[4].re and t[0].im with 2 multiplies only by | ||
| 309 | * multiplying the sum of all with the average of the twiddles */ | ||
| 310 | |||
| 311 | 50526 | t[0].re = tab[2].im*t[1].im + tab[1].im*t[5].im - tab[0].im*t[3].im; | |
| 312 | 50526 | t[2].re = tab[0].im*t[5].im + tab[2].im*t[3].im - tab[1].im*t[1].im; | |
| 313 | 50526 | t[4].re = tab[2].im*t[5].im + tab[1].im*t[3].im + tab[0].im*t[1].im; | |
| 314 | 50526 | t[0].im = tab[0].im*t[1].re + tab[1].im*t[3].re + tab[2].im*t[5].re; | |
| 315 | 50526 | t[2].im = tab[2].im*t[3].re + tab[0].im*t[5].re - tab[1].im*t[1].re; | |
| 316 | 50526 | t[4].im = tab[2].im*t[1].re + tab[1].im*t[5].re - tab[0].im*t[3].re; | |
| 317 | #endif | ||
| 318 | |||
| 319 | 50526 | BF(t[1].re, z[0].re, z[0].re, t[4].re); | |
| 320 | 50526 | BF(t[3].re, z[1].re, z[1].re, t[2].re); | |
| 321 | 50526 | BF(t[5].re, z[2].re, z[2].re, t[0].re); | |
| 322 | 50526 | BF(t[1].im, z[0].im, z[0].im, t[0].im); | |
| 323 | 50526 | BF(t[3].im, z[1].im, z[1].im, t[2].im); | |
| 324 | 50526 | BF(t[5].im, z[2].im, z[2].im, t[4].im); | |
| 325 | |||
| 326 | 50526 | out[1*stride].re = dc.re + z[0].re; | |
| 327 | 50526 | out[1*stride].im = dc.im + t[1].im; | |
| 328 | 50526 | out[2*stride].re = dc.re + t[3].re; | |
| 329 | 50526 | out[2*stride].im = dc.im + z[1].im; | |
| 330 | 50526 | out[3*stride].re = dc.re + z[2].re; | |
| 331 | 50526 | out[3*stride].im = dc.im + t[5].im; | |
| 332 | 50526 | out[4*stride].re = dc.re + t[5].re; | |
| 333 | 50526 | out[4*stride].im = dc.im + z[2].im; | |
| 334 | 50526 | out[5*stride].re = dc.re + z[1].re; | |
| 335 | 50526 | out[5*stride].im = dc.im + t[3].im; | |
| 336 | 50526 | out[6*stride].re = dc.re + t[1].re; | |
| 337 | 50526 | out[6*stride].im = dc.im + z[0].im; | |
| 338 | 50526 | } | |
| 339 | |||
| 340 | 39298 | static av_always_inline void fft9(TXComplex *out, TXComplex *in, | |
| 341 | ptrdiff_t stride) | ||
| 342 | { | ||
| 343 | 39298 | const TXComplex *tab = (const TXComplex *)TX_TAB(ff_tx_tab_9); | |
| 344 | TXComplex dc, t[16], w[4], x[5], y[5], z[2]; | ||
| 345 | #ifdef TX_INT32 | ||
| 346 | int64_t mtmp[12]; | ||
| 347 | #endif | ||
| 348 | |||
| 349 | 39298 | dc = in[0]; | |
| 350 | 39298 | BF(t[1].re, t[0].re, in[1].re, in[8].re); | |
| 351 | 39298 | BF(t[1].im, t[0].im, in[1].im, in[8].im); | |
| 352 | 39298 | BF(t[3].re, t[2].re, in[2].re, in[7].re); | |
| 353 | 39298 | BF(t[3].im, t[2].im, in[2].im, in[7].im); | |
| 354 | 39298 | BF(t[5].re, t[4].re, in[3].re, in[6].re); | |
| 355 | 39298 | BF(t[5].im, t[4].im, in[3].im, in[6].im); | |
| 356 | 39298 | BF(t[7].re, t[6].re, in[4].re, in[5].re); | |
| 357 | 39298 | BF(t[7].im, t[6].im, in[4].im, in[5].im); | |
| 358 | |||
| 359 | 39298 | w[0].re = t[0].re - t[6].re; | |
| 360 | 39298 | w[0].im = t[0].im - t[6].im; | |
| 361 | 39298 | w[1].re = t[2].re - t[6].re; | |
| 362 | 39298 | w[1].im = t[2].im - t[6].im; | |
| 363 | 39298 | w[2].re = t[1].re - t[7].re; | |
| 364 | 39298 | w[2].im = t[1].im - t[7].im; | |
| 365 | 39298 | w[3].re = t[3].re + t[7].re; | |
| 366 | 39298 | w[3].im = t[3].im + t[7].im; | |
| 367 | |||
| 368 | 39298 | z[0].re = dc.re + t[4].re; | |
| 369 | 39298 | z[0].im = dc.im + t[4].im; | |
| 370 | |||
| 371 | 39298 | z[1].re = t[0].re + t[2].re + t[6].re; | |
| 372 | 39298 | z[1].im = t[0].im + t[2].im + t[6].im; | |
| 373 | |||
| 374 | 39298 | out[0*stride].re = z[0].re + z[1].re; | |
| 375 | 39298 | out[0*stride].im = z[0].im + z[1].im; | |
| 376 | |||
| 377 | #ifdef TX_INT32 | ||
| 378 | ✗ | mtmp[0] = t[1].re - t[3].re + t[7].re; | |
| 379 | ✗ | mtmp[1] = t[1].im - t[3].im + t[7].im; | |
| 380 | |||
| 381 | ✗ | y[3].re = (int32_t)(((int64_t)tab[0].im)*mtmp[0] + 0x40000000 >> 31); | |
| 382 | ✗ | y[3].im = (int32_t)(((int64_t)tab[0].im)*mtmp[1] + 0x40000000 >> 31); | |
| 383 | |||
| 384 | ✗ | mtmp[0] = (int32_t)(((int64_t)tab[0].re)*z[1].re + 0x40000000 >> 31); | |
| 385 | ✗ | mtmp[1] = (int32_t)(((int64_t)tab[0].re)*z[1].im + 0x40000000 >> 31); | |
| 386 | ✗ | mtmp[2] = (int32_t)(((int64_t)tab[0].re)*t[4].re + 0x40000000 >> 31); | |
| 387 | ✗ | mtmp[3] = (int32_t)(((int64_t)tab[0].re)*t[4].im + 0x40000000 >> 31); | |
| 388 | |||
| 389 | ✗ | x[3].re = z[0].re + (int32_t)mtmp[0]; | |
| 390 | ✗ | x[3].im = z[0].im + (int32_t)mtmp[1]; | |
| 391 | ✗ | z[0].re = in[0].re + (int32_t)mtmp[2]; | |
| 392 | ✗ | z[0].im = in[0].im + (int32_t)mtmp[3]; | |
| 393 | |||
| 394 | ✗ | mtmp[0] = ((int64_t)tab[1].re)*w[0].re; | |
| 395 | ✗ | mtmp[1] = ((int64_t)tab[1].re)*w[0].im; | |
| 396 | ✗ | mtmp[2] = ((int64_t)tab[2].im)*w[0].re; | |
| 397 | ✗ | mtmp[3] = ((int64_t)tab[2].im)*w[0].im; | |
| 398 | ✗ | mtmp[4] = ((int64_t)tab[1].im)*w[2].re; | |
| 399 | ✗ | mtmp[5] = ((int64_t)tab[1].im)*w[2].im; | |
| 400 | ✗ | mtmp[6] = ((int64_t)tab[2].re)*w[2].re; | |
| 401 | ✗ | mtmp[7] = ((int64_t)tab[2].re)*w[2].im; | |
| 402 | |||
| 403 | ✗ | x[1].re = (int32_t)(mtmp[0] + ((int64_t)tab[2].im)*w[1].re + 0x40000000 >> 31); | |
| 404 | ✗ | x[1].im = (int32_t)(mtmp[1] + ((int64_t)tab[2].im)*w[1].im + 0x40000000 >> 31); | |
| 405 | ✗ | x[2].re = (int32_t)(mtmp[2] - ((int64_t)tab[3].re)*w[1].re + 0x40000000 >> 31); | |
| 406 | ✗ | x[2].im = (int32_t)(mtmp[3] - ((int64_t)tab[3].re)*w[1].im + 0x40000000 >> 31); | |
| 407 | ✗ | y[1].re = (int32_t)(mtmp[4] + ((int64_t)tab[2].re)*w[3].re + 0x40000000 >> 31); | |
| 408 | ✗ | y[1].im = (int32_t)(mtmp[5] + ((int64_t)tab[2].re)*w[3].im + 0x40000000 >> 31); | |
| 409 | ✗ | y[2].re = (int32_t)(mtmp[6] - ((int64_t)tab[3].im)*w[3].re + 0x40000000 >> 31); | |
| 410 | ✗ | y[2].im = (int32_t)(mtmp[7] - ((int64_t)tab[3].im)*w[3].im + 0x40000000 >> 31); | |
| 411 | |||
| 412 | ✗ | y[0].re = (int32_t)(((int64_t)tab[0].im)*t[5].re + 0x40000000 >> 31); | |
| 413 | ✗ | y[0].im = (int32_t)(((int64_t)tab[0].im)*t[5].im + 0x40000000 >> 31); | |
| 414 | |||
| 415 | #else | ||
| 416 | 39298 | y[3].re = tab[0].im*(t[1].re - t[3].re + t[7].re); | |
| 417 | 39298 | y[3].im = tab[0].im*(t[1].im - t[3].im + t[7].im); | |
| 418 | |||
| 419 | 39298 | x[3].re = z[0].re + tab[0].re*z[1].re; | |
| 420 | 39298 | x[3].im = z[0].im + tab[0].re*z[1].im; | |
| 421 | 39298 | z[0].re = dc.re + tab[0].re*t[4].re; | |
| 422 | 39298 | z[0].im = dc.im + tab[0].re*t[4].im; | |
| 423 | |||
| 424 | 39298 | x[1].re = tab[1].re*w[0].re + tab[2].im*w[1].re; | |
| 425 | 39298 | x[1].im = tab[1].re*w[0].im + tab[2].im*w[1].im; | |
| 426 | 39298 | x[2].re = tab[2].im*w[0].re - tab[3].re*w[1].re; | |
| 427 | 39298 | x[2].im = tab[2].im*w[0].im - tab[3].re*w[1].im; | |
| 428 | 39298 | y[1].re = tab[1].im*w[2].re + tab[2].re*w[3].re; | |
| 429 | 39298 | y[1].im = tab[1].im*w[2].im + tab[2].re*w[3].im; | |
| 430 | 39298 | y[2].re = tab[2].re*w[2].re - tab[3].im*w[3].re; | |
| 431 | 39298 | y[2].im = tab[2].re*w[2].im - tab[3].im*w[3].im; | |
| 432 | |||
| 433 | 39298 | y[0].re = tab[0].im*t[5].re; | |
| 434 | 39298 | y[0].im = tab[0].im*t[5].im; | |
| 435 | #endif | ||
| 436 | |||
| 437 | 39298 | x[4].re = x[1].re + x[2].re; | |
| 438 | 39298 | x[4].im = x[1].im + x[2].im; | |
| 439 | |||
| 440 | 39298 | y[4].re = y[1].re - y[2].re; | |
| 441 | 39298 | y[4].im = y[1].im - y[2].im; | |
| 442 | 39298 | x[1].re = z[0].re + x[1].re; | |
| 443 | 39298 | x[1].im = z[0].im + x[1].im; | |
| 444 | 39298 | y[1].re = y[0].re + y[1].re; | |
| 445 | 39298 | y[1].im = y[0].im + y[1].im; | |
| 446 | 39298 | x[2].re = z[0].re + x[2].re; | |
| 447 | 39298 | x[2].im = z[0].im + x[2].im; | |
| 448 | 39298 | y[2].re = y[2].re - y[0].re; | |
| 449 | 39298 | y[2].im = y[2].im - y[0].im; | |
| 450 | 39298 | x[4].re = z[0].re - x[4].re; | |
| 451 | 39298 | x[4].im = z[0].im - x[4].im; | |
| 452 | 39298 | y[4].re = y[0].re - y[4].re; | |
| 453 | 39298 | y[4].im = y[0].im - y[4].im; | |
| 454 | |||
| 455 | 39298 | out[1*stride] = (TXComplex){ x[1].re + y[1].im, x[1].im - y[1].re }; | |
| 456 | 39298 | out[2*stride] = (TXComplex){ x[2].re + y[2].im, x[2].im - y[2].re }; | |
| 457 | 39298 | out[3*stride] = (TXComplex){ x[3].re + y[3].im, x[3].im - y[3].re }; | |
| 458 | 39298 | out[4*stride] = (TXComplex){ x[4].re + y[4].im, x[4].im - y[4].re }; | |
| 459 | 39298 | out[5*stride] = (TXComplex){ x[4].re - y[4].im, x[4].im + y[4].re }; | |
| 460 | 39298 | out[6*stride] = (TXComplex){ x[3].re - y[3].im, x[3].im + y[3].re }; | |
| 461 | 39298 | out[7*stride] = (TXComplex){ x[2].re - y[2].im, x[2].im + y[2].re }; | |
| 462 | 39298 | out[8*stride] = (TXComplex){ x[1].re - y[1].im, x[1].im + y[1].re }; | |
| 463 | 39298 | } | |
| 464 | |||
| 465 | 893292 | static av_always_inline void fft15(TXComplex *out, TXComplex *in, | |
| 466 | ptrdiff_t stride) | ||
| 467 | { | ||
| 468 | TXComplex tmp[15]; | ||
| 469 | |||
| 470 |
2/2✓ Branch 0 taken 4466460 times.
✓ Branch 1 taken 893292 times.
|
5359752 | for (int i = 0; i < 5; i++) |
| 471 | 4466460 | fft3(tmp + i, in + i*3, 5); | |
| 472 | |||
| 473 | 893292 | fft5_m1(out, tmp + 0, stride); | |
| 474 | 893292 | fft5_m2(out, tmp + 5, stride); | |
| 475 | 893292 | fft5_m3(out, tmp + 10, stride); | |
| 476 | 893292 | } | |
| 477 | |||
| 478 | 105 | static av_cold int TX_NAME(ff_tx_fft_factor_init)(AVTXContext *s, | |
| 479 | const FFTXCodelet *cd, | ||
| 480 | uint64_t flags, | ||
| 481 | FFTXCodeletOptions *opts, | ||
| 482 | int len, int inv, | ||
| 483 | const void *scale) | ||
| 484 | { | ||
| 485 | 105 | int ret = 0; | |
| 486 | 105 | TX_TAB(ff_tx_init_tabs)(len); | |
| 487 | |||
| 488 |
2/2✓ Branch 0 taken 81 times.
✓ Branch 1 taken 24 times.
|
105 | if (len == 15) |
| 489 | 81 | ret = ff_tx_gen_pfa_input_map(s, opts, 3, 5); | |
| 490 |
1/2✓ Branch 0 taken 24 times.
✗ Branch 1 not taken.
|
24 | else if (flags & FF_TX_PRESHUFFLE) |
| 491 | 24 | ret = ff_tx_gen_default_map(s, opts); | |
| 492 | |||
| 493 | 105 | return ret; | |
| 494 | } | ||
| 495 | |||
| 496 | #define DECL_FACTOR_S(n) \ | ||
| 497 | static void TX_NAME(ff_tx_fft##n)(AVTXContext *s, void *dst, \ | ||
| 498 | void *src, ptrdiff_t stride) \ | ||
| 499 | { \ | ||
| 500 | fft##n((TXComplex *)dst, (TXComplex *)src, stride / sizeof(TXComplex)); \ | ||
| 501 | } \ | ||
| 502 | static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = { \ | ||
| 503 | .name = TX_NAME_STR("fft" #n "_ns"), \ | ||
| 504 | .function = TX_NAME(ff_tx_fft##n), \ | ||
| 505 | .type = TX_TYPE(FFT), \ | ||
| 506 | .flags = AV_TX_INPLACE | FF_TX_OUT_OF_PLACE | \ | ||
| 507 | AV_TX_UNALIGNED | FF_TX_PRESHUFFLE, \ | ||
| 508 | .factors[0] = n, \ | ||
| 509 | .nb_factors = 1, \ | ||
| 510 | .min_len = n, \ | ||
| 511 | .max_len = n, \ | ||
| 512 | .init = TX_NAME(ff_tx_fft_factor_init), \ | ||
| 513 | .cpu_flags = FF_TX_CPU_FLAGS_ALL, \ | ||
| 514 | .prio = FF_TX_PRIO_BASE, \ | ||
| 515 | }; | ||
| 516 | |||
| 517 | #define DECL_FACTOR_F(n) \ | ||
| 518 | DECL_FACTOR_S(n) \ | ||
| 519 | static const FFTXCodelet TX_NAME(ff_tx_fft##n##_fwd_def) = { \ | ||
| 520 | .name = TX_NAME_STR("fft" #n "_fwd"), \ | ||
| 521 | .function = TX_NAME(ff_tx_fft##n), \ | ||
| 522 | .type = TX_TYPE(FFT), \ | ||
| 523 | .flags = AV_TX_INPLACE | FF_TX_OUT_OF_PLACE | \ | ||
| 524 | AV_TX_UNALIGNED | FF_TX_FORWARD_ONLY, \ | ||
| 525 | .factors[0] = n, \ | ||
| 526 | .nb_factors = 1, \ | ||
| 527 | .min_len = n, \ | ||
| 528 | .max_len = n, \ | ||
| 529 | .init = TX_NAME(ff_tx_fft_factor_init), \ | ||
| 530 | .cpu_flags = FF_TX_CPU_FLAGS_ALL, \ | ||
| 531 | .prio = FF_TX_PRIO_BASE, \ | ||
| 532 | }; | ||
| 533 | |||
| 534 | ✗ | DECL_FACTOR_F(3) | |
| 535 | 72982 | DECL_FACTOR_F(5) | |
| 536 | 50526 | DECL_FACTOR_F(7) | |
| 537 | 39298 | DECL_FACTOR_F(9) | |
| 538 | 1000 | DECL_FACTOR_S(15) | |
| 539 | |||
| 540 | #define BUTTERFLIES(a0, a1, a2, a3) \ | ||
| 541 | do { \ | ||
| 542 | r0=a0.re; \ | ||
| 543 | i0=a0.im; \ | ||
| 544 | r1=a1.re; \ | ||
| 545 | i1=a1.im; \ | ||
| 546 | BF(t3, t5, t5, t1); \ | ||
| 547 | BF(a2.re, a0.re, r0, t5); \ | ||
| 548 | BF(a3.im, a1.im, i1, t3); \ | ||
| 549 | BF(t4, t6, t2, t6); \ | ||
| 550 | BF(a3.re, a1.re, r1, t4); \ | ||
| 551 | BF(a2.im, a0.im, i0, t6); \ | ||
| 552 | } while (0) | ||
| 553 | |||
| 554 | #define TRANSFORM(a0, a1, a2, a3, wre, wim) \ | ||
| 555 | do { \ | ||
| 556 | CMUL(t1, t2, a2.re, a2.im, wre, -wim); \ | ||
| 557 | CMUL(t5, t6, a3.re, a3.im, wre, wim); \ | ||
| 558 | BUTTERFLIES(a0, a1, a2, a3); \ | ||
| 559 | } while (0) | ||
| 560 | |||
| 561 | /* z[0...8n-1], w[1...2n-1] */ | ||
| 562 | 5102419 | static inline void TX_NAME(ff_tx_fft_sr_combine)(TXComplex *z, | |
| 563 | const TXSample *cos, int len) | ||
| 564 | { | ||
| 565 | 5102419 | int o1 = 2*len; | |
| 566 | 5102419 | int o2 = 4*len; | |
| 567 | 5102419 | int o3 = 6*len; | |
| 568 | 5102419 | const TXSample *wim = cos + o1 - 7; | |
| 569 | TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1; | ||
| 570 | |||
| 571 |
2/2✓ Branch 0 taken 10306356 times.
✓ Branch 1 taken 5102419 times.
|
15408775 | for (int i = 0; i < len; i += 4) { |
| 572 | 10306356 | TRANSFORM(z[0], z[o1 + 0], z[o2 + 0], z[o3 + 0], cos[0], wim[7]); | |
| 573 | 10306356 | TRANSFORM(z[2], z[o1 + 2], z[o2 + 2], z[o3 + 2], cos[2], wim[5]); | |
| 574 | 10306356 | TRANSFORM(z[4], z[o1 + 4], z[o2 + 4], z[o3 + 4], cos[4], wim[3]); | |
| 575 | 10306356 | TRANSFORM(z[6], z[o1 + 6], z[o2 + 6], z[o3 + 6], cos[6], wim[1]); | |
| 576 | |||
| 577 | 10306356 | TRANSFORM(z[1], z[o1 + 1], z[o2 + 1], z[o3 + 1], cos[1], wim[6]); | |
| 578 | 10306356 | TRANSFORM(z[3], z[o1 + 3], z[o2 + 3], z[o3 + 3], cos[3], wim[4]); | |
| 579 | 10306356 | TRANSFORM(z[5], z[o1 + 5], z[o2 + 5], z[o3 + 5], cos[5], wim[2]); | |
| 580 | 10306356 | TRANSFORM(z[7], z[o1 + 7], z[o2 + 7], z[o3 + 7], cos[7], wim[0]); | |
| 581 | |||
| 582 | 10306356 | z += 2*4; | |
| 583 | 10306356 | cos += 2*4; | |
| 584 | 10306356 | wim -= 2*4; | |
| 585 | } | ||
| 586 | 5102419 | } | |
| 587 | |||
| 588 | 4352 | static av_cold int TX_NAME(ff_tx_fft_sr_codelet_init)(AVTXContext *s, | |
| 589 | const FFTXCodelet *cd, | ||
| 590 | uint64_t flags, | ||
| 591 | FFTXCodeletOptions *opts, | ||
| 592 | int len, int inv, | ||
| 593 | const void *scale) | ||
| 594 | { | ||
| 595 | 4352 | TX_TAB(ff_tx_init_tabs)(len); | |
| 596 | 4352 | return ff_tx_gen_ptwo_revtab(s, opts); | |
| 597 | } | ||
| 598 | |||
| 599 | #define DECL_SR_CODELET_DEF(n) \ | ||
| 600 | static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = { \ | ||
| 601 | .name = TX_NAME_STR("fft" #n "_ns"), \ | ||
| 602 | .function = TX_NAME(ff_tx_fft##n##_ns), \ | ||
| 603 | .type = TX_TYPE(FFT), \ | ||
| 604 | .flags = FF_TX_OUT_OF_PLACE | AV_TX_INPLACE | \ | ||
| 605 | AV_TX_UNALIGNED | FF_TX_PRESHUFFLE, \ | ||
| 606 | .factors[0] = 2, \ | ||
| 607 | .nb_factors = 1, \ | ||
| 608 | .min_len = n, \ | ||
| 609 | .max_len = n, \ | ||
| 610 | .init = TX_NAME(ff_tx_fft_sr_codelet_init), \ | ||
| 611 | .cpu_flags = FF_TX_CPU_FLAGS_ALL, \ | ||
| 612 | .prio = FF_TX_PRIO_BASE, \ | ||
| 613 | }; | ||
| 614 | |||
| 615 | #define DECL_SR_CODELET(n, n2, n4) \ | ||
| 616 | static void TX_NAME(ff_tx_fft##n##_ns)(AVTXContext *s, void *_dst, \ | ||
| 617 | void *_src, ptrdiff_t stride) \ | ||
| 618 | { \ | ||
| 619 | TXComplex *src = _src; \ | ||
| 620 | TXComplex *dst = _dst; \ | ||
| 621 | const TXSample *cos = TX_TAB(ff_tx_tab_##n); \ | ||
| 622 | \ | ||
| 623 | TX_NAME(ff_tx_fft##n2##_ns)(s, dst, src, stride); \ | ||
| 624 | TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride); \ | ||
| 625 | TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride); \ | ||
| 626 | TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1); \ | ||
| 627 | } \ | ||
| 628 | \ | ||
| 629 | DECL_SR_CODELET_DEF(n) | ||
| 630 | |||
| 631 | 7 | static void TX_NAME(ff_tx_fft2_ns)(AVTXContext *s, void *_dst, | |
| 632 | void *_src, ptrdiff_t stride) | ||
| 633 | { | ||
| 634 | 7 | TXComplex *src = _src; | |
| 635 | 7 | TXComplex *dst = _dst; | |
| 636 | TXComplex tmp; | ||
| 637 | |||
| 638 | 7 | BF(tmp.re, dst[0].re, src[0].re, src[1].re); | |
| 639 | 7 | BF(tmp.im, dst[0].im, src[0].im, src[1].im); | |
| 640 | 7 | dst[1] = tmp; | |
| 641 | 7 | } | |
| 642 | |||
| 643 | 25111017 | static void TX_NAME(ff_tx_fft4_ns)(AVTXContext *s, void *_dst, | |
| 644 | void *_src, ptrdiff_t stride) | ||
| 645 | { | ||
| 646 | 25111017 | TXComplex *src = _src; | |
| 647 | 25111017 | TXComplex *dst = _dst; | |
| 648 | TXSample t1, t2, t3, t4, t5, t6, t7, t8; | ||
| 649 | |||
| 650 | 25111017 | BF(t3, t1, src[0].re, src[1].re); | |
| 651 | 25111017 | BF(t8, t6, src[3].re, src[2].re); | |
| 652 | 25111017 | BF(dst[2].re, dst[0].re, t1, t6); | |
| 653 | 25111017 | BF(t4, t2, src[0].im, src[1].im); | |
| 654 | 25111017 | BF(t7, t5, src[2].im, src[3].im); | |
| 655 | 25111017 | BF(dst[3].im, dst[1].im, t4, t8); | |
| 656 | 25111017 | BF(dst[3].re, dst[1].re, t3, t7); | |
| 657 | 25111017 | BF(dst[2].im, dst[0].im, t2, t5); | |
| 658 | 25111017 | } | |
| 659 | |||
| 660 | 13088075 | static void TX_NAME(ff_tx_fft8_ns)(AVTXContext *s, void *_dst, | |
| 661 | void *_src, ptrdiff_t stride) | ||
| 662 | { | ||
| 663 | 13088075 | TXComplex *src = _src; | |
| 664 | 13088075 | TXComplex *dst = _dst; | |
| 665 | TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1; | ||
| 666 | 13088075 | const TXSample cos = TX_TAB(ff_tx_tab_8)[1]; | |
| 667 | |||
| 668 | 13088075 | TX_NAME(ff_tx_fft4_ns)(s, dst, src, stride); | |
| 669 | |||
| 670 | 13088075 | BF(t1, dst[5].re, src[4].re, -src[5].re); | |
| 671 | 13088075 | BF(t2, dst[5].im, src[4].im, -src[5].im); | |
| 672 | 13088075 | BF(t5, dst[7].re, src[6].re, -src[7].re); | |
| 673 | 13088075 | BF(t6, dst[7].im, src[6].im, -src[7].im); | |
| 674 | |||
| 675 | 13088075 | BUTTERFLIES(dst[0], dst[2], dst[4], dst[6]); | |
| 676 | 13088075 | TRANSFORM(dst[1], dst[3], dst[5], dst[7], cos, cos); | |
| 677 | 13088075 | } | |
| 678 | |||
| 679 | 5496690 | static void TX_NAME(ff_tx_fft16_ns)(AVTXContext *s, void *_dst, | |
| 680 | void *_src, ptrdiff_t stride) | ||
| 681 | { | ||
| 682 | 5496690 | TXComplex *src = _src; | |
| 683 | 5496690 | TXComplex *dst = _dst; | |
| 684 | 5496690 | const TXSample *cos = TX_TAB(ff_tx_tab_16); | |
| 685 | |||
| 686 | TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1; | ||
| 687 | 5496690 | TXSample cos_16_1 = cos[1]; | |
| 688 | 5496690 | TXSample cos_16_2 = cos[2]; | |
| 689 | 5496690 | TXSample cos_16_3 = cos[3]; | |
| 690 | |||
| 691 | 5496690 | TX_NAME(ff_tx_fft8_ns)(s, dst + 0, src + 0, stride); | |
| 692 | 5496690 | TX_NAME(ff_tx_fft4_ns)(s, dst + 8, src + 8, stride); | |
| 693 | 5496690 | TX_NAME(ff_tx_fft4_ns)(s, dst + 12, src + 12, stride); | |
| 694 | |||
| 695 | 5496690 | t1 = dst[ 8].re; | |
| 696 | 5496690 | t2 = dst[ 8].im; | |
| 697 | 5496690 | t5 = dst[12].re; | |
| 698 | 5496690 | t6 = dst[12].im; | |
| 699 | 5496690 | BUTTERFLIES(dst[0], dst[4], dst[8], dst[12]); | |
| 700 | |||
| 701 | 5496690 | TRANSFORM(dst[ 2], dst[ 6], dst[10], dst[14], cos_16_2, cos_16_2); | |
| 702 | 5496690 | TRANSFORM(dst[ 1], dst[ 5], dst[ 9], dst[13], cos_16_1, cos_16_3); | |
| 703 | 5496690 | TRANSFORM(dst[ 3], dst[ 7], dst[11], dst[15], cos_16_3, cos_16_1); | |
| 704 | 5496690 | } | |
| 705 | |||
| 706 | DECL_SR_CODELET_DEF(2) | ||
| 707 | DECL_SR_CODELET_DEF(4) | ||
| 708 | DECL_SR_CODELET_DEF(8) | ||
| 709 | DECL_SR_CODELET_DEF(16) | ||
| 710 | 3641792 | DECL_SR_CODELET(32,16,8) | |
| 711 | 787694 | DECL_SR_CODELET(64,32,16) | |
| 712 | 412074 | DECL_SR_CODELET(128,64,32) | |
| 713 | 146322 | DECL_SR_CODELET(256,128,64) | |
| 714 | 103104 | DECL_SR_CODELET(512,256,128) | |
| 715 | 7655 | DECL_SR_CODELET(1024,512,256) | |
| 716 | 2582 | DECL_SR_CODELET(2048,1024,512) | |
| 717 | 758 | DECL_SR_CODELET(4096,2048,1024) | |
| 718 | 433 | DECL_SR_CODELET(8192,4096,2048) | |
| 719 | 5 | DECL_SR_CODELET(16384,8192,4096) | |
| 720 | ✗ | DECL_SR_CODELET(32768,16384,8192) | |
| 721 | ✗ | DECL_SR_CODELET(65536,32768,16384) | |
| 722 | ✗ | DECL_SR_CODELET(131072,65536,32768) | |
| 723 | |||
| 724 | 262 | static av_cold int TX_NAME(ff_tx_fft_init)(AVTXContext *s, | |
| 725 | const FFTXCodelet *cd, | ||
| 726 | uint64_t flags, | ||
| 727 | FFTXCodeletOptions *opts, | ||
| 728 | int len, int inv, | ||
| 729 | const void *scale) | ||
| 730 | { | ||
| 731 | int ret; | ||
| 732 | 262 | int is_inplace = !!(flags & AV_TX_INPLACE); | |
| 733 | 262 | FFTXCodeletOptions sub_opts = { | |
| 734 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 262 times.
|
262 | .map_dir = is_inplace ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER, |
| 735 | }; | ||
| 736 | |||
| 737 | 262 | flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */ | |
| 738 | 262 | flags |= AV_TX_INPLACE; /* in-place */ | |
| 739 | 262 | flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */ | |
| 740 | |||
| 741 |
2/2✓ Branch 1 taken 24 times.
✓ Branch 2 taken 238 times.
|
262 | if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len, inv, scale))) |
| 742 | 24 | return ret; | |
| 743 | |||
| 744 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 238 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
238 | if (is_inplace && (ret = ff_tx_gen_inplace_map(s, len))) |
| 745 | ✗ | return ret; | |
| 746 | |||
| 747 | 238 | return 0; | |
| 748 | } | ||
| 749 | |||
| 750 | 8 | static av_cold int TX_NAME(ff_tx_fft_inplace_small_init)(AVTXContext *s, | |
| 751 | const FFTXCodelet *cd, | ||
| 752 | uint64_t flags, | ||
| 753 | FFTXCodeletOptions *opts, | ||
| 754 | int len, int inv, | ||
| 755 | const void *scale) | ||
| 756 | { | ||
| 757 |
1/2✗ Branch 1 not taken.
✓ Branch 2 taken 8 times.
|
8 | if (!(s->tmp = av_malloc(len*sizeof(*s->tmp)))) |
| 758 | ✗ | return AVERROR(ENOMEM); | |
| 759 | 8 | flags &= ~AV_TX_INPLACE; | |
| 760 | 8 | return TX_NAME(ff_tx_fft_init)(s, cd, flags, opts, len, inv, scale); | |
| 761 | } | ||
| 762 | |||
| 763 | 35616 | static void TX_NAME(ff_tx_fft)(AVTXContext *s, void *_dst, | |
| 764 | void *_src, ptrdiff_t stride) | ||
| 765 | { | ||
| 766 | 35616 | TXComplex *src = _src; | |
| 767 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 35616 times.
|
35616 | TXComplex *dst1 = s->flags & AV_TX_INPLACE ? s->tmp : _dst; |
| 768 | 35616 | TXComplex *dst2 = _dst; | |
| 769 | 35616 | int *map = s->sub[0].map; | |
| 770 | 35616 | int len = s->len; | |
| 771 | |||
| 772 | /* Compilers can't vectorize this anyway without assuming AVX2, which they | ||
| 773 | * generally don't, at least without -march=native -mtune=native */ | ||
| 774 |
2/2✓ Branch 0 taken 10978294 times.
✓ Branch 1 taken 35616 times.
|
11013910 | for (int i = 0; i < len; i++) |
| 775 | 10978294 | dst1[i] = src[map[i]]; | |
| 776 | |||
| 777 | 35616 | s->fn[0](&s->sub[0], dst2, dst1, stride); | |
| 778 | 35616 | } | |
| 779 | |||
| 780 | ✗ | static void TX_NAME(ff_tx_fft_inplace)(AVTXContext *s, void *_dst, | |
| 781 | void *_src, ptrdiff_t stride) | ||
| 782 | { | ||
| 783 | ✗ | TXComplex *src = _src; | |
| 784 | ✗ | TXComplex *dst = _dst; | |
| 785 | TXComplex tmp; | ||
| 786 | ✗ | const int *map = s->sub->map; | |
| 787 | ✗ | const int *inplace_idx = s->map; | |
| 788 | int src_idx, dst_idx; | ||
| 789 | |||
| 790 | ✗ | src_idx = *inplace_idx++; | |
| 791 | do { | ||
| 792 | ✗ | tmp = src[src_idx]; | |
| 793 | ✗ | dst_idx = map[src_idx]; | |
| 794 | do { | ||
| 795 | ✗ | FFSWAP(TXComplex, tmp, src[dst_idx]); | |
| 796 | ✗ | dst_idx = map[dst_idx]; | |
| 797 | ✗ | } while (dst_idx != src_idx); /* Can be > as well, but was less predictable */ | |
| 798 | ✗ | src[dst_idx] = tmp; | |
| 799 | ✗ | } while ((src_idx = *inplace_idx++)); | |
| 800 | |||
| 801 | ✗ | s->fn[0](&s->sub[0], dst, src, stride); | |
| 802 | ✗ | } | |
| 803 | |||
| 804 | static const FFTXCodelet TX_NAME(ff_tx_fft_def) = { | ||
| 805 | .name = TX_NAME_STR("fft"), | ||
| 806 | .function = TX_NAME(ff_tx_fft), | ||
| 807 | .type = TX_TYPE(FFT), | ||
| 808 | .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE, | ||
| 809 | .factors[0] = TX_FACTOR_ANY, | ||
| 810 | .nb_factors = 1, | ||
| 811 | .min_len = 2, | ||
| 812 | .max_len = TX_LEN_UNLIMITED, | ||
| 813 | .init = TX_NAME(ff_tx_fft_init), | ||
| 814 | .cpu_flags = FF_TX_CPU_FLAGS_ALL, | ||
| 815 | .prio = FF_TX_PRIO_BASE, | ||
| 816 | }; | ||
| 817 | |||
| 818 | static const FFTXCodelet TX_NAME(ff_tx_fft_inplace_small_def) = { | ||
| 819 | .name = TX_NAME_STR("fft_inplace_small"), | ||
| 820 | .function = TX_NAME(ff_tx_fft), | ||
| 821 | .type = TX_TYPE(FFT), | ||
| 822 | .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | AV_TX_INPLACE, | ||
| 823 | .factors[0] = TX_FACTOR_ANY, | ||
| 824 | .nb_factors = 1, | ||
| 825 | .min_len = 2, | ||
| 826 | .max_len = 65536, | ||
| 827 | .init = TX_NAME(ff_tx_fft_inplace_small_init), | ||
| 828 | .cpu_flags = FF_TX_CPU_FLAGS_ALL, | ||
| 829 | .prio = FF_TX_PRIO_BASE - 256, | ||
| 830 | }; | ||
| 831 | |||
| 832 | static const FFTXCodelet TX_NAME(ff_tx_fft_inplace_def) = { | ||
| 833 | .name = TX_NAME_STR("fft_inplace"), | ||
| 834 | .function = TX_NAME(ff_tx_fft_inplace), | ||
| 835 | .type = TX_TYPE(FFT), | ||
| 836 | .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | AV_TX_INPLACE, | ||
| 837 | .factors[0] = TX_FACTOR_ANY, | ||
| 838 | .nb_factors = 1, | ||
| 839 | .min_len = 2, | ||
| 840 | .max_len = TX_LEN_UNLIMITED, | ||
| 841 | .init = TX_NAME(ff_tx_fft_init), | ||
| 842 | .cpu_flags = FF_TX_CPU_FLAGS_ALL, | ||
| 843 | .prio = FF_TX_PRIO_BASE - 512, | ||
| 844 | }; | ||
| 845 | |||
| 846 | 8 | static av_cold int TX_NAME(ff_tx_fft_init_naive_small)(AVTXContext *s, | |
| 847 | const FFTXCodelet *cd, | ||
| 848 | uint64_t flags, | ||
| 849 | FFTXCodeletOptions *opts, | ||
| 850 | int len, int inv, | ||
| 851 | const void *scale) | ||
| 852 | { | ||
| 853 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 8 times.
|
8 | const double phase = s->inv ? 2.0*M_PI/len : -2.0*M_PI/len; |
| 854 | |||
| 855 |
1/2✗ Branch 1 not taken.
✓ Branch 2 taken 8 times.
|
8 | if (!(s->exp = av_malloc(len*len*sizeof(*s->exp)))) |
| 856 | ✗ | return AVERROR(ENOMEM); | |
| 857 | |||
| 858 |
2/2✓ Branch 0 taken 104 times.
✓ Branch 1 taken 8 times.
|
112 | for (int i = 0; i < len; i++) { |
| 859 |
2/2✓ Branch 0 taken 1352 times.
✓ Branch 1 taken 104 times.
|
1456 | for (int j = 0; j < len; j++) { |
| 860 | 1352 | const double factor = phase*i*j; | |
| 861 | 1352 | s->exp[i*j] = (TXComplex){ | |
| 862 | 1352 | RESCALE(cos(factor)), | |
| 863 | 1352 | RESCALE(sin(factor)), | |
| 864 | }; | ||
| 865 | } | ||
| 866 | } | ||
| 867 | |||
| 868 | 8 | return 0; | |
| 869 | } | ||
| 870 | |||
| 871 | ✗ | static void TX_NAME(ff_tx_fft_naive)(AVTXContext *s, void *_dst, void *_src, | |
| 872 | ptrdiff_t stride) | ||
| 873 | { | ||
| 874 | ✗ | TXComplex *src = _src; | |
| 875 | ✗ | TXComplex *dst = _dst; | |
| 876 | ✗ | const int n = s->len; | |
| 877 | ✗ | double phase = s->inv ? 2.0*M_PI/n : -2.0*M_PI/n; | |
| 878 | |||
| 879 | ✗ | stride /= sizeof(*dst); | |
| 880 | |||
| 881 | ✗ | for (int i = 0; i < n; i++) { | |
| 882 | ✗ | TXComplex tmp = { 0 }; | |
| 883 | ✗ | for (int j = 0; j < n; j++) { | |
| 884 | ✗ | const double factor = phase*i*j; | |
| 885 | ✗ | const TXComplex mult = { | |
| 886 | ✗ | RESCALE(cos(factor)), | |
| 887 | ✗ | RESCALE(sin(factor)), | |
| 888 | }; | ||
| 889 | TXComplex res; | ||
| 890 | ✗ | CMUL3(res, src[j], mult); | |
| 891 | ✗ | tmp.re += res.re; | |
| 892 | ✗ | tmp.im += res.im; | |
| 893 | } | ||
| 894 | ✗ | dst[i*stride] = tmp; | |
| 895 | } | ||
| 896 | ✗ | } | |
| 897 | |||
| 898 | 28070 | static void TX_NAME(ff_tx_fft_naive_small)(AVTXContext *s, void *_dst, void *_src, | |
| 899 | ptrdiff_t stride) | ||
| 900 | { | ||
| 901 | 28070 | TXComplex *src = _src; | |
| 902 | 28070 | TXComplex *dst = _dst; | |
| 903 | 28070 | const int n = s->len; | |
| 904 | |||
| 905 | 28070 | stride /= sizeof(*dst); | |
| 906 | |||
| 907 |
2/2✓ Branch 0 taken 364910 times.
✓ Branch 1 taken 28070 times.
|
392980 | for (int i = 0; i < n; i++) { |
| 908 | 364910 | TXComplex tmp = { 0 }; | |
| 909 |
2/2✓ Branch 0 taken 4743830 times.
✓ Branch 1 taken 364910 times.
|
5108740 | for (int j = 0; j < n; j++) { |
| 910 | TXComplex res; | ||
| 911 | 4743830 | const TXComplex mult = s->exp[i*j]; | |
| 912 | 4743830 | CMUL3(res, src[j], mult); | |
| 913 | 4743830 | tmp.re += res.re; | |
| 914 | 4743830 | tmp.im += res.im; | |
| 915 | } | ||
| 916 | 364910 | dst[i*stride] = tmp; | |
| 917 | } | ||
| 918 | 28070 | } | |
| 919 | |||
| 920 | static const FFTXCodelet TX_NAME(ff_tx_fft_naive_small_def) = { | ||
| 921 | .name = TX_NAME_STR("fft_naive_small"), | ||
| 922 | .function = TX_NAME(ff_tx_fft_naive_small), | ||
| 923 | .type = TX_TYPE(FFT), | ||
| 924 | .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE, | ||
| 925 | .factors[0] = TX_FACTOR_ANY, | ||
| 926 | .nb_factors = 1, | ||
| 927 | .min_len = 2, | ||
| 928 | .max_len = 1024, | ||
| 929 | .init = TX_NAME(ff_tx_fft_init_naive_small), | ||
| 930 | .cpu_flags = FF_TX_CPU_FLAGS_ALL, | ||
| 931 | .prio = FF_TX_PRIO_MIN/2, | ||
| 932 | }; | ||
| 933 | |||
| 934 | static const FFTXCodelet TX_NAME(ff_tx_fft_naive_def) = { | ||
| 935 | .name = TX_NAME_STR("fft_naive"), | ||
| 936 | .function = TX_NAME(ff_tx_fft_naive), | ||
| 937 | .type = TX_TYPE(FFT), | ||
| 938 | .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE, | ||
| 939 | .factors[0] = TX_FACTOR_ANY, | ||
| 940 | .nb_factors = 1, | ||
| 941 | .min_len = 2, | ||
| 942 | .max_len = TX_LEN_UNLIMITED, | ||
| 943 | .init = NULL, | ||
| 944 | .cpu_flags = FF_TX_CPU_FLAGS_ALL, | ||
| 945 | .prio = FF_TX_PRIO_MIN, | ||
| 946 | }; | ||
| 947 | |||
| 948 | 4273 | static av_cold int TX_NAME(ff_tx_fft_pfa_init)(AVTXContext *s, | |
| 949 | const FFTXCodelet *cd, | ||
| 950 | uint64_t flags, | ||
| 951 | FFTXCodeletOptions *opts, | ||
| 952 | int len, int inv, | ||
| 953 | const void *scale) | ||
| 954 | { | ||
| 955 | 4273 | int ret, *tmp, ps = flags & FF_TX_PRESHUFFLE; | |
| 956 | 4273 | FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER }; | |
| 957 | 4273 | size_t extra_tmp_len = 0; | |
| 958 | int len_list[TX_MAX_DECOMPOSITIONS]; | ||
| 959 | |||
| 960 |
2/2✓ Branch 1 taken 4176 times.
✓ Branch 2 taken 97 times.
|
4273 | if ((ret = ff_tx_decompose_length(len_list, TX_TYPE(FFT), len, inv)) < 0) |
| 961 | 4176 | return ret; | |
| 962 | |||
| 963 | /* Two iterations to test both orderings. */ | ||
| 964 |
1/2✓ Branch 0 taken 97 times.
✗ Branch 1 not taken.
|
97 | for (int i = 0; i < ret; i++) { |
| 965 | 97 | int len1 = len_list[i]; | |
| 966 | 97 | int len2 = len / len1; | |
| 967 | |||
| 968 | /* Our ptwo transforms don't support striding the output. */ | ||
| 969 |
2/2✓ Branch 0 taken 70 times.
✓ Branch 1 taken 27 times.
|
97 | if (len2 & (len2 - 1)) |
| 970 | 70 | FFSWAP(int, len1, len2); | |
| 971 | |||
| 972 | 97 | ff_tx_clear_ctx(s); | |
| 973 | |||
| 974 | /* First transform */ | ||
| 975 | 97 | sub_opts.map_dir = FF_TX_MAP_GATHER; | |
| 976 | 97 | flags &= ~AV_TX_INPLACE; | |
| 977 | 97 | flags |= FF_TX_OUT_OF_PLACE; | |
| 978 | 97 | flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */ | |
| 979 | 97 | ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, | |
| 980 | len1, inv, scale); | ||
| 981 | |||
| 982 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 97 times.
|
97 | if (ret == AVERROR(ENOMEM)) { |
| 983 | ✗ | return ret; | |
| 984 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 89 times.
|
97 | } else if (ret < 0) { /* Try again without a preshuffle flag */ |
| 985 | 8 | flags &= ~FF_TX_PRESHUFFLE; | |
| 986 | 8 | ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, | |
| 987 | len1, inv, scale); | ||
| 988 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 8 times.
|
8 | if (ret == AVERROR(ENOMEM)) |
| 989 | ✗ | return ret; | |
| 990 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 8 times.
|
8 | else if (ret < 0) |
| 991 | ✗ | continue; | |
| 992 | } | ||
| 993 | |||
| 994 | /* Second transform. */ | ||
| 995 | 97 | sub_opts.map_dir = FF_TX_MAP_SCATTER; | |
| 996 | 97 | flags |= FF_TX_PRESHUFFLE; | |
| 997 | 97 | retry: | |
| 998 | 97 | flags &= ~FF_TX_OUT_OF_PLACE; | |
| 999 | 97 | flags |= AV_TX_INPLACE; | |
| 1000 | 97 | ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, | |
| 1001 | len2, inv, scale); | ||
| 1002 | |||
| 1003 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 97 times.
|
97 | if (ret == AVERROR(ENOMEM)) { |
| 1004 | ✗ | return ret; | |
| 1005 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 97 times.
|
97 | } else if (ret < 0) { /* Try again with an out-of-place transform */ |
| 1006 | ✗ | flags |= FF_TX_OUT_OF_PLACE; | |
| 1007 | ✗ | flags &= ~AV_TX_INPLACE; | |
| 1008 | ✗ | ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, | |
| 1009 | len2, inv, scale); | ||
| 1010 | ✗ | if (ret == AVERROR(ENOMEM)) { | |
| 1011 | ✗ | return ret; | |
| 1012 | ✗ | } else if (ret < 0) { | |
| 1013 | ✗ | if (flags & FF_TX_PRESHUFFLE) { /* Retry again without a preshuf flag */ | |
| 1014 | ✗ | flags &= ~FF_TX_PRESHUFFLE; | |
| 1015 | ✗ | goto retry; | |
| 1016 | } else { | ||
| 1017 | ✗ | continue; | |
| 1018 | } | ||
| 1019 | } | ||
| 1020 | } | ||
| 1021 | |||
| 1022 | /* Success */ | ||
| 1023 | 97 | break; | |
| 1024 | } | ||
| 1025 | |||
| 1026 | /* If nothing was successful, error out */ | ||
| 1027 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 97 times.
|
97 | if (ret < 0) |
| 1028 | ✗ | return ret; | |
| 1029 | |||
| 1030 | /* Generate PFA map */ | ||
| 1031 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 97 times.
|
97 | if ((ret = ff_tx_gen_compound_mapping(s, opts, 0, |
| 1032 | 97 | s->sub[0].len, s->sub[1].len))) | |
| 1033 | ✗ | return ret; | |
| 1034 | |||
| 1035 |
1/2✗ Branch 1 not taken.
✓ Branch 2 taken 97 times.
|
97 | if (!(s->tmp = av_malloc(len*sizeof(*s->tmp)))) |
| 1036 | ✗ | return AVERROR(ENOMEM); | |
| 1037 | |||
| 1038 | /* Flatten input map */ | ||
| 1039 | 97 | tmp = (int *)s->tmp; | |
| 1040 |
2/2✓ Branch 0 taken 5512 times.
✓ Branch 1 taken 97 times.
|
5609 | for (int k = 0; k < len; k += s->sub[0].len) { |
| 1041 | 5512 | memcpy(tmp, &s->map[k], s->sub[0].len*sizeof(*tmp)); | |
| 1042 |
2/2✓ Branch 0 taken 82024 times.
✓ Branch 1 taken 5512 times.
|
87536 | for (int i = 0; i < s->sub[0].len; i++) |
| 1043 | 82024 | s->map[k + i] = tmp[s->sub[0].map[i]]; | |
| 1044 | } | ||
| 1045 | |||
| 1046 | /* Only allocate extra temporary memory if we need it */ | ||
| 1047 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 97 times.
|
97 | if (!(s->sub[1].flags & AV_TX_INPLACE)) |
| 1048 | ✗ | extra_tmp_len = len; | |
| 1049 |
1/2✓ Branch 0 taken 97 times.
✗ Branch 1 not taken.
|
97 | else if (!ps) |
| 1050 | 97 | extra_tmp_len = s->sub[0].len; | |
| 1051 | |||
| 1052 |
2/4✓ Branch 0 taken 97 times.
✗ Branch 1 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 97 times.
|
97 | if (extra_tmp_len && !(s->exp = av_malloc(extra_tmp_len*sizeof(*s->exp)))) |
| 1053 | ✗ | return AVERROR(ENOMEM); | |
| 1054 | |||
| 1055 | 97 | return 0; | |
| 1056 | } | ||
| 1057 | |||
| 1058 | 11243 | static void TX_NAME(ff_tx_fft_pfa)(AVTXContext *s, void *_out, | |
| 1059 | void *_in, ptrdiff_t stride) | ||
| 1060 | { | ||
| 1061 | 11243 | const int n = s->sub[0].len, m = s->sub[1].len, l = s->len; | |
| 1062 | 11243 | const int *in_map = s->map, *out_map = in_map + l; | |
| 1063 | 11243 | const int *sub_map = s->sub[1].map; | |
| 1064 |
1/2✓ Branch 0 taken 11243 times.
✗ Branch 1 not taken.
|
11243 | TXComplex *tmp1 = s->sub[1].flags & AV_TX_INPLACE ? s->tmp : s->exp; |
| 1065 | 11243 | TXComplex *in = _in, *out = _out; | |
| 1066 | |||
| 1067 | 11243 | stride /= sizeof(*out); | |
| 1068 | |||
| 1069 |
2/2✓ Branch 0 taken 79596 times.
✓ Branch 1 taken 11243 times.
|
90839 | for (int i = 0; i < m; i++) { |
| 1070 |
2/2✓ Branch 0 taken 733592 times.
✓ Branch 1 taken 79596 times.
|
813188 | for (int j = 0; j < n; j++) |
| 1071 | 733592 | s->exp[j] = in[in_map[i*n + j]]; | |
| 1072 | 79596 | s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], s->exp, m*sizeof(TXComplex)); | |
| 1073 | } | ||
| 1074 | |||
| 1075 |
2/2✓ Branch 0 taken 112505 times.
✓ Branch 1 taken 11243 times.
|
123748 | for (int i = 0; i < n; i++) |
| 1076 | 112505 | s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex)); | |
| 1077 | |||
| 1078 |
2/2✓ Branch 0 taken 733592 times.
✓ Branch 1 taken 11243 times.
|
744835 | for (int i = 0; i < l; i++) |
| 1079 | 733592 | out[i*stride] = tmp1[out_map[i]]; | |
| 1080 | 11243 | } | |
| 1081 | |||
| 1082 | ✗ | static void TX_NAME(ff_tx_fft_pfa_ns)(AVTXContext *s, void *_out, | |
| 1083 | void *_in, ptrdiff_t stride) | ||
| 1084 | { | ||
| 1085 | ✗ | const int n = s->sub[0].len, m = s->sub[1].len, l = s->len; | |
| 1086 | ✗ | const int *in_map = s->map, *out_map = in_map + l; | |
| 1087 | ✗ | const int *sub_map = s->sub[1].map; | |
| 1088 | ✗ | TXComplex *tmp1 = s->sub[1].flags & AV_TX_INPLACE ? s->tmp : s->exp; | |
| 1089 | ✗ | TXComplex *in = _in, *out = _out; | |
| 1090 | |||
| 1091 | ✗ | stride /= sizeof(*out); | |
| 1092 | |||
| 1093 | ✗ | for (int i = 0; i < m; i++) | |
| 1094 | ✗ | s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], &in[i*n], m*sizeof(TXComplex)); | |
| 1095 | |||
| 1096 | ✗ | for (int i = 0; i < n; i++) | |
| 1097 | ✗ | s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex)); | |
| 1098 | |||
| 1099 | ✗ | for (int i = 0; i < l; i++) | |
| 1100 | ✗ | out[i*stride] = tmp1[out_map[i]]; | |
| 1101 | ✗ | } | |
| 1102 | |||
| 1103 | static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_def) = { | ||
| 1104 | .name = TX_NAME_STR("fft_pfa"), | ||
| 1105 | .function = TX_NAME(ff_tx_fft_pfa), | ||
| 1106 | .type = TX_TYPE(FFT), | ||
| 1107 | .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE, | ||
| 1108 | .factors = { 7, 5, 3, 2, TX_FACTOR_ANY }, | ||
| 1109 | .nb_factors = 2, | ||
| 1110 | .min_len = 2*3, | ||
| 1111 | .max_len = TX_LEN_UNLIMITED, | ||
| 1112 | .init = TX_NAME(ff_tx_fft_pfa_init), | ||
| 1113 | .cpu_flags = FF_TX_CPU_FLAGS_ALL, | ||
| 1114 | .prio = FF_TX_PRIO_BASE, | ||
| 1115 | }; | ||
| 1116 | |||
| 1117 | static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_ns_def) = { | ||
| 1118 | .name = TX_NAME_STR("fft_pfa_ns"), | ||
| 1119 | .function = TX_NAME(ff_tx_fft_pfa_ns), | ||
| 1120 | .type = TX_TYPE(FFT), | ||
| 1121 | .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE | | ||
| 1122 | FF_TX_PRESHUFFLE, | ||
| 1123 | .factors = { 7, 5, 3, 2, TX_FACTOR_ANY }, | ||
| 1124 | .nb_factors = 2, | ||
| 1125 | .min_len = 2*3, | ||
| 1126 | .max_len = TX_LEN_UNLIMITED, | ||
| 1127 | .init = TX_NAME(ff_tx_fft_pfa_init), | ||
| 1128 | .cpu_flags = FF_TX_CPU_FLAGS_ALL, | ||
| 1129 | .prio = FF_TX_PRIO_BASE, | ||
| 1130 | }; | ||
| 1131 | |||
| 1132 | ✗ | static av_cold int TX_NAME(ff_tx_mdct_naive_init)(AVTXContext *s, | |
| 1133 | const FFTXCodelet *cd, | ||
| 1134 | uint64_t flags, | ||
| 1135 | FFTXCodeletOptions *opts, | ||
| 1136 | int len, int inv, | ||
| 1137 | const void *scale) | ||
| 1138 | { | ||
| 1139 | ✗ | s->scale_d = *((SCALE_TYPE *)scale); | |
| 1140 | ✗ | s->scale_f = s->scale_d; | |
| 1141 | ✗ | return 0; | |
| 1142 | } | ||
| 1143 | |||
| 1144 | ✗ | static void TX_NAME(ff_tx_mdct_naive_fwd)(AVTXContext *s, void *_dst, | |
| 1145 | void *_src, ptrdiff_t stride) | ||
| 1146 | { | ||
| 1147 | ✗ | TXSample *src = _src; | |
| 1148 | ✗ | TXSample *dst = _dst; | |
| 1149 | ✗ | double scale = s->scale_d; | |
| 1150 | ✗ | int len = s->len; | |
| 1151 | ✗ | const double phase = M_PI/(4.0*len); | |
| 1152 | |||
| 1153 | ✗ | stride /= sizeof(*dst); | |
| 1154 | |||
| 1155 | ✗ | for (int i = 0; i < len; i++) { | |
| 1156 | ✗ | double sum = 0.0; | |
| 1157 | ✗ | for (int j = 0; j < len*2; j++) { | |
| 1158 | ✗ | int a = (2*j + 1 + len) * (2*i + 1); | |
| 1159 | ✗ | sum += UNSCALE(src[j]) * cos(a * phase); | |
| 1160 | } | ||
| 1161 | ✗ | dst[i*stride] = RESCALE(sum*scale); | |
| 1162 | } | ||
| 1163 | ✗ | } | |
| 1164 | |||
| 1165 | ✗ | static void TX_NAME(ff_tx_mdct_naive_inv)(AVTXContext *s, void *_dst, | |
| 1166 | void *_src, ptrdiff_t stride) | ||
| 1167 | { | ||
| 1168 | ✗ | TXSample *src = _src; | |
| 1169 | ✗ | TXSample *dst = _dst; | |
| 1170 | ✗ | double scale = s->scale_d; | |
| 1171 | ✗ | int len = s->len >> 1; | |
| 1172 | ✗ | int len2 = len*2; | |
| 1173 | ✗ | const double phase = M_PI/(4.0*len2); | |
| 1174 | |||
| 1175 | ✗ | stride /= sizeof(*src); | |
| 1176 | |||
| 1177 | ✗ | for (int i = 0; i < len; i++) { | |
| 1178 | ✗ | double sum_d = 0.0; | |
| 1179 | ✗ | double sum_u = 0.0; | |
| 1180 | ✗ | double i_d = phase * (4*len - 2*i - 1); | |
| 1181 | ✗ | double i_u = phase * (3*len2 + 2*i + 1); | |
| 1182 | ✗ | for (int j = 0; j < len2; j++) { | |
| 1183 | ✗ | double a = (2 * j + 1); | |
| 1184 | ✗ | double a_d = cos(a * i_d); | |
| 1185 | ✗ | double a_u = cos(a * i_u); | |
| 1186 | ✗ | double val = UNSCALE(src[j*stride]); | |
| 1187 | ✗ | sum_d += a_d * val; | |
| 1188 | ✗ | sum_u += a_u * val; | |
| 1189 | } | ||
| 1190 | ✗ | dst[i + 0] = RESCALE( sum_d*scale); | |
| 1191 | ✗ | dst[i + len] = RESCALE(-sum_u*scale); | |
| 1192 | } | ||
| 1193 | ✗ | } | |
| 1194 | |||
| 1195 | static const FFTXCodelet TX_NAME(ff_tx_mdct_naive_fwd_def) = { | ||
| 1196 | .name = TX_NAME_STR("mdct_naive_fwd"), | ||
| 1197 | .function = TX_NAME(ff_tx_mdct_naive_fwd), | ||
| 1198 | .type = TX_TYPE(MDCT), | ||
| 1199 | .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY, | ||
| 1200 | .factors = { 2, TX_FACTOR_ANY }, /* MDCTs need an even length */ | ||
| 1201 | .nb_factors = 2, | ||
| 1202 | .min_len = 2, | ||
| 1203 | .max_len = TX_LEN_UNLIMITED, | ||
| 1204 | .init = TX_NAME(ff_tx_mdct_naive_init), | ||
| 1205 | .cpu_flags = FF_TX_CPU_FLAGS_ALL, | ||
| 1206 | .prio = FF_TX_PRIO_MIN, | ||
| 1207 | }; | ||
| 1208 | |||
| 1209 | static const FFTXCodelet TX_NAME(ff_tx_mdct_naive_inv_def) = { | ||
| 1210 | .name = TX_NAME_STR("mdct_naive_inv"), | ||
| 1211 | .function = TX_NAME(ff_tx_mdct_naive_inv), | ||
| 1212 | .type = TX_TYPE(MDCT), | ||
| 1213 | .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY, | ||
| 1214 | .factors = { 2, TX_FACTOR_ANY }, | ||
| 1215 | .nb_factors = 2, | ||
| 1216 | .min_len = 2, | ||
| 1217 | .max_len = TX_LEN_UNLIMITED, | ||
| 1218 | .init = TX_NAME(ff_tx_mdct_naive_init), | ||
| 1219 | .cpu_flags = FF_TX_CPU_FLAGS_ALL, | ||
| 1220 | .prio = FF_TX_PRIO_MIN, | ||
| 1221 | }; | ||
| 1222 | |||
| 1223 | 2566 | static av_cold int TX_NAME(ff_tx_mdct_init)(AVTXContext *s, | |
| 1224 | const FFTXCodelet *cd, | ||
| 1225 | uint64_t flags, | ||
| 1226 | FFTXCodeletOptions *opts, | ||
| 1227 | int len, int inv, | ||
| 1228 | const void *scale) | ||
| 1229 | { | ||
| 1230 | int ret; | ||
| 1231 | 2566 | FFTXCodeletOptions sub_opts = { | |
| 1232 |
2/2✓ Branch 0 taken 361 times.
✓ Branch 1 taken 2205 times.
|
2566 | .map_dir = !inv ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER, |
| 1233 | }; | ||
| 1234 | |||
| 1235 | 2566 | s->scale_d = *((SCALE_TYPE *)scale); | |
| 1236 | 2566 | s->scale_f = s->scale_d; | |
| 1237 | |||
| 1238 | 2566 | flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */ | |
| 1239 | 2566 | flags |= AV_TX_INPLACE; /* in-place */ | |
| 1240 | 2566 | flags |= FF_TX_PRESHUFFLE; /* First try with an in-place transform */ | |
| 1241 | |||
| 1242 |
2/2✓ Branch 1 taken 14 times.
✓ Branch 2 taken 2552 times.
|
2566 | if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1, |
| 1243 | inv, scale))) { | ||
| 1244 | 14 | flags &= ~FF_TX_PRESHUFFLE; /* Now try with a generic FFT */ | |
| 1245 |
1/2✗ Branch 1 not taken.
✓ Branch 2 taken 14 times.
|
14 | if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1, |
| 1246 | inv, scale))) | ||
| 1247 | ✗ | return ret; | |
| 1248 | } | ||
| 1249 | |||
| 1250 | 2566 | s->map = av_malloc((len >> 1)*sizeof(*s->map)); | |
| 1251 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2566 times.
|
2566 | if (!s->map) |
| 1252 | ✗ | return AVERROR(ENOMEM); | |
| 1253 | |||
| 1254 | /* If we need to preshuffle copy the map from the subcontext */ | ||
| 1255 |
2/2✓ Branch 0 taken 2552 times.
✓ Branch 1 taken 14 times.
|
2566 | if (s->sub[0].flags & FF_TX_PRESHUFFLE) { |
| 1256 | 2552 | memcpy(s->map, s->sub->map, (len >> 1)*sizeof(*s->map)); | |
| 1257 | } else { | ||
| 1258 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 14 times.
|
28 | for (int i = 0; i < len >> 1; i++) |
| 1259 | 14 | s->map[i] = i; | |
| 1260 | } | ||
| 1261 | |||
| 1262 |
3/4✓ Branch 0 taken 2205 times.
✓ Branch 1 taken 361 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2566 times.
|
2566 | if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL))) |
| 1263 | ✗ | return ret; | |
| 1264 | |||
| 1265 | /* Saves a multiply in a hot path. */ | ||
| 1266 |
2/2✓ Branch 0 taken 2205 times.
✓ Branch 1 taken 361 times.
|
2566 | if (inv) |
| 1267 |
2/2✓ Branch 0 taken 496714 times.
✓ Branch 1 taken 2205 times.
|
498919 | for (int i = 0; i < (s->len >> 1); i++) |
| 1268 | 496714 | s->map[i] <<= 1; | |
| 1269 | |||
| 1270 | 2566 | return 0; | |
| 1271 | } | ||
| 1272 | |||
| 1273 | 26730 | static void TX_NAME(ff_tx_mdct_fwd)(AVTXContext *s, void *_dst, void *_src, | |
| 1274 | ptrdiff_t stride) | ||
| 1275 | { | ||
| 1276 | 26730 | TXSample *src = _src, *dst = _dst; | |
| 1277 | 26730 | TXComplex *exp = s->exp, tmp, *z = _dst; | |
| 1278 | 26730 | const int len2 = s->len >> 1; | |
| 1279 | 26730 | const int len4 = s->len >> 2; | |
| 1280 | 26730 | const int len3 = len2 * 3; | |
| 1281 | 26730 | const int *sub_map = s->map; | |
| 1282 | |||
| 1283 | 26730 | stride /= sizeof(*dst); | |
| 1284 | |||
| 1285 |
2/2✓ Branch 0 taken 7080192 times.
✓ Branch 1 taken 26730 times.
|
7106922 | for (int i = 0; i < len2; i++) { /* Folding and pre-reindexing */ |
| 1286 | 7080192 | const int k = 2*i; | |
| 1287 | 7080192 | const int idx = sub_map[i]; | |
| 1288 |
2/2✓ Branch 0 taken 3540096 times.
✓ Branch 1 taken 3540096 times.
|
7080192 | if (k < len2) { |
| 1289 | 3540096 | tmp.re = FOLD(-src[ len2 + k], src[1*len2 - 1 - k]); | |
| 1290 | 3540096 | tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]); | |
| 1291 | } else { | ||
| 1292 | 3540096 | tmp.re = FOLD(-src[ len2 + k], -src[5*len2 - 1 - k]); | |
| 1293 | 3540096 | tmp.im = FOLD( src[-len2 + k], -src[1*len3 - 1 - k]); | |
| 1294 | } | ||
| 1295 | 7080192 | CMUL(z[idx].im, z[idx].re, tmp.re, tmp.im, exp[i].re, exp[i].im); | |
| 1296 | } | ||
| 1297 | |||
| 1298 | 26730 | s->fn[0](&s->sub[0], z, z, sizeof(TXComplex)); | |
| 1299 | |||
| 1300 |
2/2✓ Branch 0 taken 3540096 times.
✓ Branch 1 taken 26730 times.
|
3566826 | for (int i = 0; i < len4; i++) { |
| 1301 | 3540096 | const int i0 = len4 + i, i1 = len4 - i - 1; | |
| 1302 | 3540096 | TXComplex src1 = { z[i1].re, z[i1].im }; | |
| 1303 | 3540096 | TXComplex src0 = { z[i0].re, z[i0].im }; | |
| 1304 | |||
| 1305 | 3540096 | CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im, | |
| 1306 | exp[i0].im, exp[i0].re); | ||
| 1307 | 3540096 | CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im, | |
| 1308 | exp[i1].im, exp[i1].re); | ||
| 1309 | } | ||
| 1310 | 26730 | } | |
| 1311 | |||
| 1312 | 2341147 | static void TX_NAME(ff_tx_mdct_inv)(AVTXContext *s, void *_dst, void *_src, | |
| 1313 | ptrdiff_t stride) | ||
| 1314 | { | ||
| 1315 | 2341147 | TXComplex *z = _dst, *exp = s->exp; | |
| 1316 | 2341147 | const TXSample *src = _src, *in1, *in2; | |
| 1317 | 2341147 | const int len2 = s->len >> 1; | |
| 1318 | 2341147 | const int len4 = s->len >> 2; | |
| 1319 | 2341147 | const int *sub_map = s->map; | |
| 1320 | |||
| 1321 | 2341147 | stride /= sizeof(*src); | |
| 1322 | 2341147 | in1 = src; | |
| 1323 | 2341147 | in2 = src + ((len2*2) - 1) * stride; | |
| 1324 | |||
| 1325 |
2/2✓ Branch 0 taken 121306358 times.
✓ Branch 1 taken 2341147 times.
|
123647505 | for (int i = 0; i < len2; i++) { |
| 1326 | 121306358 | int k = sub_map[i]; | |
| 1327 | 121306358 | TXComplex tmp = { in2[-k*stride], in1[k*stride] }; | |
| 1328 | 121306358 | CMUL3(z[i], tmp, exp[i]); | |
| 1329 | } | ||
| 1330 | |||
| 1331 | 2341147 | s->fn[0](&s->sub[0], z, z, sizeof(TXComplex)); | |
| 1332 | |||
| 1333 | 2341147 | exp += len2; | |
| 1334 |
2/2✓ Branch 0 taken 60653178 times.
✓ Branch 1 taken 2341147 times.
|
62994325 | for (int i = 0; i < len4; i++) { |
| 1335 | 60653178 | const int i0 = len4 + i, i1 = len4 - i - 1; | |
| 1336 | 60653178 | TXComplex src1 = { z[i1].im, z[i1].re }; | |
| 1337 | 60653178 | TXComplex src0 = { z[i0].im, z[i0].re }; | |
| 1338 | |||
| 1339 | 60653178 | CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re); | |
| 1340 | 60653178 | CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re); | |
| 1341 | } | ||
| 1342 | 2341147 | } | |
| 1343 | |||
| 1344 | static const FFTXCodelet TX_NAME(ff_tx_mdct_fwd_def) = { | ||
| 1345 | .name = TX_NAME_STR("mdct_fwd"), | ||
| 1346 | .function = TX_NAME(ff_tx_mdct_fwd), | ||
| 1347 | .type = TX_TYPE(MDCT), | ||
| 1348 | .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY, | ||
| 1349 | .factors = { 2, TX_FACTOR_ANY }, | ||
| 1350 | .nb_factors = 2, | ||
| 1351 | .min_len = 2, | ||
| 1352 | .max_len = TX_LEN_UNLIMITED, | ||
| 1353 | .init = TX_NAME(ff_tx_mdct_init), | ||
| 1354 | .cpu_flags = FF_TX_CPU_FLAGS_ALL, | ||
| 1355 | .prio = FF_TX_PRIO_BASE, | ||
| 1356 | }; | ||
| 1357 | |||
| 1358 | static const FFTXCodelet TX_NAME(ff_tx_mdct_inv_def) = { | ||
| 1359 | .name = TX_NAME_STR("mdct_inv"), | ||
| 1360 | .function = TX_NAME(ff_tx_mdct_inv), | ||
| 1361 | .type = TX_TYPE(MDCT), | ||
| 1362 | .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY, | ||
| 1363 | .factors = { 2, TX_FACTOR_ANY }, | ||
| 1364 | .nb_factors = 2, | ||
| 1365 | .min_len = 2, | ||
| 1366 | .max_len = TX_LEN_UNLIMITED, | ||
| 1367 | .init = TX_NAME(ff_tx_mdct_init), | ||
| 1368 | .cpu_flags = FF_TX_CPU_FLAGS_ALL, | ||
| 1369 | .prio = FF_TX_PRIO_BASE, | ||
| 1370 | }; | ||
| 1371 | |||
| 1372 | 69 | static av_cold int TX_NAME(ff_tx_mdct_inv_full_init)(AVTXContext *s, | |
| 1373 | const FFTXCodelet *cd, | ||
| 1374 | uint64_t flags, | ||
| 1375 | FFTXCodeletOptions *opts, | ||
| 1376 | int len, int inv, | ||
| 1377 | const void *scale) | ||
| 1378 | { | ||
| 1379 | int ret; | ||
| 1380 | |||
| 1381 | 69 | s->scale_d = *((SCALE_TYPE *)scale); | |
| 1382 | 69 | s->scale_f = s->scale_d; | |
| 1383 | |||
| 1384 | 69 | flags &= ~AV_TX_FULL_IMDCT; | |
| 1385 | |||
| 1386 |
1/2✗ Branch 1 not taken.
✓ Branch 2 taken 69 times.
|
69 | if ((ret = ff_tx_init_subtx(s, TX_TYPE(MDCT), flags, NULL, len, 1, scale))) |
| 1387 | ✗ | return ret; | |
| 1388 | |||
| 1389 | 69 | return 0; | |
| 1390 | } | ||
| 1391 | |||
| 1392 | 26238 | static void TX_NAME(ff_tx_mdct_inv_full)(AVTXContext *s, void *_dst, | |
| 1393 | void *_src, ptrdiff_t stride) | ||
| 1394 | { | ||
| 1395 | 26238 | int len = s->len << 1; | |
| 1396 | 26238 | int len2 = len >> 1; | |
| 1397 | 26238 | int len4 = len >> 2; | |
| 1398 | 26238 | TXSample *dst = _dst; | |
| 1399 | |||
| 1400 | 26238 | s->fn[0](&s->sub[0], dst + len4, _src, stride); | |
| 1401 | |||
| 1402 | 26238 | stride /= sizeof(*dst); | |
| 1403 | |||
| 1404 |
2/2✓ Branch 0 taken 3164800 times.
✓ Branch 1 taken 26238 times.
|
3191038 | for (int i = 0; i < len4; i++) { |
| 1405 | 3164800 | dst[ i*stride] = -dst[(len2 - i - 1)*stride]; | |
| 1406 | 3164800 | dst[(len - i - 1)*stride] = dst[(len2 + i + 0)*stride]; | |
| 1407 | } | ||
| 1408 | 26238 | } | |
| 1409 | |||
| 1410 | static const FFTXCodelet TX_NAME(ff_tx_mdct_inv_full_def) = { | ||
| 1411 | .name = TX_NAME_STR("mdct_inv_full"), | ||
| 1412 | .function = TX_NAME(ff_tx_mdct_inv_full), | ||
| 1413 | .type = TX_TYPE(MDCT), | ||
| 1414 | .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | | ||
| 1415 | FF_TX_OUT_OF_PLACE | AV_TX_FULL_IMDCT, | ||
| 1416 | .factors = { 2, TX_FACTOR_ANY }, | ||
| 1417 | .nb_factors = 2, | ||
| 1418 | .min_len = 2, | ||
| 1419 | .max_len = TX_LEN_UNLIMITED, | ||
| 1420 | .init = TX_NAME(ff_tx_mdct_inv_full_init), | ||
| 1421 | .cpu_flags = FF_TX_CPU_FLAGS_ALL, | ||
| 1422 | .prio = FF_TX_PRIO_BASE, | ||
| 1423 | }; | ||
| 1424 | |||
| 1425 | 1712 | static av_cold int TX_NAME(ff_tx_mdct_pfa_init)(AVTXContext *s, | |
| 1426 | const FFTXCodelet *cd, | ||
| 1427 | uint64_t flags, | ||
| 1428 | FFTXCodeletOptions *opts, | ||
| 1429 | int len, int inv, | ||
| 1430 | const void *scale) | ||
| 1431 | { | ||
| 1432 | int ret, sub_len; | ||
| 1433 | 1712 | FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER }; | |
| 1434 | |||
| 1435 | 1712 | len >>= 1; | |
| 1436 | 1712 | sub_len = len / cd->factors[0]; | |
| 1437 | |||
| 1438 | 1712 | s->scale_d = *((SCALE_TYPE *)scale); | |
| 1439 | 1712 | s->scale_f = s->scale_d; | |
| 1440 | |||
| 1441 | 1712 | flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */ | |
| 1442 | 1712 | flags |= AV_TX_INPLACE; /* in-place */ | |
| 1443 | 1712 | flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */ | |
| 1444 | |||
| 1445 |
1/2✗ Branch 1 not taken.
✓ Branch 2 taken 1712 times.
|
1712 | if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, |
| 1446 | sub_len, inv, scale))) | ||
| 1447 | ✗ | return ret; | |
| 1448 | |||
| 1449 |
1/2✗ Branch 1 not taken.
✓ Branch 2 taken 1712 times.
|
1712 | if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len))) |
| 1450 | ✗ | return ret; | |
| 1451 | |||
| 1452 | /* Our 15-point transform is also a compound one, so embed its input map */ | ||
| 1453 |
2/2✓ Branch 0 taken 1092 times.
✓ Branch 1 taken 620 times.
|
1712 | if (cd->factors[0] == 15) |
| 1454 |
6/6✓ Branch 0 taken 283200 times.
✓ Branch 1 taken 94400 times.
✓ Branch 2 taken 94400 times.
✓ Branch 3 taken 18880 times.
✓ Branch 4 taken 18880 times.
✓ Branch 5 taken 1092 times.
|
397572 | TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5); |
| 1455 | |||
| 1456 |
2/4✓ Branch 0 taken 1712 times.
✗ Branch 1 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 1712 times.
|
1712 | if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL))) |
| 1457 | ✗ | return ret; | |
| 1458 | |||
| 1459 | /* Saves multiplies in loops. */ | ||
| 1460 |
2/2✓ Branch 0 taken 417008 times.
✓ Branch 1 taken 1712 times.
|
418720 | for (int i = 0; i < len; i++) |
| 1461 | 417008 | s->map[i] <<= 1; | |
| 1462 | |||
| 1463 |
1/2✗ Branch 1 not taken.
✓ Branch 2 taken 1712 times.
|
1712 | if (!(s->tmp = av_malloc(len*sizeof(*s->tmp)))) |
| 1464 | ✗ | return AVERROR(ENOMEM); | |
| 1465 | |||
| 1466 | 1712 | TX_TAB(ff_tx_init_tabs)(len / sub_len); | |
| 1467 | |||
| 1468 | 1712 | return 0; | |
| 1469 | } | ||
| 1470 | |||
| 1471 | #define DECL_COMP_IMDCT(N) \ | ||
| 1472 | static void TX_NAME(ff_tx_mdct_pfa_##N##xM_inv)(AVTXContext *s, void *_dst, \ | ||
| 1473 | void *_src, ptrdiff_t stride) \ | ||
| 1474 | { \ | ||
| 1475 | TXComplex fft##N##in[N]; \ | ||
| 1476 | TXComplex *z = _dst, *exp = s->exp; \ | ||
| 1477 | const TXSample *src = _src, *in1, *in2; \ | ||
| 1478 | const int len4 = s->len >> 2; \ | ||
| 1479 | const int len2 = s->len >> 1; \ | ||
| 1480 | const int m = s->sub->len; \ | ||
| 1481 | const int *in_map = s->map, *out_map = in_map + N*m; \ | ||
| 1482 | const int *sub_map = s->sub->map; \ | ||
| 1483 | \ | ||
| 1484 | stride /= sizeof(*src); /* To convert it from bytes */ \ | ||
| 1485 | in1 = src; \ | ||
| 1486 | in2 = src + ((N*m*2) - 1) * stride; \ | ||
| 1487 | \ | ||
| 1488 | for (int i = 0; i < len2; i += N) { \ | ||
| 1489 | for (int j = 0; j < N; j++) { \ | ||
| 1490 | const int k = in_map[j]; \ | ||
| 1491 | TXComplex tmp = { in2[-k*stride], in1[k*stride] }; \ | ||
| 1492 | CMUL3(fft##N##in[j], tmp, exp[j]); \ | ||
| 1493 | } \ | ||
| 1494 | fft##N(s->tmp + *(sub_map++), fft##N##in, m); \ | ||
| 1495 | exp += N; \ | ||
| 1496 | in_map += N; \ | ||
| 1497 | } \ | ||
| 1498 | \ | ||
| 1499 | for (int i = 0; i < N; i++) \ | ||
| 1500 | s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex)); \ | ||
| 1501 | \ | ||
| 1502 | for (int i = 0; i < len4; i++) { \ | ||
| 1503 | const int i0 = len4 + i, i1 = len4 - i - 1; \ | ||
| 1504 | const int s0 = out_map[i0], s1 = out_map[i1]; \ | ||
| 1505 | TXComplex src1 = { s->tmp[s1].im, s->tmp[s1].re }; \ | ||
| 1506 | TXComplex src0 = { s->tmp[s0].im, s->tmp[s0].re }; \ | ||
| 1507 | \ | ||
| 1508 | CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re); \ | ||
| 1509 | CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re); \ | ||
| 1510 | } \ | ||
| 1511 | } \ | ||
| 1512 | \ | ||
| 1513 | static const FFTXCodelet TX_NAME(ff_tx_mdct_pfa_##N##xM_inv_def) = { \ | ||
| 1514 | .name = TX_NAME_STR("mdct_pfa_" #N "xM_inv"), \ | ||
| 1515 | .function = TX_NAME(ff_tx_mdct_pfa_##N##xM_inv), \ | ||
| 1516 | .type = TX_TYPE(MDCT), \ | ||
| 1517 | .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY, \ | ||
| 1518 | .factors = { N, TX_FACTOR_ANY }, \ | ||
| 1519 | .nb_factors = 2, \ | ||
| 1520 | .min_len = N*2, \ | ||
| 1521 | .max_len = TX_LEN_UNLIMITED, \ | ||
| 1522 | .init = TX_NAME(ff_tx_mdct_pfa_init), \ | ||
| 1523 | .cpu_flags = FF_TX_CPU_FLAGS_ALL, \ | ||
| 1524 | .prio = FF_TX_PRIO_BASE, \ | ||
| 1525 | }; | ||
| 1526 | |||
| 1527 | ✗ | DECL_COMP_IMDCT(3) | |
| 1528 |
8/8✓ Branch 0 taken 32800 times.
✓ Branch 1 taken 6560 times.
✓ Branch 3 taken 6560 times.
✓ Branch 4 taken 205 times.
✓ Branch 6 taken 1025 times.
✓ Branch 7 taken 205 times.
✓ Branch 8 taken 16400 times.
✓ Branch 9 taken 205 times.
|
56990 | DECL_COMP_IMDCT(5) |
| 1529 | ✗ | DECL_COMP_IMDCT(7) | |
| 1530 | ✗ | DECL_COMP_IMDCT(9) | |
| 1531 |
8/8✓ Branch 0 taken 13384380 times.
✓ Branch 1 taken 892292 times.
✓ Branch 3 taken 892292 times.
✓ Branch 4 taken 100543 times.
✓ Branch 6 taken 1508145 times.
✓ Branch 7 taken 100543 times.
✓ Branch 8 taken 6692190 times.
✓ Branch 9 taken 100543 times.
|
22577550 | DECL_COMP_IMDCT(15) |
| 1532 | |||
| 1533 | #define DECL_COMP_MDCT(N) \ | ||
| 1534 | static void TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd)(AVTXContext *s, void *_dst, \ | ||
| 1535 | void *_src, ptrdiff_t stride) \ | ||
| 1536 | { \ | ||
| 1537 | TXComplex fft##N##in[N]; \ | ||
| 1538 | TXSample *src = _src, *dst = _dst; \ | ||
| 1539 | TXComplex *exp = s->exp, tmp; \ | ||
| 1540 | const int m = s->sub->len; \ | ||
| 1541 | const int len4 = N*m; \ | ||
| 1542 | const int len3 = len4 * 3; \ | ||
| 1543 | const int len8 = s->len >> 2; \ | ||
| 1544 | const int *in_map = s->map, *out_map = in_map + N*m; \ | ||
| 1545 | const int *sub_map = s->sub->map; \ | ||
| 1546 | \ | ||
| 1547 | stride /= sizeof(*dst); \ | ||
| 1548 | \ | ||
| 1549 | for (int i = 0; i < m; i++) { /* Folding and pre-reindexing */ \ | ||
| 1550 | for (int j = 0; j < N; j++) { \ | ||
| 1551 | const int k = in_map[i*N + j]; \ | ||
| 1552 | if (k < len4) { \ | ||
| 1553 | tmp.re = FOLD(-src[ len4 + k], src[1*len4 - 1 - k]); \ | ||
| 1554 | tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]); \ | ||
| 1555 | } else { \ | ||
| 1556 | tmp.re = FOLD(-src[ len4 + k], -src[5*len4 - 1 - k]); \ | ||
| 1557 | tmp.im = FOLD( src[-len4 + k], -src[1*len3 - 1 - k]); \ | ||
| 1558 | } \ | ||
| 1559 | CMUL(fft##N##in[j].im, fft##N##in[j].re, tmp.re, tmp.im, \ | ||
| 1560 | exp[k >> 1].re, exp[k >> 1].im); \ | ||
| 1561 | } \ | ||
| 1562 | fft##N(s->tmp + sub_map[i], fft##N##in, m); \ | ||
| 1563 | } \ | ||
| 1564 | \ | ||
| 1565 | for (int i = 0; i < N; i++) \ | ||
| 1566 | s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex)); \ | ||
| 1567 | \ | ||
| 1568 | for (int i = 0; i < len8; i++) { \ | ||
| 1569 | const int i0 = len8 + i, i1 = len8 - i - 1; \ | ||
| 1570 | const int s0 = out_map[i0], s1 = out_map[i1]; \ | ||
| 1571 | TXComplex src1 = { s->tmp[s1].re, s->tmp[s1].im }; \ | ||
| 1572 | TXComplex src0 = { s->tmp[s0].re, s->tmp[s0].im }; \ | ||
| 1573 | \ | ||
| 1574 | CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im, \ | ||
| 1575 | exp[i0].im, exp[i0].re); \ | ||
| 1576 | CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im, \ | ||
| 1577 | exp[i1].im, exp[i1].re); \ | ||
| 1578 | } \ | ||
| 1579 | } \ | ||
| 1580 | \ | ||
| 1581 | static const FFTXCodelet TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd_def) = { \ | ||
| 1582 | .name = TX_NAME_STR("mdct_pfa_" #N "xM_fwd"), \ | ||
| 1583 | .function = TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd), \ | ||
| 1584 | .type = TX_TYPE(MDCT), \ | ||
| 1585 | .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY, \ | ||
| 1586 | .factors = { N, TX_FACTOR_ANY }, \ | ||
| 1587 | .nb_factors = 2, \ | ||
| 1588 | .min_len = N*2, \ | ||
| 1589 | .max_len = TX_LEN_UNLIMITED, \ | ||
| 1590 | .init = TX_NAME(ff_tx_mdct_pfa_init), \ | ||
| 1591 | .cpu_flags = FF_TX_CPU_FLAGS_ALL, \ | ||
| 1592 | .prio = FF_TX_PRIO_BASE, \ | ||
| 1593 | }; | ||
| 1594 | |||
| 1595 | ✗ | DECL_COMP_MDCT(3) | |
| 1596 | ✗ | DECL_COMP_MDCT(5) | |
| 1597 | ✗ | DECL_COMP_MDCT(7) | |
| 1598 | ✗ | DECL_COMP_MDCT(9) | |
| 1599 | ✗ | DECL_COMP_MDCT(15) | |
| 1600 | |||
| 1601 | 59 | static av_cold int TX_NAME(ff_tx_rdft_init)(AVTXContext *s, | |
| 1602 | const FFTXCodelet *cd, | ||
| 1603 | uint64_t flags, | ||
| 1604 | FFTXCodeletOptions *opts, | ||
| 1605 | int len, int inv, | ||
| 1606 | const void *scale) | ||
| 1607 | { | ||
| 1608 | int ret; | ||
| 1609 | double f, m; | ||
| 1610 | TXSample *tab; | ||
| 1611 | 59 | uint64_t r2r = flags & AV_TX_REAL_TO_REAL; | |
| 1612 | 59 | int len4 = FFALIGN(len, 4) / 4; | |
| 1613 | |||
| 1614 | 59 | s->scale_d = *((SCALE_TYPE *)scale); | |
| 1615 | 59 | s->scale_f = s->scale_d; | |
| 1616 | |||
| 1617 | 59 | flags &= ~(AV_TX_REAL_TO_REAL | AV_TX_REAL_TO_IMAGINARY); | |
| 1618 | |||
| 1619 |
1/2✗ Branch 1 not taken.
✓ Branch 2 taken 59 times.
|
59 | if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, NULL, len >> 1, inv, scale))) |
| 1620 | ✗ | return ret; | |
| 1621 | |||
| 1622 |
1/2✗ Branch 1 not taken.
✓ Branch 2 taken 59 times.
|
59 | if (!(s->exp = av_mallocz((8 + 2*len4)*sizeof(*s->exp)))) |
| 1623 | ✗ | return AVERROR(ENOMEM); | |
| 1624 | |||
| 1625 | 59 | tab = (TXSample *)s->exp; | |
| 1626 | |||
| 1627 | 59 | f = 2*M_PI/len; | |
| 1628 | |||
| 1629 |
2/2✓ Branch 0 taken 29 times.
✓ Branch 1 taken 30 times.
|
59 | m = (inv ? 2*s->scale_d : s->scale_d); |
| 1630 | |||
| 1631 |
2/2✓ Branch 0 taken 29 times.
✓ Branch 1 taken 30 times.
|
59 | *tab++ = RESCALE((inv ? 0.5 : 1.0) * m); |
| 1632 |
2/2✓ Branch 0 taken 29 times.
✓ Branch 1 taken 30 times.
|
59 | *tab++ = RESCALE(inv ? 0.5*m : 1.0*m); |
| 1633 | 59 | *tab++ = RESCALE( m); | |
| 1634 | 59 | *tab++ = RESCALE(-m); | |
| 1635 | |||
| 1636 | 59 | *tab++ = RESCALE( (0.5 - 0.0) * m); | |
| 1637 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 51 times.
|
59 | if (r2r) |
| 1638 | 8 | *tab++ = 1 / s->scale_f; | |
| 1639 | else | ||
| 1640 | 51 | *tab++ = RESCALE( (0.0 - 0.5) * m); | |
| 1641 | 59 | *tab++ = RESCALE( (0.5 - inv) * m); | |
| 1642 | 59 | *tab++ = RESCALE(-(0.5 - inv) * m); | |
| 1643 | |||
| 1644 |
2/2✓ Branch 0 taken 46088 times.
✓ Branch 1 taken 59 times.
|
46147 | for (int i = 0; i < len4; i++) |
| 1645 | 46088 | *tab++ = RESCALE(cos(i*f)); | |
| 1646 | |||
| 1647 | 59 | tab = ((TXSample *)s->exp) + len4 + 8; | |
| 1648 | |||
| 1649 |
2/2✓ Branch 0 taken 46088 times.
✓ Branch 1 taken 59 times.
|
46147 | for (int i = 0; i < len4; i++) |
| 1650 |
2/2✓ Branch 0 taken 35072 times.
✓ Branch 1 taken 11016 times.
|
46088 | *tab++ = RESCALE(cos(((len - i*4)/4.0)*f)) * (inv ? 1 : -1); |
| 1651 | |||
| 1652 | 59 | return 0; | |
| 1653 | } | ||
| 1654 | |||
| 1655 | #define DECL_RDFT(n, inv) \ | ||
| 1656 | static void TX_NAME(ff_tx_rdft_ ##n)(AVTXContext *s, void *_dst, \ | ||
| 1657 | void *_src, ptrdiff_t stride) \ | ||
| 1658 | { \ | ||
| 1659 | const int len2 = s->len >> 1; \ | ||
| 1660 | const int len4 = s->len >> 2; \ | ||
| 1661 | const TXSample *fact = (void *)s->exp; \ | ||
| 1662 | const TXSample *tcos = fact + 8; \ | ||
| 1663 | const TXSample *tsin = tcos + len4; \ | ||
| 1664 | TXComplex *data = inv ? _src : _dst; \ | ||
| 1665 | TXComplex t[3]; \ | ||
| 1666 | \ | ||
| 1667 | if (!inv) \ | ||
| 1668 | s->fn[0](&s->sub[0], data, _src, sizeof(TXComplex)); \ | ||
| 1669 | else \ | ||
| 1670 | data[0].im = data[len2].re; \ | ||
| 1671 | \ | ||
| 1672 | /* The DC value's both components are real, but we need to change them \ | ||
| 1673 | * into complex values. Also, the middle of the array is special-cased. \ | ||
| 1674 | * These operations can be done before or after the loop. */ \ | ||
| 1675 | t[0].re = data[0].re; \ | ||
| 1676 | data[0].re = t[0].re + data[0].im; \ | ||
| 1677 | data[0].im = t[0].re - data[0].im; \ | ||
| 1678 | data[ 0].re = MULT(fact[0], data[ 0].re); \ | ||
| 1679 | data[ 0].im = MULT(fact[1], data[ 0].im); \ | ||
| 1680 | data[len4].re = MULT(fact[2], data[len4].re); \ | ||
| 1681 | data[len4].im = MULT(fact[3], data[len4].im); \ | ||
| 1682 | \ | ||
| 1683 | for (int i = 1; i < len4; i++) { \ | ||
| 1684 | /* Separate even and odd FFTs */ \ | ||
| 1685 | t[0].re = MULT(fact[4], (data[i].re + data[len2 - i].re)); \ | ||
| 1686 | t[0].im = MULT(fact[5], (data[i].im - data[len2 - i].im)); \ | ||
| 1687 | t[1].re = MULT(fact[6], (data[i].im + data[len2 - i].im)); \ | ||
| 1688 | t[1].im = MULT(fact[7], (data[i].re - data[len2 - i].re)); \ | ||
| 1689 | \ | ||
| 1690 | /* Apply twiddle factors to the odd FFT and add to the even FFT */ \ | ||
| 1691 | CMUL(t[2].re, t[2].im, t[1].re, t[1].im, tcos[i], tsin[i]); \ | ||
| 1692 | \ | ||
| 1693 | data[ i].re = t[0].re + t[2].re; \ | ||
| 1694 | data[ i].im = t[2].im - t[0].im; \ | ||
| 1695 | data[len2 - i].re = t[0].re - t[2].re; \ | ||
| 1696 | data[len2 - i].im = t[2].im + t[0].im; \ | ||
| 1697 | } \ | ||
| 1698 | \ | ||
| 1699 | if (inv) { \ | ||
| 1700 | s->fn[0](&s->sub[0], _dst, data, sizeof(TXComplex)); \ | ||
| 1701 | } else { \ | ||
| 1702 | /* Move [0].im to the last position, as convention requires */ \ | ||
| 1703 | data[len2].re = data[0].im; \ | ||
| 1704 | data[ 0].im = data[len2].im = 0; \ | ||
| 1705 | } \ | ||
| 1706 | } \ | ||
| 1707 | \ | ||
| 1708 | static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = { \ | ||
| 1709 | .name = TX_NAME_STR("rdft_" #n), \ | ||
| 1710 | .function = TX_NAME(ff_tx_rdft_ ##n), \ | ||
| 1711 | .type = TX_TYPE(RDFT), \ | ||
| 1712 | .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE | \ | ||
| 1713 | (inv ? FF_TX_INVERSE_ONLY : FF_TX_FORWARD_ONLY), \ | ||
| 1714 | .factors = { 4, TX_FACTOR_ANY }, \ | ||
| 1715 | .nb_factors = 2, \ | ||
| 1716 | .min_len = 4, \ | ||
| 1717 | .max_len = TX_LEN_UNLIMITED, \ | ||
| 1718 | .init = TX_NAME(ff_tx_rdft_init), \ | ||
| 1719 | .cpu_flags = FF_TX_CPU_FLAGS_ALL, \ | ||
| 1720 | .prio = FF_TX_PRIO_BASE, \ | ||
| 1721 | }; | ||
| 1722 | |||
| 1723 |
2/2✓ Branch 1 taken 1701749 times.
✓ Branch 2 taken 17867 times.
|
1719616 | DECL_RDFT(r2c, 0) |
| 1724 |
2/2✓ Branch 0 taken 2096993 times.
✓ Branch 1 taken 16927 times.
|
2113920 | DECL_RDFT(c2r, 1) |
| 1725 | |||
| 1726 | #define DECL_RDFT_HALF(n, mode, mod2) \ | ||
| 1727 | static void TX_NAME(ff_tx_rdft_ ##n)(AVTXContext *s, void *_dst, \ | ||
| 1728 | void *_src, ptrdiff_t stride) \ | ||
| 1729 | { \ | ||
| 1730 | const int len = s->len; \ | ||
| 1731 | const int len2 = len >> 1; \ | ||
| 1732 | const int len4 = len >> 2; \ | ||
| 1733 | const int aligned_len4 = FFALIGN(len, 4)/4; \ | ||
| 1734 | const TXSample *fact = (void *)s->exp; \ | ||
| 1735 | const TXSample *tcos = fact + 8; \ | ||
| 1736 | const TXSample *tsin = tcos + aligned_len4; \ | ||
| 1737 | TXComplex *data = _dst; \ | ||
| 1738 | TXSample *out = _dst; /* Half-complex is forward-only */ \ | ||
| 1739 | TXSample tmp_dc; \ | ||
| 1740 | av_unused TXSample tmp_mid; \ | ||
| 1741 | TXSample tmp[4]; \ | ||
| 1742 | TXComplex sf, sl; \ | ||
| 1743 | \ | ||
| 1744 | s->fn[0](&s->sub[0], _dst, _src, sizeof(TXComplex)); \ | ||
| 1745 | \ | ||
| 1746 | tmp_dc = data[0].re; \ | ||
| 1747 | data[ 0].re = tmp_dc + data[0].im; \ | ||
| 1748 | tmp_dc = tmp_dc - data[0].im; \ | ||
| 1749 | \ | ||
| 1750 | data[ 0].re = MULT(fact[0], data[ 0].re); \ | ||
| 1751 | tmp_dc = MULT(fact[1], tmp_dc); \ | ||
| 1752 | data[len4].re = MULT(fact[2], data[len4].re); \ | ||
| 1753 | \ | ||
| 1754 | if (!mod2) { \ | ||
| 1755 | data[len4].im = MULT(fact[3], data[len4].im); \ | ||
| 1756 | } else { \ | ||
| 1757 | sf = data[len4]; \ | ||
| 1758 | sl = data[len4 + 1]; \ | ||
| 1759 | if (mode == AV_TX_REAL_TO_REAL) \ | ||
| 1760 | tmp[0] = MULT(fact[4], (sf.re + sl.re)); \ | ||
| 1761 | else \ | ||
| 1762 | tmp[0] = MULT(fact[5], (sf.im - sl.im)); \ | ||
| 1763 | tmp[1] = MULT(fact[6], (sf.im + sl.im)); \ | ||
| 1764 | tmp[2] = MULT(fact[7], (sf.re - sl.re)); \ | ||
| 1765 | \ | ||
| 1766 | if (mode == AV_TX_REAL_TO_REAL) { \ | ||
| 1767 | tmp[3] = tmp[1]*tcos[len4] - tmp[2]*tsin[len4]; \ | ||
| 1768 | tmp_mid = (tmp[0] - tmp[3]); \ | ||
| 1769 | } else { \ | ||
| 1770 | tmp[3] = tmp[1]*tsin[len4] + tmp[2]*tcos[len4]; \ | ||
| 1771 | tmp_mid = (tmp[0] + tmp[3]); \ | ||
| 1772 | } \ | ||
| 1773 | } \ | ||
| 1774 | \ | ||
| 1775 | /* NOTE: unrolling this breaks non-mod8 lengths */ \ | ||
| 1776 | for (int i = 1; i <= len4; i++) { \ | ||
| 1777 | TXSample tmp[4]; \ | ||
| 1778 | TXComplex sf = data[i]; \ | ||
| 1779 | TXComplex sl = data[len2 - i]; \ | ||
| 1780 | \ | ||
| 1781 | if (mode == AV_TX_REAL_TO_REAL) \ | ||
| 1782 | tmp[0] = MULT(fact[4], (sf.re + sl.re)); \ | ||
| 1783 | else \ | ||
| 1784 | tmp[0] = MULT(fact[5], (sf.im - sl.im)); \ | ||
| 1785 | \ | ||
| 1786 | tmp[1] = MULT(fact[6], (sf.im + sl.im)); \ | ||
| 1787 | tmp[2] = MULT(fact[7], (sf.re - sl.re)); \ | ||
| 1788 | \ | ||
| 1789 | if (mode == AV_TX_REAL_TO_REAL) { \ | ||
| 1790 | tmp[3] = tmp[1]*tcos[i] - tmp[2]*tsin[i]; \ | ||
| 1791 | out[i] = (tmp[0] + tmp[3]); \ | ||
| 1792 | out[len - i] = (tmp[0] - tmp[3]); \ | ||
| 1793 | } else { \ | ||
| 1794 | tmp[3] = tmp[1]*tsin[i] + tmp[2]*tcos[i]; \ | ||
| 1795 | out[i - 1] = (tmp[3] - tmp[0]); \ | ||
| 1796 | out[len - i - 1] = (tmp[0] + tmp[3]); \ | ||
| 1797 | } \ | ||
| 1798 | } \ | ||
| 1799 | \ | ||
| 1800 | for (int i = 1; i < (len4 + (mode == AV_TX_REAL_TO_IMAGINARY)); i++) \ | ||
| 1801 | out[len2 - i] = out[len - i]; \ | ||
| 1802 | \ | ||
| 1803 | if (mode == AV_TX_REAL_TO_REAL) { \ | ||
| 1804 | out[len2] = tmp_dc; \ | ||
| 1805 | if (mod2) \ | ||
| 1806 | out[len4 + 1] = tmp_mid * fact[5]; \ | ||
| 1807 | } else if (mod2) { \ | ||
| 1808 | out[len4] = tmp_mid; \ | ||
| 1809 | } \ | ||
| 1810 | } \ | ||
| 1811 | \ | ||
| 1812 | static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = { \ | ||
| 1813 | .name = TX_NAME_STR("rdft_" #n), \ | ||
| 1814 | .function = TX_NAME(ff_tx_rdft_ ##n), \ | ||
| 1815 | .type = TX_TYPE(RDFT), \ | ||
| 1816 | .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | mode | \ | ||
| 1817 | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY, \ | ||
| 1818 | .factors = { 2 + 2*(!mod2), TX_FACTOR_ANY }, \ | ||
| 1819 | .nb_factors = 2, \ | ||
| 1820 | .min_len = 2 + 2*(!mod2), \ | ||
| 1821 | .max_len = TX_LEN_UNLIMITED, \ | ||
| 1822 | .init = TX_NAME(ff_tx_rdft_init), \ | ||
| 1823 | .cpu_flags = FF_TX_CPU_FLAGS_ALL, \ | ||
| 1824 | .prio = FF_TX_PRIO_BASE, \ | ||
| 1825 | }; | ||
| 1826 | |||
| 1827 | ✗ | DECL_RDFT_HALF(r2r, AV_TX_REAL_TO_REAL, 0) | |
| 1828 |
4/4✓ Branch 1 taken 174034 times.
✓ Branch 2 taken 5614 times.
✓ Branch 3 taken 168420 times.
✓ Branch 4 taken 5614 times.
|
348068 | DECL_RDFT_HALF(r2r_mod2, AV_TX_REAL_TO_REAL, 1) |
| 1829 | ✗ | DECL_RDFT_HALF(r2i, AV_TX_REAL_TO_IMAGINARY, 0) | |
| 1830 |
4/4✓ Branch 1 taken 179648 times.
✓ Branch 2 taken 5614 times.
✓ Branch 3 taken 179648 times.
✓ Branch 4 taken 5614 times.
|
364910 | DECL_RDFT_HALF(r2i_mod2, AV_TX_REAL_TO_IMAGINARY, 1) |
| 1831 | |||
| 1832 | 4 | static av_cold int TX_NAME(ff_tx_dct_init)(AVTXContext *s, | |
| 1833 | const FFTXCodelet *cd, | ||
| 1834 | uint64_t flags, | ||
| 1835 | FFTXCodeletOptions *opts, | ||
| 1836 | int len, int inv, | ||
| 1837 | const void *scale) | ||
| 1838 | { | ||
| 1839 | int ret; | ||
| 1840 | double freq; | ||
| 1841 | TXSample *tab; | ||
| 1842 | 4 | SCALE_TYPE rsc = *((SCALE_TYPE *)scale); | |
| 1843 | |||
| 1844 |
1/2✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
|
4 | if (inv) { |
| 1845 | 4 | len *= 2; | |
| 1846 | 4 | s->len *= 2; | |
| 1847 | 4 | rsc *= 0.5; | |
| 1848 | } | ||
| 1849 | |||
| 1850 |
1/2✗ Branch 1 not taken.
✓ Branch 2 taken 4 times.
|
4 | if ((ret = ff_tx_init_subtx(s, TX_TYPE(RDFT), flags, NULL, len, inv, &rsc))) |
| 1851 | ✗ | return ret; | |
| 1852 | |||
| 1853 | 4 | s->exp = av_malloc((len/2)*3*sizeof(TXSample)); | |
| 1854 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 4 times.
|
4 | if (!s->exp) |
| 1855 | ✗ | return AVERROR(ENOMEM); | |
| 1856 | |||
| 1857 | 4 | tab = (TXSample *)s->exp; | |
| 1858 | |||
| 1859 | 4 | freq = M_PI/(len*2); | |
| 1860 | |||
| 1861 |
2/2✓ Branch 0 taken 6144 times.
✓ Branch 1 taken 4 times.
|
6148 | for (int i = 0; i < len; i++) |
| 1862 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6144 times.
|
6144 | tab[i] = RESCALE(cos(i*freq)*(!inv + 1)); |
| 1863 | |||
| 1864 |
1/2✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
|
4 | if (inv) { |
| 1865 |
2/2✓ Branch 0 taken 3072 times.
✓ Branch 1 taken 4 times.
|
3076 | for (int i = 0; i < len/2; i++) |
| 1866 | 3072 | tab[len + i] = RESCALE(0.5 / sin((2*i + 1)*freq)); | |
| 1867 | } else { | ||
| 1868 | ✗ | for (int i = 0; i < len/2; i++) | |
| 1869 | ✗ | tab[len + i] = RESCALE(cos((len - 2*i - 1)*freq)); | |
| 1870 | } | ||
| 1871 | |||
| 1872 | 4 | return 0; | |
| 1873 | } | ||
| 1874 | |||
| 1875 | ✗ | static void TX_NAME(ff_tx_dctII)(AVTXContext *s, void *_dst, | |
| 1876 | void *_src, ptrdiff_t stride) | ||
| 1877 | { | ||
| 1878 | ✗ | TXSample *dst = _dst; | |
| 1879 | ✗ | TXSample *src = _src; | |
| 1880 | ✗ | const int len = s->len; | |
| 1881 | ✗ | const int len2 = len >> 1; | |
| 1882 | ✗ | const TXSample *exp = (void *)s->exp; | |
| 1883 | TXSample next; | ||
| 1884 | #ifdef TX_INT32 | ||
| 1885 | int64_t tmp1, tmp2; | ||
| 1886 | #else | ||
| 1887 | TXSample tmp1, tmp2; | ||
| 1888 | #endif | ||
| 1889 | |||
| 1890 | ✗ | for (int i = 0; i < len2; i++) { | |
| 1891 | ✗ | TXSample in1 = src[i]; | |
| 1892 | ✗ | TXSample in2 = src[len - i - 1]; | |
| 1893 | ✗ | TXSample s = exp[len + i]; | |
| 1894 | |||
| 1895 | #ifdef TX_INT32 | ||
| 1896 | ✗ | tmp1 = in1 + in2; | |
| 1897 | ✗ | tmp2 = in1 - in2; | |
| 1898 | |||
| 1899 | ✗ | tmp1 >>= 1; | |
| 1900 | ✗ | tmp2 *= s; | |
| 1901 | |||
| 1902 | ✗ | tmp2 = (tmp2 + 0x40000000) >> 31; | |
| 1903 | #else | ||
| 1904 | ✗ | tmp1 = (in1 + in2)*0.5; | |
| 1905 | ✗ | tmp2 = (in1 - in2)*s; | |
| 1906 | #endif | ||
| 1907 | |||
| 1908 | ✗ | src[i] = tmp1 + tmp2; | |
| 1909 | ✗ | src[len - i - 1] = tmp1 - tmp2; | |
| 1910 | } | ||
| 1911 | |||
| 1912 | ✗ | s->fn[0](&s->sub[0], dst, src, sizeof(TXComplex)); | |
| 1913 | |||
| 1914 | ✗ | next = dst[len]; | |
| 1915 | |||
| 1916 | ✗ | for (int i = len - 2; i > 0; i -= 2) { | |
| 1917 | TXSample tmp; | ||
| 1918 | |||
| 1919 | ✗ | CMUL(tmp, dst[i], exp[len - i], exp[i], dst[i + 0], dst[i + 1]); | |
| 1920 | |||
| 1921 | ✗ | dst[i + 1] = next; | |
| 1922 | |||
| 1923 | ✗ | next += tmp; | |
| 1924 | } | ||
| 1925 | |||
| 1926 | #ifdef TX_INT32 | ||
| 1927 | ✗ | tmp1 = ((int64_t)exp[0]) * ((int64_t)dst[0]); | |
| 1928 | ✗ | dst[0] = (tmp1 + 0x40000000) >> 31; | |
| 1929 | #else | ||
| 1930 | ✗ | dst[0] = exp[0] * dst[0]; | |
| 1931 | #endif | ||
| 1932 | ✗ | dst[1] = next; | |
| 1933 | ✗ | } | |
| 1934 | |||
| 1935 | 238 | static void TX_NAME(ff_tx_dctIII)(AVTXContext *s, void *_dst, | |
| 1936 | void *_src, ptrdiff_t stride) | ||
| 1937 | { | ||
| 1938 | 238 | TXSample *dst = _dst; | |
| 1939 | 238 | TXSample *src = _src; | |
| 1940 | 238 | const int len = s->len; | |
| 1941 | 238 | const int len2 = len >> 1; | |
| 1942 | 238 | const TXSample *exp = (void *)s->exp; | |
| 1943 | #ifdef TX_INT32 | ||
| 1944 | ✗ | int64_t tmp1, tmp2 = src[len - 1]; | |
| 1945 | ✗ | tmp2 = (2*tmp2 + 0x40000000) >> 31; | |
| 1946 | #else | ||
| 1947 | 238 | TXSample tmp1, tmp2 = 2*src[len - 1]; | |
| 1948 | #endif | ||
| 1949 | |||
| 1950 | 238 | src[len] = tmp2; | |
| 1951 | |||
| 1952 |
2/2✓ Branch 0 taken 121618 times.
✓ Branch 1 taken 238 times.
|
121856 | for (int i = len - 2; i >= 2; i -= 2) { |
| 1953 | 121618 | TXSample val1 = src[i - 0]; | |
| 1954 | 121618 | TXSample val2 = src[i - 1] - src[i + 1]; | |
| 1955 | |||
| 1956 | 121618 | CMUL(src[i + 1], src[i], exp[len - i], exp[i], val1, val2); | |
| 1957 | } | ||
| 1958 | |||
| 1959 | 238 | s->fn[0](&s->sub[0], dst, src, sizeof(float)); | |
| 1960 | |||
| 1961 |
2/2✓ Branch 0 taken 121856 times.
✓ Branch 1 taken 238 times.
|
122094 | for (int i = 0; i < len2; i++) { |
| 1962 | 121856 | TXSample in1 = dst[i]; | |
| 1963 | 121856 | TXSample in2 = dst[len - i - 1]; | |
| 1964 | 121856 | TXSample c = exp[len + i]; | |
| 1965 | |||
| 1966 | 121856 | tmp1 = in1 + in2; | |
| 1967 | 121856 | tmp2 = in1 - in2; | |
| 1968 | 121856 | tmp2 *= c; | |
| 1969 | #ifdef TX_INT32 | ||
| 1970 | ✗ | tmp2 = (tmp2 + 0x40000000) >> 31; | |
| 1971 | #endif | ||
| 1972 | |||
| 1973 | 121856 | dst[i] = tmp1 + tmp2; | |
| 1974 | 121856 | dst[len - i - 1] = tmp1 - tmp2; | |
| 1975 | } | ||
| 1976 | 238 | } | |
| 1977 | |||
| 1978 | static const FFTXCodelet TX_NAME(ff_tx_dctII_def) = { | ||
| 1979 | .name = TX_NAME_STR("dctII"), | ||
| 1980 | .function = TX_NAME(ff_tx_dctII), | ||
| 1981 | .type = TX_TYPE(DCT), | ||
| 1982 | .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | | ||
| 1983 | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY, | ||
| 1984 | .factors = { 2, TX_FACTOR_ANY }, | ||
| 1985 | .min_len = 2, | ||
| 1986 | .max_len = TX_LEN_UNLIMITED, | ||
| 1987 | .init = TX_NAME(ff_tx_dct_init), | ||
| 1988 | .cpu_flags = FF_TX_CPU_FLAGS_ALL, | ||
| 1989 | .prio = FF_TX_PRIO_BASE, | ||
| 1990 | }; | ||
| 1991 | |||
| 1992 | static const FFTXCodelet TX_NAME(ff_tx_dctIII_def) = { | ||
| 1993 | .name = TX_NAME_STR("dctIII"), | ||
| 1994 | .function = TX_NAME(ff_tx_dctIII), | ||
| 1995 | .type = TX_TYPE(DCT), | ||
| 1996 | .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | | ||
| 1997 | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY, | ||
| 1998 | .factors = { 2, TX_FACTOR_ANY }, | ||
| 1999 | .min_len = 2, | ||
| 2000 | .max_len = TX_LEN_UNLIMITED, | ||
| 2001 | .init = TX_NAME(ff_tx_dct_init), | ||
| 2002 | .cpu_flags = FF_TX_CPU_FLAGS_ALL, | ||
| 2003 | .prio = FF_TX_PRIO_BASE, | ||
| 2004 | }; | ||
| 2005 | |||
| 2006 | 16 | static av_cold int TX_NAME(ff_tx_dcstI_init)(AVTXContext *s, | |
| 2007 | const FFTXCodelet *cd, | ||
| 2008 | uint64_t flags, | ||
| 2009 | FFTXCodeletOptions *opts, | ||
| 2010 | int len, int inv, | ||
| 2011 | const void *scale) | ||
| 2012 | { | ||
| 2013 | int ret; | ||
| 2014 | 16 | SCALE_TYPE rsc = *((SCALE_TYPE *)scale); | |
| 2015 | |||
| 2016 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 16 times.
|
16 | if (inv) { |
| 2017 | ✗ | len *= 2; | |
| 2018 | ✗ | s->len *= 2; | |
| 2019 | ✗ | rsc *= 0.5; | |
| 2020 | } | ||
| 2021 | |||
| 2022 | /* We want a half-complex RDFT */ | ||
| 2023 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8 times.
|
16 | flags |= cd->type == TX_TYPE(DCT_I) ? AV_TX_REAL_TO_REAL : |
| 2024 | AV_TX_REAL_TO_IMAGINARY; | ||
| 2025 | |||
| 2026 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 16 times.
|
16 | if ((ret = ff_tx_init_subtx(s, TX_TYPE(RDFT), flags, NULL, |
| 2027 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8 times.
|
16 | (len - 1 + 2*(cd->type == TX_TYPE(DST_I)))*2, |
| 2028 | 0, &rsc))) | ||
| 2029 | ✗ | return ret; | |
| 2030 | |||
| 2031 | 16 | s->tmp = av_mallocz((len + 1)*2*sizeof(TXSample)); | |
| 2032 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 16 times.
|
16 | if (!s->tmp) |
| 2033 | ✗ | return AVERROR(ENOMEM); | |
| 2034 | |||
| 2035 | 16 | return 0; | |
| 2036 | } | ||
| 2037 | |||
| 2038 | 5614 | static void TX_NAME(ff_tx_dctI)(AVTXContext *s, void *_dst, | |
| 2039 | void *_src, ptrdiff_t stride) | ||
| 2040 | { | ||
| 2041 | 5614 | TXSample *dst = _dst; | |
| 2042 | 5614 | TXSample *src = _src; | |
| 2043 | 5614 | const int len = s->len - 1; | |
| 2044 | 5614 | TXSample *tmp = (TXSample *)s->tmp; | |
| 2045 | |||
| 2046 | 5614 | stride /= sizeof(TXSample); | |
| 2047 | |||
| 2048 |
2/2✓ Branch 0 taken 353682 times.
✓ Branch 1 taken 5614 times.
|
359296 | for (int i = 0; i < len; i++) |
| 2049 | 353682 | tmp[i] = tmp[2*len - i] = src[i * stride]; | |
| 2050 | |||
| 2051 | 5614 | tmp[len] = src[len * stride]; /* Middle */ | |
| 2052 | |||
| 2053 | 5614 | s->fn[0](&s->sub[0], dst, tmp, sizeof(TXSample)); | |
| 2054 | 5614 | } | |
| 2055 | |||
| 2056 | 5614 | static void TX_NAME(ff_tx_dstI)(AVTXContext *s, void *_dst, | |
| 2057 | void *_src, ptrdiff_t stride) | ||
| 2058 | { | ||
| 2059 | 5614 | TXSample *dst = _dst; | |
| 2060 | 5614 | TXSample *src = _src; | |
| 2061 | 5614 | const int len = s->len + 1; | |
| 2062 | 5614 | TXSample *tmp = (void *)s->tmp; | |
| 2063 | |||
| 2064 | 5614 | stride /= sizeof(TXSample); | |
| 2065 | |||
| 2066 | 5614 | tmp[0] = 0; | |
| 2067 | |||
| 2068 |
2/2✓ Branch 0 taken 359296 times.
✓ Branch 1 taken 5614 times.
|
364910 | for (int i = 1; i < len; i++) { |
| 2069 | 359296 | TXSample a = src[(i - 1) * stride]; | |
| 2070 | 359296 | tmp[i] = -a; | |
| 2071 | 359296 | tmp[2*len - i] = a; | |
| 2072 | } | ||
| 2073 | |||
| 2074 | 5614 | tmp[len] = 0; /* i == n, Nyquist */ | |
| 2075 | |||
| 2076 | 5614 | s->fn[0](&s->sub[0], dst, tmp, sizeof(float)); | |
| 2077 | 5614 | } | |
| 2078 | |||
| 2079 | static const FFTXCodelet TX_NAME(ff_tx_dctI_def) = { | ||
| 2080 | .name = TX_NAME_STR("dctI"), | ||
| 2081 | .function = TX_NAME(ff_tx_dctI), | ||
| 2082 | .type = TX_TYPE(DCT_I), | ||
| 2083 | .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE, | ||
| 2084 | .factors = { 2, TX_FACTOR_ANY }, | ||
| 2085 | .nb_factors = 2, | ||
| 2086 | .min_len = 2, | ||
| 2087 | .max_len = TX_LEN_UNLIMITED, | ||
| 2088 | .init = TX_NAME(ff_tx_dcstI_init), | ||
| 2089 | .cpu_flags = FF_TX_CPU_FLAGS_ALL, | ||
| 2090 | .prio = FF_TX_PRIO_BASE, | ||
| 2091 | }; | ||
| 2092 | |||
| 2093 | static const FFTXCodelet TX_NAME(ff_tx_dstI_def) = { | ||
| 2094 | .name = TX_NAME_STR("dstI"), | ||
| 2095 | .function = TX_NAME(ff_tx_dstI), | ||
| 2096 | .type = TX_TYPE(DST_I), | ||
| 2097 | .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE, | ||
| 2098 | .factors = { 2, TX_FACTOR_ANY }, | ||
| 2099 | .nb_factors = 2, | ||
| 2100 | .min_len = 2, | ||
| 2101 | .max_len = TX_LEN_UNLIMITED, | ||
| 2102 | .init = TX_NAME(ff_tx_dcstI_init), | ||
| 2103 | .cpu_flags = FF_TX_CPU_FLAGS_ALL, | ||
| 2104 | .prio = FF_TX_PRIO_BASE, | ||
| 2105 | }; | ||
| 2106 | |||
| 2107 | 5212 | int TX_TAB(ff_tx_mdct_gen_exp)(AVTXContext *s, int *pre_tab) | |
| 2108 | { | ||
| 2109 | 5212 | int off = 0; | |
| 2110 | 5212 | int len4 = s->len >> 1; | |
| 2111 | 5212 | double scale = s->scale_d; | |
| 2112 |
2/2✓ Branch 0 taken 1459 times.
✓ Branch 1 taken 3753 times.
|
5212 | const double theta = (scale < 0 ? len4 : 0) + 1.0/8.0; |
| 2113 |
2/2✓ Branch 0 taken 4851 times.
✓ Branch 1 taken 361 times.
|
5212 | size_t alloc = pre_tab ? 2*len4 : len4; |
| 2114 | |||
| 2115 |
1/2✗ Branch 1 not taken.
✓ Branch 2 taken 5212 times.
|
5212 | if (!(s->exp = av_malloc_array(alloc, sizeof(*s->exp)))) |
| 2116 | ✗ | return AVERROR(ENOMEM); | |
| 2117 | |||
| 2118 | 5212 | scale = sqrt(fabs(scale)); | |
| 2119 | |||
| 2120 |
2/2✓ Branch 0 taken 4851 times.
✓ Branch 1 taken 361 times.
|
5212 | if (pre_tab) |
| 2121 | 4851 | off = len4; | |
| 2122 | |||
| 2123 |
2/2✓ Branch 0 taken 1265938 times.
✓ Branch 1 taken 5212 times.
|
1271150 | for (int i = 0; i < len4; i++) { |
| 2124 | 1265938 | const double alpha = M_PI_2 * (i + theta) / len4; | |
| 2125 | 1265938 | s->exp[off + i] = (TXComplex){ RESCALE(cos(alpha) * scale), | |
| 2126 | 1265938 | RESCALE(sin(alpha) * scale) }; | |
| 2127 | } | ||
| 2128 | |||
| 2129 |
2/2✓ Branch 0 taken 4851 times.
✓ Branch 1 taken 361 times.
|
5212 | if (pre_tab) |
| 2130 |
2/2✓ Branch 0 taken 1097058 times.
✓ Branch 1 taken 4851 times.
|
1101909 | for (int i = 0; i < len4; i++) |
| 2131 | 1097058 | s->exp[i] = s->exp[len4 + pre_tab[i]]; | |
| 2132 | |||
| 2133 | 5212 | return 0; | |
| 2134 | } | ||
| 2135 | |||
| 2136 | const FFTXCodelet * const TX_NAME(ff_tx_codelet_list)[] = { | ||
| 2137 | /* Split-Radix codelets */ | ||
| 2138 | &TX_NAME(ff_tx_fft2_ns_def), | ||
| 2139 | &TX_NAME(ff_tx_fft4_ns_def), | ||
| 2140 | &TX_NAME(ff_tx_fft8_ns_def), | ||
| 2141 | &TX_NAME(ff_tx_fft16_ns_def), | ||
| 2142 | &TX_NAME(ff_tx_fft32_ns_def), | ||
| 2143 | &TX_NAME(ff_tx_fft64_ns_def), | ||
| 2144 | &TX_NAME(ff_tx_fft128_ns_def), | ||
| 2145 | &TX_NAME(ff_tx_fft256_ns_def), | ||
| 2146 | &TX_NAME(ff_tx_fft512_ns_def), | ||
| 2147 | &TX_NAME(ff_tx_fft1024_ns_def), | ||
| 2148 | &TX_NAME(ff_tx_fft2048_ns_def), | ||
| 2149 | &TX_NAME(ff_tx_fft4096_ns_def), | ||
| 2150 | &TX_NAME(ff_tx_fft8192_ns_def), | ||
| 2151 | &TX_NAME(ff_tx_fft16384_ns_def), | ||
| 2152 | &TX_NAME(ff_tx_fft32768_ns_def), | ||
| 2153 | &TX_NAME(ff_tx_fft65536_ns_def), | ||
| 2154 | &TX_NAME(ff_tx_fft131072_ns_def), | ||
| 2155 | |||
| 2156 | /* Prime factor codelets */ | ||
| 2157 | &TX_NAME(ff_tx_fft3_ns_def), | ||
| 2158 | &TX_NAME(ff_tx_fft5_ns_def), | ||
| 2159 | &TX_NAME(ff_tx_fft7_ns_def), | ||
| 2160 | &TX_NAME(ff_tx_fft9_ns_def), | ||
| 2161 | &TX_NAME(ff_tx_fft15_ns_def), | ||
| 2162 | |||
| 2163 | /* We get these for free */ | ||
| 2164 | &TX_NAME(ff_tx_fft3_fwd_def), | ||
| 2165 | &TX_NAME(ff_tx_fft5_fwd_def), | ||
| 2166 | &TX_NAME(ff_tx_fft7_fwd_def), | ||
| 2167 | &TX_NAME(ff_tx_fft9_fwd_def), | ||
| 2168 | |||
| 2169 | /* Standalone transforms */ | ||
| 2170 | &TX_NAME(ff_tx_fft_def), | ||
| 2171 | &TX_NAME(ff_tx_fft_inplace_def), | ||
| 2172 | &TX_NAME(ff_tx_fft_inplace_small_def), | ||
| 2173 | &TX_NAME(ff_tx_fft_pfa_def), | ||
| 2174 | &TX_NAME(ff_tx_fft_pfa_ns_def), | ||
| 2175 | &TX_NAME(ff_tx_fft_naive_def), | ||
| 2176 | &TX_NAME(ff_tx_fft_naive_small_def), | ||
| 2177 | &TX_NAME(ff_tx_mdct_fwd_def), | ||
| 2178 | &TX_NAME(ff_tx_mdct_inv_def), | ||
| 2179 | &TX_NAME(ff_tx_mdct_pfa_3xM_fwd_def), | ||
| 2180 | &TX_NAME(ff_tx_mdct_pfa_5xM_fwd_def), | ||
| 2181 | &TX_NAME(ff_tx_mdct_pfa_7xM_fwd_def), | ||
| 2182 | &TX_NAME(ff_tx_mdct_pfa_9xM_fwd_def), | ||
| 2183 | &TX_NAME(ff_tx_mdct_pfa_15xM_fwd_def), | ||
| 2184 | &TX_NAME(ff_tx_mdct_pfa_3xM_inv_def), | ||
| 2185 | &TX_NAME(ff_tx_mdct_pfa_5xM_inv_def), | ||
| 2186 | &TX_NAME(ff_tx_mdct_pfa_7xM_inv_def), | ||
| 2187 | &TX_NAME(ff_tx_mdct_pfa_9xM_inv_def), | ||
| 2188 | &TX_NAME(ff_tx_mdct_pfa_15xM_inv_def), | ||
| 2189 | &TX_NAME(ff_tx_mdct_naive_fwd_def), | ||
| 2190 | &TX_NAME(ff_tx_mdct_naive_inv_def), | ||
| 2191 | &TX_NAME(ff_tx_mdct_inv_full_def), | ||
| 2192 | &TX_NAME(ff_tx_rdft_r2c_def), | ||
| 2193 | &TX_NAME(ff_tx_rdft_r2r_def), | ||
| 2194 | &TX_NAME(ff_tx_rdft_r2r_mod2_def), | ||
| 2195 | &TX_NAME(ff_tx_rdft_r2i_def), | ||
| 2196 | &TX_NAME(ff_tx_rdft_r2i_mod2_def), | ||
| 2197 | &TX_NAME(ff_tx_rdft_c2r_def), | ||
| 2198 | &TX_NAME(ff_tx_dctII_def), | ||
| 2199 | &TX_NAME(ff_tx_dctIII_def), | ||
| 2200 | &TX_NAME(ff_tx_dctI_def), | ||
| 2201 | &TX_NAME(ff_tx_dstI_def), | ||
| 2202 | |||
| 2203 | NULL, | ||
| 2204 | }; | ||
| 2205 |