Line |
Branch |
Exec |
Source |
1 |
|
|
/* |
2 |
|
|
* Copyright (c) 2012 Pavel Koshevoy <pkoshevoy at gmail dot com> |
3 |
|
|
* |
4 |
|
|
* This file is part of FFmpeg. |
5 |
|
|
* |
6 |
|
|
* FFmpeg is free software; you can redistribute it and/or |
7 |
|
|
* modify it under the terms of the GNU Lesser General Public |
8 |
|
|
* License as published by the Free Software Foundation; either |
9 |
|
|
* version 2.1 of the License, or (at your option) any later version. |
10 |
|
|
* |
11 |
|
|
* FFmpeg is distributed in the hope that it will be useful, |
12 |
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 |
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 |
|
|
* Lesser General Public License for more details. |
15 |
|
|
* |
16 |
|
|
* You should have received a copy of the GNU Lesser General Public |
17 |
|
|
* License along with FFmpeg; if not, write to the Free Software |
18 |
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
19 |
|
|
*/ |
20 |
|
|
|
21 |
|
|
/** |
22 |
|
|
* @file |
23 |
|
|
* tempo scaling audio filter -- an implementation of WSOLA algorithm |
24 |
|
|
* |
25 |
|
|
* Based on MIT licensed yaeAudioTempoFilter.h and yaeAudioFragment.h |
26 |
|
|
* from Apprentice Video player by Pavel Koshevoy. |
27 |
|
|
* https://sourceforge.net/projects/apprenticevideo/ |
28 |
|
|
* |
29 |
|
|
* An explanation of SOLA algorithm is available at |
30 |
|
|
* http://www.surina.net/article/time-and-pitch-scaling.html |
31 |
|
|
* |
32 |
|
|
* WSOLA is very similar to SOLA, only one major difference exists between |
33 |
|
|
* these algorithms. SOLA shifts audio fragments along the output stream, |
34 |
|
|
* whereas WSOLA shifts audio fragments along the input stream. |
35 |
|
|
* |
36 |
|
|
* The advantage of WSOLA algorithm is that the overlap region size is |
37 |
|
|
* always the same, therefore the blending function is constant and |
38 |
|
|
* can be precomputed. |
39 |
|
|
*/ |
40 |
|
|
|
41 |
|
|
#include <float.h> |
42 |
|
|
#include "libavutil/avassert.h" |
43 |
|
|
#include "libavutil/channel_layout.h" |
44 |
|
|
#include "libavutil/mem.h" |
45 |
|
|
#include "libavutil/opt.h" |
46 |
|
|
#include "libavutil/samplefmt.h" |
47 |
|
|
#include "libavutil/tx.h" |
48 |
|
|
#include "avfilter.h" |
49 |
|
|
#include "audio.h" |
50 |
|
|
#include "internal.h" |
51 |
|
|
|
52 |
|
|
/**
 * One windowed slice of the audio waveform.
 */
typedef struct AudioFragment {
    // where this fragment begins within the overall waveform, in samples:
    //   [0] position in the input stream
    //   [1] position in the output stream
    int64_t position[2];

    // the original packed (interleaved) multi-channel samples:
    uint8_t *data;

    // how many samples this fragment currently holds:
    int nsamples;

    // buffers for the rDFT of the mono down-mix of this fragment, used for
    // fast waveform alignment via correlation in the frequency domain;
    // xdat_in receives the down-mixed samples, xdat receives the transform:
    float *xdat_in;
    float *xdat;
} AudioFragment;
72 |
|
|
|
73 |
|
|
/**
 * The states the filter state machine moves through.
 */
typedef enum {
    YAE_LOAD_FRAGMENT      = 0, // fill the current fragment with input
    YAE_ADJUST_POSITION    = 1, // align current fragment with the previous
    YAE_RELOAD_FRAGMENT    = 2, // re-fill the fragment after a correction
    YAE_OUTPUT_OVERLAP_ADD = 3, // blend the overlap region into the output
    YAE_FLUSH_OUTPUT       = 4, // flushing buffered output
} FilterState;
83 |
|
|
|
84 |
|
|
/** |
85 |
|
|
* Filter state machine |
86 |
|
|
*/ |
87 |
|
|
typedef struct ATempoContext { |
88 |
|
|
const AVClass *class; |
89 |
|
|
|
90 |
|
|
// ring-buffer of input samples, necessary because some times |
91 |
|
|
// input fragment position may be adjusted backwards: |
92 |
|
|
uint8_t *buffer; |
93 |
|
|
|
94 |
|
|
// ring-buffer maximum capacity, expressed in sample rate time base: |
95 |
|
|
int ring; |
96 |
|
|
|
97 |
|
|
// ring-buffer house keeping: |
98 |
|
|
int size; |
99 |
|
|
int head; |
100 |
|
|
int tail; |
101 |
|
|
|
102 |
|
|
// 0: input sample position corresponding to the ring buffer tail |
103 |
|
|
// 1: output sample position |
104 |
|
|
int64_t position[2]; |
105 |
|
|
|
106 |
|
|
// first input timestamp, all other timestamps are offset by this one |
107 |
|
|
int64_t start_pts; |
108 |
|
|
|
109 |
|
|
// sample format: |
110 |
|
|
enum AVSampleFormat format; |
111 |
|
|
|
112 |
|
|
// number of channels: |
113 |
|
|
int channels; |
114 |
|
|
|
115 |
|
|
// row of bytes to skip from one sample to next, across multple channels; |
116 |
|
|
// stride = (number-of-channels * bits-per-sample-per-channel) / 8 |
117 |
|
|
int stride; |
118 |
|
|
|
119 |
|
|
// fragment window size, power-of-two integer: |
120 |
|
|
int window; |
121 |
|
|
|
122 |
|
|
// Hann window coefficients, for feathering |
123 |
|
|
// (blending) the overlapping fragment region: |
124 |
|
|
float *hann; |
125 |
|
|
|
126 |
|
|
// tempo scaling factor: |
127 |
|
|
double tempo; |
128 |
|
|
|
129 |
|
|
// a snapshot of previous fragment input and output position values |
130 |
|
|
// captured when the tempo scale factor was set most recently: |
131 |
|
|
int64_t origin[2]; |
132 |
|
|
|
133 |
|
|
// current/previous fragment ring-buffer: |
134 |
|
|
AudioFragment frag[2]; |
135 |
|
|
|
136 |
|
|
// current fragment index: |
137 |
|
|
uint64_t nfrag; |
138 |
|
|
|
139 |
|
|
// current state: |
140 |
|
|
FilterState state; |
141 |
|
|
|
142 |
|
|
// for fast correlation calculation in frequency domain: |
143 |
|
|
AVTXContext *real_to_complex; |
144 |
|
|
AVTXContext *complex_to_real; |
145 |
|
|
av_tx_fn r2c_fn, c2r_fn; |
146 |
|
|
float *correlation_in; |
147 |
|
|
float *correlation; |
148 |
|
|
|
149 |
|
|
// for managing AVFilterPad.request_frame and AVFilterPad.filter_frame |
150 |
|
|
AVFrame *dst_buffer; |
151 |
|
|
uint8_t *dst; |
152 |
|
|
uint8_t *dst_end; |
153 |
|
|
uint64_t nsamples_in; |
154 |
|
|
uint64_t nsamples_out; |
155 |
|
|
} ATempoContext; |
156 |
|
|
|
157 |
|
|
#define YAE_ATEMPO_MIN 0.5 |
158 |
|
|
#define YAE_ATEMPO_MAX 100.0 |
159 |
|
|
|
160 |
|
|
#define OFFSET(x) offsetof(ATempoContext, x) |
161 |
|
|
|
162 |
|
|
static const AVOption atempo_options[] = { |
163 |
|
|
{ "tempo", "set tempo scale factor", |
164 |
|
|
OFFSET(tempo), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, |
165 |
|
|
YAE_ATEMPO_MIN, |
166 |
|
|
YAE_ATEMPO_MAX, |
167 |
|
|
AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_RUNTIME_PARAM }, |
168 |
|
|
{ NULL } |
169 |
|
|
}; |
170 |
|
|
|
171 |
|
|
AVFILTER_DEFINE_CLASS(atempo); |
172 |
|
|
|
173 |
|
✗ |
/**
 * Pick the fragment slot for the current fragment; the two slots of
 * atempo->frag alternate with the parity of atempo->nfrag.
 */
static inline AudioFragment *yae_curr_frag(ATempoContext *atempo)
{
    const unsigned slot = (unsigned)(atempo->nfrag & 1);
    return atempo->frag + slot;
}
177 |
|
|
|
178 |
|
✗ |
/**
 * Pick the fragment slot for the previous fragment, i.e. the slot of
 * atempo->frag that yae_curr_frag() does not select.
 */
static inline AudioFragment *yae_prev_frag(ATempoContext *atempo)
{
    const unsigned slot = (unsigned)((atempo->nfrag + 1) & 1);
    return atempo->frag + slot;
}
182 |
|
|
|
183 |
|
|
/** |
184 |
|
|
* Reset filter to initial state, do not deallocate existing local buffers. |
185 |
|
|
*/ |
186 |
|
✗ |
static void yae_clear(ATempoContext *atempo) |
187 |
|
|
{ |
188 |
|
✗ |
atempo->size = 0; |
189 |
|
✗ |
atempo->head = 0; |
190 |
|
✗ |
atempo->tail = 0; |
191 |
|
|
|
192 |
|
✗ |
atempo->nfrag = 0; |
193 |
|
✗ |
atempo->state = YAE_LOAD_FRAGMENT; |
194 |
|
✗ |
atempo->start_pts = AV_NOPTS_VALUE; |
195 |
|
|
|
196 |
|
✗ |
atempo->position[0] = 0; |
197 |
|
✗ |
atempo->position[1] = 0; |
198 |
|
|
|
199 |
|
✗ |
atempo->origin[0] = 0; |
200 |
|
✗ |
atempo->origin[1] = 0; |
201 |
|
|
|
202 |
|
✗ |
atempo->frag[0].position[0] = 0; |
203 |
|
✗ |
atempo->frag[0].position[1] = 0; |
204 |
|
✗ |
atempo->frag[0].nsamples = 0; |
205 |
|
|
|
206 |
|
✗ |
atempo->frag[1].position[0] = 0; |
207 |
|
✗ |
atempo->frag[1].position[1] = 0; |
208 |
|
✗ |
atempo->frag[1].nsamples = 0; |
209 |
|
|
|
210 |
|
|
// shift left position of 1st fragment by half a window |
211 |
|
|
// so that no re-normalization would be required for |
212 |
|
|
// the left half of the 1st fragment: |
213 |
|
✗ |
atempo->frag[0].position[0] = -(int64_t)(atempo->window / 2); |
214 |
|
✗ |
atempo->frag[0].position[1] = -(int64_t)(atempo->window / 2); |
215 |
|
|
|
216 |
|
✗ |
av_frame_free(&atempo->dst_buffer); |
217 |
|
✗ |
atempo->dst = NULL; |
218 |
|
✗ |
atempo->dst_end = NULL; |
219 |
|
|
|
220 |
|
✗ |
atempo->nsamples_in = 0; |
221 |
|
✗ |
atempo->nsamples_out = 0; |
222 |
|
✗ |
} |
223 |
|
|
|
224 |
|
|
/** |
225 |
|
|
* Reset filter to initial state and deallocate all buffers. |
226 |
|
|
*/ |
227 |
|
✗ |
static void yae_release_buffers(ATempoContext *atempo) |
228 |
|
|
{ |
229 |
|
✗ |
yae_clear(atempo); |
230 |
|
|
|
231 |
|
✗ |
av_freep(&atempo->frag[0].data); |
232 |
|
✗ |
av_freep(&atempo->frag[1].data); |
233 |
|
✗ |
av_freep(&atempo->frag[0].xdat_in); |
234 |
|
✗ |
av_freep(&atempo->frag[1].xdat_in); |
235 |
|
✗ |
av_freep(&atempo->frag[0].xdat); |
236 |
|
✗ |
av_freep(&atempo->frag[1].xdat); |
237 |
|
|
|
238 |
|
✗ |
av_freep(&atempo->buffer); |
239 |
|
✗ |
av_freep(&atempo->hann); |
240 |
|
✗ |
av_freep(&atempo->correlation_in); |
241 |
|
✗ |
av_freep(&atempo->correlation); |
242 |
|
|
|
243 |
|
✗ |
av_tx_uninit(&atempo->real_to_complex); |
244 |
|
✗ |
av_tx_uninit(&atempo->complex_to_real); |
245 |
|
✗ |
} |
246 |
|
|
|
247 |
|
|
/*
 * Replace a buffer with a freshly allocated, zero-filled one.
 *
 * av_realloc is not aligned enough; fortunately the old contents do not
 * need to be preserved, so the buffer is simply freed and allocated anew.
 *
 * On allocation failure all filter buffers are released and the enclosing
 * function returns AVERROR(ENOMEM); a variable named 'atempo' must
 * therefore be in scope where the macro is used.
 */
#define RE_MALLOC_OR_FAIL(field, field_size, element_size)          \
    do {                                                            \
        av_freep(&(field));                                         \
        (field) = av_calloc((field_size), (element_size));          \
        if ((field) == NULL) {                                      \
            yae_release_buffers(atempo);                            \
            return AVERROR(ENOMEM);                                 \
        }                                                           \
    } while (0)
258 |
|
|
|
259 |
|
|
/** |
260 |
|
|
* Prepare filter for processing audio data of given format, |
261 |
|
|
* sample rate and number of channels. |
262 |
|
|
*/ |
263 |
|
✗ |
static int yae_reset(ATempoContext *atempo, |
264 |
|
|
enum AVSampleFormat format, |
265 |
|
|
int sample_rate, |
266 |
|
|
int channels) |
267 |
|
|
{ |
268 |
|
✗ |
const int sample_size = av_get_bytes_per_sample(format); |
269 |
|
✗ |
uint32_t nlevels = 0; |
270 |
|
✗ |
float scale = 1.f, iscale = 1.f; |
271 |
|
|
uint32_t pot; |
272 |
|
|
int i; |
273 |
|
|
|
274 |
|
✗ |
atempo->format = format; |
275 |
|
✗ |
atempo->channels = channels; |
276 |
|
✗ |
atempo->stride = sample_size * channels; |
277 |
|
|
|
278 |
|
|
// pick a segment window size: |
279 |
|
✗ |
atempo->window = sample_rate / 24; |
280 |
|
|
|
281 |
|
|
// adjust window size to be a power-of-two integer: |
282 |
|
✗ |
nlevels = av_log2(atempo->window); |
283 |
|
✗ |
pot = 1 << nlevels; |
284 |
|
✗ |
av_assert0(pot <= atempo->window); |
285 |
|
|
|
286 |
|
✗ |
if (pot < atempo->window) { |
287 |
|
✗ |
atempo->window = pot * 2; |
288 |
|
✗ |
nlevels++; |
289 |
|
|
} |
290 |
|
|
|
291 |
|
|
// initialize audio fragment buffers: |
292 |
|
✗ |
RE_MALLOC_OR_FAIL(atempo->frag[0].data, atempo->window, atempo->stride); |
293 |
|
✗ |
RE_MALLOC_OR_FAIL(atempo->frag[1].data, atempo->window, atempo->stride); |
294 |
|
✗ |
RE_MALLOC_OR_FAIL(atempo->frag[0].xdat_in, (atempo->window + 1), sizeof(AVComplexFloat)); |
295 |
|
✗ |
RE_MALLOC_OR_FAIL(atempo->frag[1].xdat_in, (atempo->window + 1), sizeof(AVComplexFloat)); |
296 |
|
✗ |
RE_MALLOC_OR_FAIL(atempo->frag[0].xdat, (atempo->window + 1), sizeof(AVComplexFloat)); |
297 |
|
✗ |
RE_MALLOC_OR_FAIL(atempo->frag[1].xdat, (atempo->window + 1), sizeof(AVComplexFloat)); |
298 |
|
|
|
299 |
|
|
// initialize rDFT contexts: |
300 |
|
✗ |
av_tx_uninit(&atempo->real_to_complex); |
301 |
|
✗ |
av_tx_uninit(&atempo->complex_to_real); |
302 |
|
|
|
303 |
|
✗ |
av_tx_init(&atempo->real_to_complex, &atempo->r2c_fn, AV_TX_FLOAT_RDFT, 0, 1 << (nlevels + 1), &scale, 0); |
304 |
|
✗ |
if (!atempo->real_to_complex) { |
305 |
|
✗ |
yae_release_buffers(atempo); |
306 |
|
✗ |
return AVERROR(ENOMEM); |
307 |
|
|
} |
308 |
|
|
|
309 |
|
✗ |
av_tx_init(&atempo->complex_to_real, &atempo->c2r_fn, AV_TX_FLOAT_RDFT, 1, 1 << (nlevels + 1), &iscale, 0); |
310 |
|
✗ |
if (!atempo->complex_to_real) { |
311 |
|
✗ |
yae_release_buffers(atempo); |
312 |
|
✗ |
return AVERROR(ENOMEM); |
313 |
|
|
} |
314 |
|
|
|
315 |
|
✗ |
RE_MALLOC_OR_FAIL(atempo->correlation_in, (atempo->window + 1), sizeof(AVComplexFloat)); |
316 |
|
✗ |
RE_MALLOC_OR_FAIL(atempo->correlation, atempo->window, sizeof(AVComplexFloat)); |
317 |
|
|
|
318 |
|
✗ |
atempo->ring = atempo->window * 3; |
319 |
|
✗ |
RE_MALLOC_OR_FAIL(atempo->buffer, atempo->ring, atempo->stride); |
320 |
|
|
|
321 |
|
|
// initialize the Hann window function: |
322 |
|
✗ |
RE_MALLOC_OR_FAIL(atempo->hann, atempo->window, sizeof(float)); |
323 |
|
|
|
324 |
|
✗ |
for (i = 0; i < atempo->window; i++) { |
325 |
|
✗ |
double t = (double)i / (double)(atempo->window - 1); |
326 |
|
✗ |
double h = 0.5 * (1.0 - cos(2.0 * M_PI * t)); |
327 |
|
✗ |
atempo->hann[i] = (float)h; |
328 |
|
|
} |
329 |
|
|
|
330 |
|
✗ |
yae_clear(atempo); |
331 |
|
✗ |
return 0; |
332 |
|
|
} |
333 |
|
|
|
334 |
|
✗ |
/**
 * Record a new origin: the input and output positions of the middle of
 * the previous fragment are stored in atempo->origin.
 *
 * @return always 0.
 */
static int yae_update(AVFilterContext *ctx)
{
    ATempoContext *atempo = ctx->priv;
    const AudioFragment *prev = yae_prev_frag(atempo);
    const int half_window = atempo->window / 2;
    int k;

    for (k = 0; k < 2; k++) {
        atempo->origin[k] = prev->position[k] + half_window;
    }

    return 0;
}
344 |
|
|
|
345 |
|
|
/**
 * Helper macro: fill frag->xdat_in with one float per sample, derived
 * from packed samples of the given scalar type.
 *
 * For a single channel the sample is copied as is. For several channels
 * the channel value with the largest magnitude is kept (with its sign),
 * where magnitudes are capped at scalar_max for the comparison.
 *
 * Expects 'atempo', 'frag' and a byte pointer 'src' in the enclosing
 * scope; 'src' is advanced past the consumed samples.
 */
#define yae_init_xdat(scalar_type, scalar_max)                              \
    do {                                                                    \
        const scalar_type *in = (const scalar_type *)src;                   \
        const scalar_type *in_end =                                         \
            in + frag->nsamples * atempo->channels;                         \
        float *mono = frag->xdat_in;                                        \
                                                                            \
        if (atempo->channels == 1) {                                        \
            while (in < in_end) {                                           \
                *mono++ = (float)*in++;                                     \
            }                                                               \
        } else {                                                            \
            while (in < in_end) {                                           \
                float loudest = (float)*in++;                               \
                float peak = FFMIN((float)(scalar_max),                     \
                                   (float)fabsf(loudest));                  \
                int ch;                                                     \
                                                                            \
                for (ch = 1; ch < atempo->channels; ch++) {                 \
                    const float cand = (float)*in++;                        \
                    const float mag = FFMIN((float)(scalar_max),            \
                                            (float)fabsf(cand));            \
                                                                            \
                    if (peak < mag) {                                       \
                        peak    = mag;                                      \
                        loudest = cand;                                     \
                    }                                                       \
                }                                                           \
                                                                            \
                *mono++ = loudest;                                          \
            }                                                               \
        }                                                                   \
                                                                            \
        src = (const uint8_t *)in;                                          \
    } while (0)
394 |
|
|
|
395 |
|
|
/** |
396 |
|
|
* Initialize complex data buffer of a given audio fragment |
397 |
|
|
* with down-mixed mono data of appropriate scalar type. |
398 |
|
|
*/ |
399 |
|
✗ |
static void yae_downmix(ATempoContext *atempo, AudioFragment *frag) |
400 |
|
|
{ |
401 |
|
|
// shortcuts: |
402 |
|
✗ |
const uint8_t *src = frag->data; |
403 |
|
|
|
404 |
|
|
// init complex data buffer used for FFT and Correlation: |
405 |
|
✗ |
memset(frag->xdat_in, 0, sizeof(AVComplexFloat) * (atempo->window + 1)); |
406 |
|
|
|
407 |
|
✗ |
if (atempo->format == AV_SAMPLE_FMT_U8) { |
408 |
|
✗ |
yae_init_xdat(uint8_t, 127); |
409 |
|
✗ |
} else if (atempo->format == AV_SAMPLE_FMT_S16) { |
410 |
|
✗ |
yae_init_xdat(int16_t, 32767); |
411 |
|
✗ |
} else if (atempo->format == AV_SAMPLE_FMT_S32) { |
412 |
|
✗ |
yae_init_xdat(int, 2147483647); |
413 |
|
✗ |
} else if (atempo->format == AV_SAMPLE_FMT_FLT) { |
414 |
|
✗ |
yae_init_xdat(float, 1); |
415 |
|
✗ |
} else if (atempo->format == AV_SAMPLE_FMT_DBL) { |
416 |
|
✗ |
yae_init_xdat(double, 1); |
417 |
|
|
} |
418 |
|
✗ |
} |
419 |
|
|
|
420 |
|
|
/** |
421 |
|
|
* Populate the internal data buffer on as-needed basis. |
422 |
|
|
* |
423 |
|
|
* @return |
424 |
|
|
* 0 if requested data was already available or was successfully loaded, |
425 |
|
|
* AVERROR(EAGAIN) if more input data is required. |
426 |
|
|
*/ |
427 |
|
✗ |
static int yae_load_data(ATempoContext *atempo, |
428 |
|
|
const uint8_t **src_ref, |
429 |
|
|
const uint8_t *src_end, |
430 |
|
|
int64_t stop_here) |
431 |
|
|
{ |
432 |
|
|
// shortcut: |
433 |
|
✗ |
const uint8_t *src = *src_ref; |
434 |
|
✗ |
const int read_size = stop_here - atempo->position[0]; |
435 |
|
|
|
436 |
|
✗ |
if (stop_here <= atempo->position[0]) { |
437 |
|
✗ |
return 0; |
438 |
|
|
} |
439 |
|
|
|
440 |
|
|
// samples are not expected to be skipped, unless tempo is greater than 2: |
441 |
|
✗ |
av_assert0(read_size <= atempo->ring || atempo->tempo > 2.0); |
442 |
|
|
|
443 |
|
✗ |
while (atempo->position[0] < stop_here && src < src_end) { |
444 |
|
✗ |
int src_samples = (src_end - src) / atempo->stride; |
445 |
|
|
|
446 |
|
|
// load data piece-wise, in order to avoid complicating the logic: |
447 |
|
✗ |
int nsamples = FFMIN(read_size, src_samples); |
448 |
|
|
int na; |
449 |
|
|
int nb; |
450 |
|
|
|
451 |
|
✗ |
nsamples = FFMIN(nsamples, atempo->ring); |
452 |
|
✗ |
na = FFMIN(nsamples, atempo->ring - atempo->tail); |
453 |
|
✗ |
nb = FFMIN(nsamples - na, atempo->ring); |
454 |
|
|
|
455 |
|
✗ |
if (na) { |
456 |
|
✗ |
uint8_t *a = atempo->buffer + atempo->tail * atempo->stride; |
457 |
|
✗ |
memcpy(a, src, na * atempo->stride); |
458 |
|
|
|
459 |
|
✗ |
src += na * atempo->stride; |
460 |
|
✗ |
atempo->position[0] += na; |
461 |
|
|
|
462 |
|
✗ |
atempo->size = FFMIN(atempo->size + na, atempo->ring); |
463 |
|
✗ |
atempo->tail = (atempo->tail + na) % atempo->ring; |
464 |
|
✗ |
atempo->head = |
465 |
|
✗ |
atempo->size < atempo->ring ? |
466 |
|
✗ |
atempo->tail - atempo->size : |
467 |
|
|
atempo->tail; |
468 |
|
|
} |
469 |
|
|
|
470 |
|
✗ |
if (nb) { |
471 |
|
✗ |
uint8_t *b = atempo->buffer; |
472 |
|
✗ |
memcpy(b, src, nb * atempo->stride); |
473 |
|
|
|
474 |
|
✗ |
src += nb * atempo->stride; |
475 |
|
✗ |
atempo->position[0] += nb; |
476 |
|
|
|
477 |
|
✗ |
atempo->size = FFMIN(atempo->size + nb, atempo->ring); |
478 |
|
✗ |
atempo->tail = (atempo->tail + nb) % atempo->ring; |
479 |
|
✗ |
atempo->head = |
480 |
|
✗ |
atempo->size < atempo->ring ? |
481 |
|
✗ |
atempo->tail - atempo->size : |
482 |
|
|
atempo->tail; |
483 |
|
|
} |
484 |
|
|
} |
485 |
|
|
|
486 |
|
|
// pass back the updated source buffer pointer: |
487 |
|
✗ |
*src_ref = src; |
488 |
|
|
|
489 |
|
|
// sanity check: |
490 |
|
✗ |
av_assert0(atempo->position[0] <= stop_here); |
491 |
|
|
|
492 |
|
✗ |
return atempo->position[0] == stop_here ? 0 : AVERROR(EAGAIN); |
493 |
|
|
} |
494 |
|
|
|
495 |
|
|
/** |
496 |
|
|
* Populate current audio fragment data buffer. |
497 |
|
|
* |
498 |
|
|
* @return |
499 |
|
|
* 0 when the fragment is ready, |
500 |
|
|
* AVERROR(EAGAIN) if more input data is required. |
501 |
|
|
*/ |
502 |
|
✗ |
static int yae_load_frag(ATempoContext *atempo, |
503 |
|
|
const uint8_t **src_ref, |
504 |
|
|
const uint8_t *src_end) |
505 |
|
|
{ |
506 |
|
|
// shortcuts: |
507 |
|
✗ |
AudioFragment *frag = yae_curr_frag(atempo); |
508 |
|
|
uint8_t *dst; |
509 |
|
|
int64_t missing, start, zeros; |
510 |
|
|
uint32_t nsamples; |
511 |
|
|
const uint8_t *a, *b; |
512 |
|
|
int i0, i1, n0, n1, na, nb; |
513 |
|
|
|
514 |
|
✗ |
int64_t stop_here = frag->position[0] + atempo->window; |
515 |
|
✗ |
if (src_ref && yae_load_data(atempo, src_ref, src_end, stop_here) != 0) { |
516 |
|
✗ |
return AVERROR(EAGAIN); |
517 |
|
|
} |
518 |
|
|
|
519 |
|
|
// calculate the number of samples we don't have: |
520 |
|
✗ |
missing = |
521 |
|
✗ |
stop_here > atempo->position[0] ? |
522 |
|
✗ |
stop_here - atempo->position[0] : 0; |
523 |
|
|
|
524 |
|
✗ |
nsamples = |
525 |
|
✗ |
missing < (int64_t)atempo->window ? |
526 |
|
✗ |
(uint32_t)(atempo->window - missing) : 0; |
527 |
|
|
|
528 |
|
|
// setup the output buffer: |
529 |
|
✗ |
frag->nsamples = nsamples; |
530 |
|
✗ |
dst = frag->data; |
531 |
|
|
|
532 |
|
✗ |
start = atempo->position[0] - atempo->size; |
533 |
|
|
|
534 |
|
|
// what we don't have we substitute with zeros: |
535 |
|
✗ |
zeros = |
536 |
|
✗ |
frag->position[0] < start ? |
537 |
|
✗ |
FFMIN(start - frag->position[0], (int64_t)nsamples) : 0; |
538 |
|
|
|
539 |
|
✗ |
if (zeros == nsamples) { |
540 |
|
✗ |
return 0; |
541 |
|
|
} |
542 |
|
|
|
543 |
|
✗ |
if (frag->position[0] < start) { |
544 |
|
✗ |
memset(dst, 0, zeros * atempo->stride); |
545 |
|
✗ |
dst += zeros * atempo->stride; |
546 |
|
|
} |
547 |
|
|
|
548 |
|
|
// get the remaining data from the ring buffer: |
549 |
|
✗ |
na = (atempo->head < atempo->tail ? |
550 |
|
✗ |
atempo->tail - atempo->head : |
551 |
|
✗ |
atempo->ring - atempo->head); |
552 |
|
|
|
553 |
|
✗ |
nb = atempo->head < atempo->tail ? 0 : atempo->tail; |
554 |
|
|
|
555 |
|
|
// sanity check: |
556 |
|
✗ |
av_assert0(nsamples <= zeros + na + nb); |
557 |
|
|
|
558 |
|
✗ |
a = atempo->buffer + atempo->head * atempo->stride; |
559 |
|
✗ |
b = atempo->buffer; |
560 |
|
|
|
561 |
|
✗ |
i0 = frag->position[0] + zeros - start; |
562 |
|
✗ |
i1 = i0 < na ? 0 : i0 - na; |
563 |
|
|
|
564 |
|
✗ |
n0 = i0 < na ? FFMIN(na - i0, (int)(nsamples - zeros)) : 0; |
565 |
|
✗ |
n1 = nsamples - zeros - n0; |
566 |
|
|
|
567 |
|
✗ |
if (n0) { |
568 |
|
✗ |
memcpy(dst, a + i0 * atempo->stride, n0 * atempo->stride); |
569 |
|
✗ |
dst += n0 * atempo->stride; |
570 |
|
|
} |
571 |
|
|
|
572 |
|
✗ |
if (n1) { |
573 |
|
✗ |
memcpy(dst, b + i1 * atempo->stride, n1 * atempo->stride); |
574 |
|
|
} |
575 |
|
|
|
576 |
|
✗ |
return 0; |
577 |
|
|
} |
578 |
|
|
|
579 |
|
|
/** |
580 |
|
|
* Prepare for loading next audio fragment. |
581 |
|
|
*/ |
582 |
|
✗ |
static void yae_advance_to_next_frag(ATempoContext *atempo) |
583 |
|
|
{ |
584 |
|
✗ |
const double fragment_step = atempo->tempo * (double)(atempo->window / 2); |
585 |
|
|
|
586 |
|
|
const AudioFragment *prev; |
587 |
|
|
AudioFragment *frag; |
588 |
|
|
|
589 |
|
✗ |
atempo->nfrag++; |
590 |
|
✗ |
prev = yae_prev_frag(atempo); |
591 |
|
✗ |
frag = yae_curr_frag(atempo); |
592 |
|
|
|
593 |
|
✗ |
frag->position[0] = prev->position[0] + (int64_t)fragment_step; |
594 |
|
✗ |
frag->position[1] = prev->position[1] + atempo->window / 2; |
595 |
|
✗ |
frag->nsamples = 0; |
596 |
|
✗ |
} |
597 |
|
|
|
598 |
|
|
/** |
599 |
|
|
* Calculate cross-correlation via rDFT. |
600 |
|
|
* |
601 |
|
|
* Multiply two vectors of complex numbers (result of real_to_complex rDFT) |
602 |
|
|
* and transform back via complex_to_real rDFT. |
603 |
|
|
*/ |
604 |
|
✗ |
static void yae_xcorr_via_rdft(float *xcorr_in, |
605 |
|
|
float *xcorr, |
606 |
|
|
AVTXContext *complex_to_real, |
607 |
|
|
av_tx_fn c2r_fn, |
608 |
|
|
const AVComplexFloat *xa, |
609 |
|
|
const AVComplexFloat *xb, |
610 |
|
|
const int window) |
611 |
|
|
{ |
612 |
|
✗ |
AVComplexFloat *xc = (AVComplexFloat *)xcorr_in; |
613 |
|
|
int i; |
614 |
|
|
|
615 |
|
✗ |
for (i = 0; i <= window; i++, xa++, xb++, xc++) { |
616 |
|
✗ |
xc->re = (xa->re * xb->re + xa->im * xb->im); |
617 |
|
✗ |
xc->im = (xa->im * xb->re - xa->re * xb->im); |
618 |
|
|
} |
619 |
|
|
|
620 |
|
|
// apply inverse rDFT: |
621 |
|
✗ |
c2r_fn(complex_to_real, xcorr, xcorr_in, sizeof(*xc)); |
622 |
|
✗ |
} |
623 |
|
|
|
624 |
|
|
/** |
625 |
|
|
* Calculate alignment offset for given fragment |
626 |
|
|
* relative to the previous fragment. |
627 |
|
|
* |
628 |
|
|
* @return alignment offset of current fragment relative to previous. |
629 |
|
|
*/ |
630 |
|
✗ |
static int yae_align(AudioFragment *frag, |
631 |
|
|
const AudioFragment *prev, |
632 |
|
|
const int window, |
633 |
|
|
const int delta_max, |
634 |
|
|
const int drift, |
635 |
|
|
float *correlation_in, |
636 |
|
|
float *correlation, |
637 |
|
|
AVTXContext *complex_to_real, |
638 |
|
|
av_tx_fn c2r_fn) |
639 |
|
|
{ |
640 |
|
✗ |
int best_offset = -drift; |
641 |
|
✗ |
float best_metric = -FLT_MAX; |
642 |
|
|
float *xcorr; |
643 |
|
|
|
644 |
|
|
int i0; |
645 |
|
|
int i1; |
646 |
|
|
int i; |
647 |
|
|
|
648 |
|
✗ |
yae_xcorr_via_rdft(correlation_in, |
649 |
|
|
correlation, |
650 |
|
|
complex_to_real, |
651 |
|
|
c2r_fn, |
652 |
|
✗ |
(const AVComplexFloat *)prev->xdat, |
653 |
|
✗ |
(const AVComplexFloat *)frag->xdat, |
654 |
|
|
window); |
655 |
|
|
|
656 |
|
|
// identify search window boundaries: |
657 |
|
✗ |
i0 = FFMAX(window / 2 - delta_max - drift, 0); |
658 |
|
✗ |
i0 = FFMIN(i0, window); |
659 |
|
|
|
660 |
|
✗ |
i1 = FFMIN(window / 2 + delta_max - drift, window - window / 16); |
661 |
|
✗ |
i1 = FFMAX(i1, 0); |
662 |
|
|
|
663 |
|
|
// identify cross-correlation peaks within search window: |
664 |
|
✗ |
xcorr = correlation + i0; |
665 |
|
|
|
666 |
|
✗ |
for (i = i0; i < i1; i++, xcorr++) { |
667 |
|
✗ |
float metric = *xcorr; |
668 |
|
|
|
669 |
|
|
// normalize: |
670 |
|
✗ |
float drifti = (float)(drift + i); |
671 |
|
✗ |
metric *= drifti * (float)(i - i0) * (float)(i1 - i); |
672 |
|
|
|
673 |
|
✗ |
if (metric > best_metric) { |
674 |
|
✗ |
best_metric = metric; |
675 |
|
✗ |
best_offset = i - window / 2; |
676 |
|
|
} |
677 |
|
|
} |
678 |
|
|
|
679 |
|
✗ |
return best_offset; |
680 |
|
|
} |
681 |
|
|
|
682 |
|
|
/** |
683 |
|
|
* Adjust current fragment position for better alignment |
684 |
|
|
* with previous fragment. |
685 |
|
|
* |
686 |
|
|
* @return alignment correction. |
687 |
|
|
*/ |
688 |
|
✗ |
static int yae_adjust_position(ATempoContext *atempo) |
689 |
|
|
{ |
690 |
|
✗ |
const AudioFragment *prev = yae_prev_frag(atempo); |
691 |
|
✗ |
AudioFragment *frag = yae_curr_frag(atempo); |
692 |
|
|
|
693 |
|
✗ |
const double prev_output_position = |
694 |
|
✗ |
(double)(prev->position[1] - atempo->origin[1] + atempo->window / 2) * |
695 |
|
✗ |
atempo->tempo; |
696 |
|
|
|
697 |
|
✗ |
const double ideal_output_position = |
698 |
|
✗ |
(double)(prev->position[0] - atempo->origin[0] + atempo->window / 2); |
699 |
|
|
|
700 |
|
✗ |
const int drift = (int)(prev_output_position - ideal_output_position); |
701 |
|
|
|
702 |
|
✗ |
const int delta_max = atempo->window / 2; |
703 |
|
✗ |
const int correction = yae_align(frag, |
704 |
|
|
prev, |
705 |
|
|
atempo->window, |
706 |
|
|
delta_max, |
707 |
|
|
drift, |
708 |
|
|
atempo->correlation_in, |
709 |
|
|
atempo->correlation, |
710 |
|
|
atempo->complex_to_real, |
711 |
|
|
atempo->c2r_fn); |
712 |
|
|
|
713 |
|
✗ |
if (correction) { |
714 |
|
|
// adjust fragment position: |
715 |
|
✗ |
frag->position[0] -= correction; |
716 |
|
|
|
717 |
|
|
// clear so that the fragment can be reloaded: |
718 |
|
✗ |
frag->nsamples = 0; |
719 |
|
|
} |
720 |
|
|
|
721 |
|
✗ |
return correction; |
722 |
|
|
} |
723 |
|
|
|
724 |
|
|
/**
 * Helper macro: blend the overlap region of the previous and the
 * current audio fragment, for samples of the given scalar type.
 *
 * Every output sample is the weighted sum of both fragments, except
 * where the input position of the current fragment is still negative;
 * there the previous fragment is copied through unchanged.
 *
 * Expects 'atempo', 'frag', 'a', 'b', 'wa', 'wb', 'overlap', 'dst' and
 * 'dst_end' in the enclosing scope; advances 'wa', 'wb', 'dst' and
 * atempo->position[1], and stops early when the destination is full.
 */
#define yae_blend(scalar_type)                                              \
    do {                                                                    \
        const scalar_type *aaa = (const scalar_type *)a;                    \
        const scalar_type *bbb = (const scalar_type *)b;                    \
                                                                            \
        scalar_type *out     = (scalar_type *)dst;                          \
        scalar_type *out_end = (scalar_type *)dst_end;                      \
        int64_t n;                                                          \
                                                                            \
        for (n = 0; n < overlap && out < out_end; n++) {                    \
            const float w_prev = *wa++;                                     \
            const float w_curr = *wb++;                                     \
            const int passthrough = frag->position[0] + n < 0;              \
            int ch;                                                         \
                                                                            \
            for (ch = 0; ch < atempo->channels; ch++) {                     \
                const float t_prev = (float)*aaa;                           \
                const float t_curr = (float)*bbb;                           \
                                                                            \
                if (passthrough) {                                          \
                    *out = *aaa;                                            \
                } else {                                                    \
                    *out = (scalar_type)(t_prev * w_prev +                  \
                                         t_curr * w_curr);                  \
                }                                                           \
                                                                            \
                aaa++;                                                      \
                bbb++;                                                      \
                out++;                                                      \
            }                                                               \
                                                                            \
            atempo->position[1]++;                                          \
        }                                                                   \
                                                                            \
        dst = (uint8_t *)out;                                               \
    } while (0)
756 |
|
|
|
757 |
|
|
/** |
758 |
|
|
* Blend the overlap region of previous and current audio fragment |
759 |
|
|
* and output the results to the given destination buffer. |
760 |
|
|
* |
761 |
|
|
* @return |
762 |
|
|
* 0 if the overlap region was completely stored in the dst buffer, |
763 |
|
|
* AVERROR(EAGAIN) if more destination buffer space is required. |
764 |
|
|
*/ |
765 |
|
✗ |
static int yae_overlap_add(ATempoContext *atempo, |
766 |
|
|
uint8_t **dst_ref, |
767 |
|
|
uint8_t *dst_end) |
768 |
|
|
{ |
769 |
|
|
// shortcuts: |
770 |
|
✗ |
const AudioFragment *prev = yae_prev_frag(atempo); |
771 |
|
✗ |
const AudioFragment *frag = yae_curr_frag(atempo); |
772 |
|
|
|
773 |
|
✗ |
const int64_t start_here = FFMAX(atempo->position[1], |
774 |
|
|
frag->position[1]); |
775 |
|
|
|
776 |
|
✗ |
const int64_t stop_here = FFMIN(prev->position[1] + prev->nsamples, |
777 |
|
|
frag->position[1] + frag->nsamples); |
778 |
|
|
|
779 |
|
✗ |
const int64_t overlap = stop_here - start_here; |
780 |
|
|
|
781 |
|
✗ |
const int64_t ia = start_here - prev->position[1]; |
782 |
|
✗ |
const int64_t ib = start_here - frag->position[1]; |
783 |
|
|
|
784 |
|
✗ |
const float *wa = atempo->hann + ia; |
785 |
|
✗ |
const float *wb = atempo->hann + ib; |
786 |
|
|
|
787 |
|
✗ |
const uint8_t *a = prev->data + ia * atempo->stride; |
788 |
|
✗ |
const uint8_t *b = frag->data + ib * atempo->stride; |
789 |
|
|
|
790 |
|
✗ |
uint8_t *dst = *dst_ref; |
791 |
|
|
|
792 |
|
✗ |
av_assert0(start_here <= stop_here && |
793 |
|
|
frag->position[1] <= start_here && |
794 |
|
|
overlap <= frag->nsamples); |
795 |
|
|
|
796 |
|
✗ |
if (atempo->format == AV_SAMPLE_FMT_U8) { |
797 |
|
✗ |
yae_blend(uint8_t); |
798 |
|
✗ |
} else if (atempo->format == AV_SAMPLE_FMT_S16) { |
799 |
|
✗ |
yae_blend(int16_t); |
800 |
|
✗ |
} else if (atempo->format == AV_SAMPLE_FMT_S32) { |
801 |
|
✗ |
yae_blend(int); |
802 |
|
✗ |
} else if (atempo->format == AV_SAMPLE_FMT_FLT) { |
803 |
|
✗ |
yae_blend(float); |
804 |
|
✗ |
} else if (atempo->format == AV_SAMPLE_FMT_DBL) { |
805 |
|
✗ |
yae_blend(double); |
806 |
|
|
} |
807 |
|
|
|
808 |
|
|
// pass-back the updated destination buffer pointer: |
809 |
|
✗ |
*dst_ref = dst; |
810 |
|
|
|
811 |
|
✗ |
return atempo->position[1] == stop_here ? 0 : AVERROR(EAGAIN); |
812 |
|
|
} |
813 |
|
|
|
814 |
|
|
/** |
815 |
|
|
* Feed as much data to the filter as it is able to consume |
816 |
|
|
* and receive as much processed data in the destination buffer |
817 |
|
|
* as it is able to produce or store. |
818 |
|
|
*/ |
819 |
|
|
static void |
820 |
|
✗ |
yae_apply(ATempoContext *atempo, |
821 |
|
|
const uint8_t **src_ref, |
822 |
|
|
const uint8_t *src_end, |
823 |
|
|
uint8_t **dst_ref, |
824 |
|
|
uint8_t *dst_end) |
825 |
|
|
{ |
826 |
|
|
while (1) { |
827 |
|
✗ |
if (atempo->state == YAE_LOAD_FRAGMENT) { |
828 |
|
|
// load additional data for the current fragment: |
829 |
|
✗ |
if (yae_load_frag(atempo, src_ref, src_end) != 0) { |
830 |
|
✗ |
break; |
831 |
|
|
} |
832 |
|
|
|
833 |
|
|
// down-mix to mono: |
834 |
|
✗ |
yae_downmix(atempo, yae_curr_frag(atempo)); |
835 |
|
|
|
836 |
|
|
// apply rDFT: |
837 |
|
✗ |
atempo->r2c_fn(atempo->real_to_complex, yae_curr_frag(atempo)->xdat, yae_curr_frag(atempo)->xdat_in, sizeof(float)); |
838 |
|
|
|
839 |
|
|
// must load the second fragment before alignment can start: |
840 |
|
✗ |
if (!atempo->nfrag) { |
841 |
|
✗ |
yae_advance_to_next_frag(atempo); |
842 |
|
✗ |
continue; |
843 |
|
|
} |
844 |
|
|
|
845 |
|
✗ |
atempo->state = YAE_ADJUST_POSITION; |
846 |
|
|
} |
847 |
|
|
|
848 |
|
✗ |
if (atempo->state == YAE_ADJUST_POSITION) { |
849 |
|
|
// adjust position for better alignment: |
850 |
|
✗ |
if (yae_adjust_position(atempo)) { |
851 |
|
|
// reload the fragment at the corrected position, so that the |
852 |
|
|
// Hann window blending would not require normalization: |
853 |
|
✗ |
atempo->state = YAE_RELOAD_FRAGMENT; |
854 |
|
|
} else { |
855 |
|
✗ |
atempo->state = YAE_OUTPUT_OVERLAP_ADD; |
856 |
|
|
} |
857 |
|
|
} |
858 |
|
|
|
859 |
|
✗ |
if (atempo->state == YAE_RELOAD_FRAGMENT) { |
860 |
|
|
// load additional data if necessary due to position adjustment: |
861 |
|
✗ |
if (yae_load_frag(atempo, src_ref, src_end) != 0) { |
862 |
|
✗ |
break; |
863 |
|
|
} |
864 |
|
|
|
865 |
|
|
// down-mix to mono: |
866 |
|
✗ |
yae_downmix(atempo, yae_curr_frag(atempo)); |
867 |
|
|
|
868 |
|
|
// apply rDFT: |
869 |
|
✗ |
atempo->r2c_fn(atempo->real_to_complex, yae_curr_frag(atempo)->xdat, yae_curr_frag(atempo)->xdat_in, sizeof(float)); |
870 |
|
|
|
871 |
|
✗ |
atempo->state = YAE_OUTPUT_OVERLAP_ADD; |
872 |
|
|
} |
873 |
|
|
|
874 |
|
✗ |
if (atempo->state == YAE_OUTPUT_OVERLAP_ADD) { |
875 |
|
|
// overlap-add and output the result: |
876 |
|
✗ |
if (yae_overlap_add(atempo, dst_ref, dst_end) != 0) { |
877 |
|
✗ |
break; |
878 |
|
|
} |
879 |
|
|
|
880 |
|
|
// advance to the next fragment, repeat: |
881 |
|
✗ |
yae_advance_to_next_frag(atempo); |
882 |
|
✗ |
atempo->state = YAE_LOAD_FRAGMENT; |
883 |
|
|
} |
884 |
|
|
} |
885 |
|
✗ |
} |
886 |
|
|
|
887 |
|
|
/** |
888 |
|
|
* Flush any buffered data from the filter. |
889 |
|
|
* |
890 |
|
|
* @return |
891 |
|
|
* 0 if all data was completely stored in the dst buffer, |
892 |
|
|
* AVERROR(EAGAIN) if more destination buffer space is required. |
893 |
|
|
*/ |
894 |
|
✗ |
static int yae_flush(ATempoContext *atempo, |
895 |
|
|
uint8_t **dst_ref, |
896 |
|
|
uint8_t *dst_end) |
897 |
|
|
{ |
898 |
|
✗ |
AudioFragment *frag = yae_curr_frag(atempo); |
899 |
|
|
int64_t overlap_end; |
900 |
|
|
int64_t start_here; |
901 |
|
|
int64_t stop_here; |
902 |
|
|
int64_t offset; |
903 |
|
|
|
904 |
|
|
const uint8_t *src; |
905 |
|
|
uint8_t *dst; |
906 |
|
|
|
907 |
|
|
int src_size; |
908 |
|
|
int dst_size; |
909 |
|
|
int nbytes; |
910 |
|
|
|
911 |
|
✗ |
atempo->state = YAE_FLUSH_OUTPUT; |
912 |
|
|
|
913 |
|
✗ |
if (!atempo->nfrag) { |
914 |
|
|
// there is nothing to flush: |
915 |
|
✗ |
return 0; |
916 |
|
|
} |
917 |
|
|
|
918 |
|
✗ |
if (atempo->position[0] == frag->position[0] + frag->nsamples && |
919 |
|
✗ |
atempo->position[1] == frag->position[1] + frag->nsamples) { |
920 |
|
|
// the current fragment is already flushed: |
921 |
|
✗ |
return 0; |
922 |
|
|
} |
923 |
|
|
|
924 |
|
✗ |
if (frag->position[0] + frag->nsamples < atempo->position[0]) { |
925 |
|
|
// finish loading the current (possibly partial) fragment: |
926 |
|
✗ |
yae_load_frag(atempo, NULL, NULL); |
927 |
|
|
|
928 |
|
✗ |
if (atempo->nfrag) { |
929 |
|
|
// down-mix to mono: |
930 |
|
✗ |
yae_downmix(atempo, frag); |
931 |
|
|
|
932 |
|
|
// apply rDFT: |
933 |
|
✗ |
atempo->r2c_fn(atempo->real_to_complex, frag->xdat, frag->xdat_in, sizeof(float)); |
934 |
|
|
|
935 |
|
|
// align current fragment to previous fragment: |
936 |
|
✗ |
if (yae_adjust_position(atempo)) { |
937 |
|
|
// reload the current fragment due to adjusted position: |
938 |
|
✗ |
yae_load_frag(atempo, NULL, NULL); |
939 |
|
|
} |
940 |
|
|
} |
941 |
|
|
} |
942 |
|
|
|
943 |
|
|
// flush the overlap region: |
944 |
|
✗ |
overlap_end = frag->position[1] + FFMIN(atempo->window / 2, |
945 |
|
|
frag->nsamples); |
946 |
|
|
|
947 |
|
✗ |
while (atempo->position[1] < overlap_end) { |
948 |
|
✗ |
if (yae_overlap_add(atempo, dst_ref, dst_end) != 0) { |
949 |
|
✗ |
return AVERROR(EAGAIN); |
950 |
|
|
} |
951 |
|
|
} |
952 |
|
|
|
953 |
|
|
// check whether all of the input samples have been consumed: |
954 |
|
✗ |
if (frag->position[0] + frag->nsamples < atempo->position[0]) { |
955 |
|
✗ |
yae_advance_to_next_frag(atempo); |
956 |
|
✗ |
return AVERROR(EAGAIN); |
957 |
|
|
} |
958 |
|
|
|
959 |
|
|
// flush the remainder of the current fragment: |
960 |
|
✗ |
start_here = FFMAX(atempo->position[1], overlap_end); |
961 |
|
✗ |
stop_here = frag->position[1] + frag->nsamples; |
962 |
|
✗ |
offset = start_here - frag->position[1]; |
963 |
|
✗ |
av_assert0(start_here <= stop_here && frag->position[1] <= start_here); |
964 |
|
|
|
965 |
|
✗ |
src = frag->data + offset * atempo->stride; |
966 |
|
✗ |
dst = (uint8_t *)*dst_ref; |
967 |
|
|
|
968 |
|
✗ |
src_size = (int)(stop_here - start_here) * atempo->stride; |
969 |
|
✗ |
dst_size = dst_end - dst; |
970 |
|
✗ |
nbytes = FFMIN(src_size, dst_size); |
971 |
|
|
|
972 |
|
✗ |
memcpy(dst, src, nbytes); |
973 |
|
✗ |
dst += nbytes; |
974 |
|
|
|
975 |
|
✗ |
atempo->position[1] += (nbytes / atempo->stride); |
976 |
|
|
|
977 |
|
|
// pass-back the updated destination buffer pointer: |
978 |
|
✗ |
*dst_ref = (uint8_t *)dst; |
979 |
|
|
|
980 |
|
✗ |
return atempo->position[1] == stop_here ? 0 : AVERROR(EAGAIN); |
981 |
|
|
} |
982 |
|
|
|
983 |
|
✗ |
/**
 * Filter init callback: mark the sample format as not yet known and
 * put the state machine at its first step. Always returns 0.
 */
static av_cold int init(AVFilterContext *ctx)
{
    ATempoContext *s = ctx->priv;

    s->format = AV_SAMPLE_FMT_NONE;
    s->state  = YAE_LOAD_FRAGMENT;

    return 0;
}
990 |
|
|
|
991 |
|
✗ |
/**
 * Filter uninit callback: hand the private context to
 * yae_release_buffers().
 */
static av_cold void uninit(AVFilterContext *ctx)
{
    yae_release_buffers(ctx->priv);
}
996 |
|
|
|
997 |
|
|
// WSOLA necessitates an internal sliding window ring buffer |
998 |
|
|
// for incoming audio stream. |
999 |
|
|
// |
1000 |
|
|
// Planar sample formats are too cumbersome to store in a ring buffer, |
1001 |
|
|
// therefore planar sample formats are not supported. |
1002 |
|
|
// |
1003 |
|
|
static const enum AVSampleFormat sample_fmts[] = { |
1004 |
|
|
AV_SAMPLE_FMT_U8, |
1005 |
|
|
AV_SAMPLE_FMT_S16, |
1006 |
|
|
AV_SAMPLE_FMT_S32, |
1007 |
|
|
AV_SAMPLE_FMT_FLT, |
1008 |
|
|
AV_SAMPLE_FMT_DBL, |
1009 |
|
|
AV_SAMPLE_FMT_NONE |
1010 |
|
|
}; |
1011 |
|
|
|
1012 |
|
✗ |
/**
 * Input link configuration callback: reset the filter state for the
 * negotiated sample format, sample rate and channel count of the link.
 *
 * @return the result of yae_reset()
 */
static int config_props(AVFilterLink *inlink)
{
    ATempoContext *atempo = inlink->dst->priv;
    enum AVSampleFormat fmt = inlink->format;

    return yae_reset(atempo, fmt, (int)inlink->sample_rate,
                     inlink->ch_layout.nb_channels);
}
1022 |
|
|
|
1023 |
|
✗ |
static int push_samples(ATempoContext *atempo, |
1024 |
|
|
AVFilterLink *outlink, |
1025 |
|
|
int n_out) |
1026 |
|
|
{ |
1027 |
|
|
int ret; |
1028 |
|
|
|
1029 |
|
✗ |
atempo->dst_buffer->sample_rate = outlink->sample_rate; |
1030 |
|
✗ |
atempo->dst_buffer->nb_samples = n_out; |
1031 |
|
|
|
1032 |
|
|
// adjust the PTS: |
1033 |
|
✗ |
atempo->dst_buffer->pts = atempo->start_pts + |
1034 |
|
✗ |
av_rescale_q(atempo->nsamples_out, |
1035 |
|
✗ |
(AVRational){ 1, outlink->sample_rate }, |
1036 |
|
|
outlink->time_base); |
1037 |
|
|
|
1038 |
|
✗ |
ret = ff_filter_frame(outlink, atempo->dst_buffer); |
1039 |
|
✗ |
atempo->dst_buffer = NULL; |
1040 |
|
✗ |
atempo->dst = NULL; |
1041 |
|
✗ |
atempo->dst_end = NULL; |
1042 |
|
✗ |
if (ret < 0) |
1043 |
|
✗ |
return ret; |
1044 |
|
|
|
1045 |
|
✗ |
atempo->nsamples_out += n_out; |
1046 |
|
✗ |
return 0; |
1047 |
|
|
} |
1048 |
|
|
|
1049 |
|
✗ |
/**
 * Input pad callback: run one input frame through the tempo filter.
 *
 * Output frames are sized proportionally to the input frame
 * (nb_samples / tempo, rounded) and are pushed downstream each time one
 * becomes full. A partially filled output frame is kept in the context
 * and completed by subsequent calls.
 *
 * @param inlink     link the frame arrived on
 * @param src_buffer input frame; always freed before returning
 *
 * @return 0 on success, a negative AVERROR code on failure
 */
static int filter_frame(AVFilterLink *inlink, AVFrame *src_buffer)
{
    AVFilterContext  *ctx = inlink->dst;
    ATempoContext *atempo = ctx->priv;
    AVFilterLink *outlink = ctx->outputs[0];

    int ret = 0;
    int n_in = src_buffer->nb_samples;

    // Short frames combined with a high tempo round down to zero samples
    // (e.g. 1 input sample at tempo > 2). Nothing could ever be written
    // into a zero-sample buffer, so always request room for at least one:
    int n_out = FFMAX(1, (int)(0.5 + ((double)n_in) / atempo->tempo));

    const uint8_t *src = src_buffer->data[0];
    const uint8_t *src_end = src + n_in * atempo->stride;

    // the first frame seen defines the origin of the output timeline:
    if (atempo->start_pts == AV_NOPTS_VALUE)
        atempo->start_pts = av_rescale_q(src_buffer->pts,
                                         inlink->time_base,
                                         outlink->time_base);

    while (src < src_end) {
        if (!atempo->dst_buffer) {
            atempo->dst_buffer = ff_get_audio_buffer(outlink, n_out);
            if (!atempo->dst_buffer) {
                ret = AVERROR(ENOMEM);
                goto end;
            }

            ret = av_frame_copy_props(atempo->dst_buffer, src_buffer);
            if (ret < 0) {
                // dst/dst_end have not been pointed at this buffer yet,
                // releasing the frame is all the cleanup required:
                av_frame_free(&atempo->dst_buffer);
                goto end;
            }

            atempo->dst = atempo->dst_buffer->data[0];
            atempo->dst_end = atempo->dst + n_out * atempo->stride;
        }

        yae_apply(atempo, &src, src_end, &atempo->dst, atempo->dst_end);

        // the output frame is full, send it downstream:
        if (atempo->dst == atempo->dst_end) {
            int n_samples = ((atempo->dst - atempo->dst_buffer->data[0]) /
                             atempo->stride);
            ret = push_samples(atempo, outlink, n_samples);
            if (ret < 0)
                goto end;
        }
    }

    atempo->nsamples_in += n_in;
    ret = 0;
end:
    av_frame_free(&src_buffer);
    return ret;
}
1096 |
|
|
|
1097 |
|
✗ |
/**
 * Output pad callback: request more input and, once the input reports
 * end of stream, flush whatever the filter still holds.
 *
 * @return the ff_request_frame() result while input is still available;
 *         AVERROR_EOF after a completed flush;
 *         AVERROR(ENOMEM) or a push_samples() error if flushing fails
 */
static int request_frame(AVFilterLink *outlink)
{
    AVFilterContext *ctx = outlink->src;
    ATempoContext *atempo = ctx->priv;
    int capacity;
    int flushed;
    int status;
    int ret = ff_request_frame(ctx->inputs[0]);

    if (ret != AVERROR_EOF)
        return ret;

    // flush the filter, using output frames as large as the ring size:
    capacity = atempo->ring;

    do {
        if (!atempo->dst_buffer) {
            atempo->dst_buffer = ff_get_audio_buffer(outlink, capacity);
            if (!atempo->dst_buffer)
                return AVERROR(ENOMEM);

            atempo->dst = atempo->dst_buffer->data[0];
            atempo->dst_end = atempo->dst + capacity * atempo->stride;
        }

        status = yae_flush(atempo, &atempo->dst, atempo->dst_end);

        // number of whole samples written into the frame so far:
        flushed = ((atempo->dst - atempo->dst_buffer->data[0]) /
                   atempo->stride);

        if (flushed) {
            ret = push_samples(atempo, outlink, flushed);
            if (ret < 0)
                return ret;
        }
    } while (status == AVERROR(EAGAIN));

    // drop a frame that was allocated but ended up with no samples:
    av_frame_free(&atempo->dst_buffer);
    atempo->dst = NULL;
    atempo->dst_end = NULL;

    return AVERROR_EOF;
}
1142 |
|
|
|
1143 |
|
✗ |
/**
 * Runtime command callback: let the generic handler process the command
 * and, when it did not fail, bring the filter up to date via yae_update().
 *
 * @return the negative result of ff_filter_process_command() on failure,
 *         otherwise the result of yae_update()
 */
static int process_command(AVFilterContext *ctx,
                           const char *cmd,
                           const char *arg,
                           char *res,
                           int res_len,
                           int flags)
{
    int ret = ff_filter_process_command(ctx, cmd, arg, res, res_len, flags);

    return ret < 0 ? ret : yae_update(ctx);
}
1157 |
|
|
|
1158 |
|
|
static const AVFilterPad atempo_inputs[] = { |
1159 |
|
|
{ |
1160 |
|
|
.name = "default", |
1161 |
|
|
.type = AVMEDIA_TYPE_AUDIO, |
1162 |
|
|
.filter_frame = filter_frame, |
1163 |
|
|
.config_props = config_props, |
1164 |
|
|
}, |
1165 |
|
|
}; |
1166 |
|
|
|
1167 |
|
|
static const AVFilterPad atempo_outputs[] = { |
1168 |
|
|
{ |
1169 |
|
|
.name = "default", |
1170 |
|
|
.request_frame = request_frame, |
1171 |
|
|
.type = AVMEDIA_TYPE_AUDIO, |
1172 |
|
|
}, |
1173 |
|
|
}; |
1174 |
|
|
|
1175 |
|
|
const AVFilter ff_af_atempo = { |
1176 |
|
|
.name = "atempo", |
1177 |
|
|
.description = NULL_IF_CONFIG_SMALL("Adjust audio tempo."), |
1178 |
|
|
.init = init, |
1179 |
|
|
.uninit = uninit, |
1180 |
|
|
.process_command = process_command, |
1181 |
|
|
.priv_size = sizeof(ATempoContext), |
1182 |
|
|
.priv_class = &atempo_class, |
1183 |
|
|
FILTER_INPUTS(atempo_inputs), |
1184 |
|
|
FILTER_OUTPUTS(atempo_outputs), |
1185 |
|
|
FILTER_SAMPLEFMTS_ARRAY(sample_fmts), |
1186 |
|
|
}; |
1187 |
|
|
|