Line |
Branch |
Exec |
Source |
1 |
|
|
/* |
2 |
|
|
* Copyright (c) 2014 - 2021 Jason Jang |
3 |
|
|
* Copyright (c) 2021 Paul B Mahol |
4 |
|
|
* |
5 |
|
|
* This file is part of FFmpeg. |
6 |
|
|
* |
7 |
|
|
* FFmpeg is free software; you can redistribute it and/or |
8 |
|
|
* modify it under the terms of the GNU Lesser General Public License |
9 |
|
|
* as published by the Free Software Foundation; either |
10 |
|
|
* version 2.1 of the License, or (at your option) any later version. |
11 |
|
|
* |
12 |
|
|
* FFmpeg is distributed in the hope that it will be useful, |
13 |
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 |
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 |
|
|
* GNU Lesser General Public License for more details. |
16 |
|
|
* |
17 |
|
|
* You should have received a copy of the GNU Lesser General Public License |
18 |
|
|
* along with FFmpeg; if not, write to the Free Software Foundation, Inc., |
19 |
|
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
20 |
|
|
*/ |
21 |
|
|
|
22 |
|
|
#include "libavutil/mem.h" |
23 |
|
|
#include "libavutil/opt.h" |
24 |
|
|
#include "libavutil/tx.h" |
25 |
|
|
#include "audio.h" |
26 |
|
|
#include "avfilter.h" |
27 |
|
|
#include "filters.h" |
28 |
|
|
|
29 |
|
|
typedef struct AudioPsyClipContext { |
30 |
|
|
const AVClass *class; |
31 |
|
|
|
32 |
|
|
double level_in; |
33 |
|
|
double level_out; |
34 |
|
|
double clip_level; |
35 |
|
|
double adaptive; |
36 |
|
|
int auto_level; |
37 |
|
|
int diff_only; |
38 |
|
|
int iterations; |
39 |
|
|
char *protections_str; |
40 |
|
|
double *protections; |
41 |
|
|
|
42 |
|
|
int num_psy_bins; |
43 |
|
|
int fft_size; |
44 |
|
|
int overlap; |
45 |
|
|
int channels; |
46 |
|
|
|
47 |
|
|
int spread_table_rows; |
48 |
|
|
int *spread_table_index; |
49 |
|
|
int (*spread_table_range)[2]; |
50 |
|
|
float *window, *inv_window, *spread_table, *margin_curve; |
51 |
|
|
|
52 |
|
|
AVFrame *in; |
53 |
|
|
AVFrame *in_buffer; |
54 |
|
|
AVFrame *in_frame; |
55 |
|
|
AVFrame *out_dist_frame; |
56 |
|
|
AVFrame *windowed_frame; |
57 |
|
|
AVFrame *clipping_delta; |
58 |
|
|
AVFrame *spectrum_buf; |
59 |
|
|
AVFrame *mask_curve; |
60 |
|
|
|
61 |
|
|
AVTXContext **tx_ctx; |
62 |
|
|
av_tx_fn tx_fn; |
63 |
|
|
AVTXContext **itx_ctx; |
64 |
|
|
av_tx_fn itx_fn; |
65 |
|
|
} AudioPsyClipContext; |
66 |
|
|
|
67 |
|
|
#define OFFSET(x) offsetof(AudioPsyClipContext, x) |
68 |
|
|
#define FLAGS AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_RUNTIME_PARAM |
69 |
|
|
|
70 |
|
|
static const AVOption apsyclip_options[] = { |
71 |
|
|
{ "level_in", "set input level", OFFSET(level_in), AV_OPT_TYPE_DOUBLE, {.dbl=1},.015625, 64, FLAGS }, |
72 |
|
|
{ "level_out", "set output level", OFFSET(level_out), AV_OPT_TYPE_DOUBLE, {.dbl=1},.015625, 64, FLAGS }, |
73 |
|
|
{ "clip", "set clip level", OFFSET(clip_level), AV_OPT_TYPE_DOUBLE, {.dbl=1},.015625, 1, FLAGS }, |
74 |
|
|
{ "diff", "enable difference", OFFSET(diff_only), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS }, |
75 |
|
|
{ "adaptive", "set adaptive distortion", OFFSET(adaptive), AV_OPT_TYPE_DOUBLE, {.dbl=0.5}, 0, 1, FLAGS }, |
76 |
|
|
{ "iterations", "set iterations", OFFSET(iterations), AV_OPT_TYPE_INT, {.i64=10}, 1, 20, FLAGS }, |
77 |
|
|
{ "level", "set auto level", OFFSET(auto_level), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS }, |
78 |
|
|
{NULL} |
79 |
|
|
}; |
80 |
|
|
|
81 |
|
|
AVFILTER_DEFINE_CLASS(apsyclip); |
82 |
|
|
|
83 |
|
✗ |
static void generate_hann_window(float *window, float *inv_window, int size) |
84 |
|
|
{ |
85 |
|
✗ |
for (int i = 0; i < size; i++) { |
86 |
|
✗ |
float value = 0.5f * (1.f - cosf(2.f * M_PI * i / size)); |
87 |
|
|
|
88 |
|
✗ |
window[i] = value; |
89 |
|
|
// 1/window to calculate unwindowed peak. |
90 |
|
✗ |
inv_window[i] = value > 0.1f ? 1.f / value : 0.f; |
91 |
|
|
} |
92 |
|
✗ |
} |
93 |
|
|
|
94 |
|
✗ |
/**
 * Build s->margin_curve (fft_size / 2 + 1 entries) from a list of control points.
 *
 * Each point is a {frequency, level} pair. A bin's frequency is computed as
 * bin * sample_rate / fft_size. Bins below the last point are interpolated
 * between the two surrounding points, and bins at or above it take the last
 * point's level. Finally every entry x is replaced by 10^(x / 20).
 *
 * @param s           filter context; fft_size and margin_curve must be set up
 * @param points      control points, ordered by increasing frequency
 * @param num_points  number of entries in points
 * @param sample_rate sample rate used to map bins to frequencies
 */
static void set_margin_curve(AudioPsyClipContext *s,
                             const int (*points)[2], int num_points, int sample_rate)
{
    const int nb_bins = s->fft_size / 2 + 1;
    float *curve = s->margin_curve;
    int bin = 0;

    curve[0] = points[0][1];

    for (int seg = 0; seg + 1 < num_points; seg++) {
        const int hz_lo = points[seg][0];
        const int db_lo = points[seg][1];
        const int hz_hi = points[seg + 1][0];
        const int db_hi = points[seg + 1][1];

        for (; bin < nb_bins; bin++) {
            const int bin_hz = bin * sample_rate / s->fft_size;

            if (bin_hz >= hz_hi)
                break; // this bin belongs to a later segment

            // interpolate between the two points; the arithmetic is done
            // entirely in integers, so the result is a whole number
            curve[bin] = db_lo + (bin_hz - hz_lo) * (db_hi - db_lo) / (hz_hi - hz_lo);
        }
    }

    // bins past the last control point keep its level
    for (; bin < nb_bins; bin++)
        curve[bin] = points[num_points - 1][1];

    // convert the whole curve to a linear amplitude scale
    for (bin = 0; bin < nb_bins; bin++)
        curve[bin] = powf(10.f, curve[bin] / 20.f);
}
119 |
|
|
|
120 |
|
✗ |
/**
 * Precompute the spreading functions used by calculate_mask_curve().
 *
 * Each spreading function is tent-shaped in log-log scale and normalized so
 * that its entries sum to 1. To keep the later masking computation cheap:
 *  - only bins close to the centre bin are considered, since the masking
 *    contribution at faraway bins is negligible;
 *  - only two functions per octave are computed and stored, and every bin in
 *    between reuses the row of the closest lower reference bin.
 *
 * Fills s->spread_table (one row of num_psy_bins entries per function, with
 * the function centred in the row), s->spread_table_range (the [start, end)
 * offsets relative to the centre bin) and s->spread_table_index (bin -> row).
 */
static void generate_spread_table(AudioPsyClipContext *s)
{
    const int nb_bins = s->num_psy_bins;
    const int centre  = nb_bins / 2;
    int step = 1;
    int row  = 0;
    int bin  = 0;

    while (bin < nb_bins) {
        // index of the entry holding offset 0 of bin "bin" in row "row",
        // minus bin, so that [offset + j] addresses absolute bin j
        const int offset = row * nb_bins + centre - bin;
        const int first  = bin * 3 / 4;
        const int last   = FFMIN(nb_bins, ((bin + 1) * 4 + 2) / 3);
        float total = 0;
        int next_bin;

        for (int j = first; j < last; j++) {
            // add 0.5 so that bin 0 doesn't produce log(0)
            const float distance = FFABS(logf((j + 0.5f) / (bin + 0.5f)));
            // masking falls off more slowly upwards (40) than downwards (80)
            const float falloff = j >= bin ? 40.f : 80.f;
            const float weight  = expf(-distance * falloff);

            total += weight;
            s->spread_table[offset + j] = weight;
        }

        // normalize the row
        for (int j = first; j < last; j++)
            s->spread_table[offset + j] /= total;

        s->spread_table_range[row][0] = first - bin;
        s->spread_table_range[row][1] = last  - bin;

        if (bin > 1) {
            // at every power of two the stride becomes half the bin number,
            // which yields two reference bins per octave
            if (!(bin & (bin - 1)))
                step = bin / 2;
            next_bin = bin + step;
        } else {
            next_bin = bin + 1;
        }

        // all bins in [bin, next_bin) share this row
        for (int k = bin; k < next_bin; k++)
            s->spread_table_index[k] = row;

        bin = next_bin;
        row++;
    }
}
184 |
|
|
|
185 |
|
✗ |
/**
 * Input link configuration: derive the analysis geometry from the sample
 * rate, then allocate and fill every table and work buffer of the filter.
 *
 * On failure the function returns immediately; whatever was allocated so far
 * stays referenced from the context, where uninit() releases it.
 *
 * @return 0 on success, AVERROR(ENOMEM) if an allocation fails, or the
 *         negative value returned by av_tx_init()
 */
static int config_input(AVFilterLink *inlink)
{
    // control points handed to set_margin_curve() as {frequency, level} pairs
    static const int points[][2] = { {0,14}, {125,14}, {250,16}, {500,18}, {1000,20}, {2000,20}, {4000,20}, {8000,17}, {16000,14}, {20000,-10} };
    static const int num_points = 10;
    AVFilterContext *ctx = inlink->dst;
    AudioPsyClipContext *s = ctx->priv;
    const int sample_rate = inlink->sample_rate;
    float scale = 1.f;
    int work_len, curve_len, ret;

    // The psy masking calculation is O(n^2), so it is skipped for frequencies
    // not covered by base sampling rates (i.e. 44k): the FFT grows with the
    // sample rate while the fraction of bins that gets masking shrinks.
    if (sample_rate > 100000) {
        s->fft_size     = 1024;
        s->num_psy_bins = s->fft_size / 8;
    } else if (sample_rate > 50000) {
        s->fft_size     = 512;
        s->num_psy_bins = s->fft_size / 4;
    } else {
        s->fft_size     = 256;
        s->num_psy_bins = s->fft_size / 2;
    }
    s->overlap = s->fft_size / 4;

    work_len  = s->fft_size * 2;     // room for fft_size interleaved complex values
    curve_len = s->fft_size / 2 + 1; // bins 0 .. fft_size / 2

    s->window     = av_calloc(s->fft_size, sizeof(*s->window));
    s->inv_window = av_calloc(s->fft_size, sizeof(*s->inv_window));
    if (!(s->window && s->inv_window))
        return AVERROR(ENOMEM);

    s->in_buffer      = ff_get_audio_buffer(inlink, work_len);
    s->in_frame       = ff_get_audio_buffer(inlink, work_len);
    s->out_dist_frame = ff_get_audio_buffer(inlink, work_len);
    s->windowed_frame = ff_get_audio_buffer(inlink, work_len);
    s->clipping_delta = ff_get_audio_buffer(inlink, work_len);
    s->spectrum_buf   = ff_get_audio_buffer(inlink, work_len);
    s->mask_curve     = ff_get_audio_buffer(inlink, curve_len);
    if (!(s->in_buffer && s->in_frame && s->out_dist_frame &&
          s->windowed_frame && s->clipping_delta && s->spectrum_buf &&
          s->mask_curve))
        return AVERROR(ENOMEM);

    generate_hann_window(s->window, s->inv_window, s->fft_size);

    s->margin_curve = av_calloc(curve_len, sizeof(*s->margin_curve));
    if (!s->margin_curve)
        return AVERROR(ENOMEM);

    // two spreading functions per octave, see generate_spread_table()
    s->spread_table_rows = av_log2(s->num_psy_bins) * 2;

    s->spread_table = av_calloc(s->spread_table_rows * s->num_psy_bins, sizeof(*s->spread_table));
    if (!s->spread_table)
        return AVERROR(ENOMEM);

    // note: each element is already an int[2], so this holds 2 * rows pairs
    s->spread_table_range = av_calloc(s->spread_table_rows * 2, sizeof(*s->spread_table_range));
    if (!s->spread_table_range)
        return AVERROR(ENOMEM);

    s->spread_table_index = av_calloc(s->num_psy_bins, sizeof(*s->spread_table_index));
    if (!s->spread_table_index)
        return AVERROR(ENOMEM);

    set_margin_curve(s, points, num_points, sample_rate);
    generate_spread_table(s);

    s->channels = inlink->ch_layout.nb_channels;

    s->tx_ctx  = av_calloc(s->channels, sizeof(*s->tx_ctx));
    s->itx_ctx = av_calloc(s->channels, sizeof(*s->itx_ctx));
    if (!(s->tx_ctx && s->itx_ctx))
        return AVERROR(ENOMEM);

    // one forward and one inverse complex FFT context per channel
    for (int ch = 0; ch < s->channels; ch++) {
        if ((ret = av_tx_init(&s->tx_ctx[ch], &s->tx_fn, AV_TX_FLOAT_FFT, 0, s->fft_size, &scale, 0)) < 0)
            return ret;

        if ((ret = av_tx_init(&s->itx_ctx[ch], &s->itx_fn, AV_TX_FLOAT_FFT, 1, s->fft_size, &scale, 0)) < 0)
            return ret;
    }

    return 0;
}
266 |
|
|
|
267 |
|
✗ |
/**
 * Multiply fft_size samples of in_frame by the analysis window.
 *
 * @param s                filter context providing window and fft_size
 * @param in_frame         samples to be windowed
 * @param out_frame        destination buffer
 * @param add_to_out_frame non-zero: accumulate into out_frame (overlap-add);
 *                         zero: overwrite out_frame
 */
static void apply_window(AudioPsyClipContext *s,
                         const float *in_frame, float *out_frame, const int add_to_out_frame)
{
    const float *win = s->window;
    const int len = s->fft_size;

    if (add_to_out_frame) {
        for (int n = 0; n < len; n++)
            out_frame[n] += in_frame[n] * win[n];
    } else {
        for (int n = 0; n < len; n++)
            out_frame[n] = in_frame[n] * win[n];
    }
}
280 |
|
|
|
281 |
|
✗ |
/**
 * Derive the masking curve of one analysis frame from its spectrum.
 *
 * For the first num_psy_bins bins, each bin's magnitude is smeared over its
 * neighbours with the precomputed spreading functions. The remaining bins
 * simply take their own magnitude. Every entry is finally divided by the
 * matching margin_curve entry.
 *
 * @param s          filter context holding the spread tables and margin curve
 * @param spectrum   interleaved re/im output of the forward transform
 * @param mask_curve receives fft_size / 2 + 1 values
 */
static void calculate_mask_curve(AudioPsyClipContext *s,
                                 const float *spectrum, float *mask_curve)
{
    const int nyquist = s->fft_size / 2;
    const int nb_psy  = s->num_psy_bins;
    const int centre  = nb_psy / 2;

    for (int k = 0; k <= nyquist; k++)
        mask_curve[k] = 0;

    for (int bin = 0; bin < nb_psy; bin++) {
        float magnitude;
        int row, row_offset, first, last;

        if (!bin) {
            magnitude = FFABS(spectrum[0]);
        } else if (bin == nyquist) {
            magnitude = FFABS(spectrum[s->fft_size]);
        } else {
            // Because the input signal is real, the + and - frequencies are redundant.
            // Multiply the magnitude by 2 to simulate adding up the + and - frequencies.
            magnitude = hypotf(spectrum[2 * bin], spectrum[2 * bin + 1]) * 2;
        }

        row   = s->spread_table_index[bin];
        first = FFMAX(0,      bin + s->spread_table_range[row][0]);
        last  = FFMIN(nb_psy, bin + s->spread_table_range[row][1]);

        // spreading functions are stored centred in their row
        row_offset = row * nb_psy + centre - bin;

        for (int j = first; j < last; j++)
            mask_curve[j] += s->spread_table[row_offset + j] * magnitude;
    }

    // for ultrasonic frequencies, skip the O(n^2) spread calculation and just copy the magnitude
    for (int bin = nb_psy; bin <= nyquist; bin++) {
        if (bin == nyquist) {
            mask_curve[bin] = FFABS(spectrum[s->fft_size]);
        } else {
            // doubled for the same reason as above: the mirrored negative
            // frequency carries the same energy
            mask_curve[bin] = hypotf(spectrum[2 * bin], spectrum[2 * bin + 1]) * 2;
        }
    }

    for (int k = 0; k <= nyquist; k++)
        mask_curve[k] = mask_curve[k] / s->margin_curve[k];
}
330 |
|
|
|
331 |
|
✗ |
/**
 * Update clipping_delta so that windowed_frame + clipping_delta moves towards
 * the range [-clip_level * window[n], +clip_level * window[n]].
 *
 * Samples already inside that range are left untouched. For the others, the
 * distance to the nearest limit, scaled by delta_boost, is added to the delta.
 *
 * @param s              filter context providing window, clip_level, fft_size
 * @param windowed_frame windowed input samples
 * @param clipping_delta correction signal, updated in place
 * @param delta_boost    factor applied to each correction
 */
static void clip_to_window(AudioPsyClipContext *s,
                           const float *windowed_frame, float *clipping_delta, float delta_boost)
{
    const float *win = s->window;

    for (int n = 0; n < s->fft_size; n++) {
        const float ceiling = s->clip_level * win[n];
        const float current = windowed_frame[n] + clipping_delta[n];
        float correction;

        if (current > ceiling)
            correction = ceiling - current;
        else if (current < -ceiling)
            correction = -ceiling - current;
        else
            continue;

        clipping_delta[n] += correction * delta_boost;
    }
}
347 |
|
|
|
348 |
|
✗ |
/**
 * Scale down every bin of the clipping-distortion spectrum whose magnitude
 * exceeds the mask curve, so that it ends up exactly at the mask level.
 *
 * @param s             filter context providing fft_size
 * @param clip_spectrum interleaved re/im spectrum of the clipping delta,
 *                      modified in place
 * @param mask_curve    allowed magnitude for bins 0 .. fft_size / 2
 */
static void limit_clip_spectrum(AudioPsyClipContext *s,
                                float *clip_spectrum, const float *mask_curve)
{
    const int fft_size = s->fft_size;
    const int nyquist  = fft_size / 2;
    float excess;

    // bin 0: only the real part is considered
    excess = FFABS(clip_spectrum[0]) / mask_curve[0];
    if (excess > 1.f)
        clip_spectrum[0] /= excess;

    // bin 1..N/2-1, together with the mirrored bin N-1..N/2+1
    for (int bin = 1; bin < nyquist; bin++) {
        float *pos = &clip_spectrum[bin * 2];
        float *neg = &clip_spectrum[fft_size * 2 - bin * 2];

        // Because the input signal is real, the + and - frequencies are redundant.
        // Multiply the magnitude by 2 to simulate adding up the + and - frequencies.
        excess = hypotf(pos[0], pos[1]) * 2 / mask_curve[bin];
        if (excess > 1.f) {
            pos[0] /= excess;
            pos[1] /= excess;
            neg[0] /= excess;
            neg[1] /= excess;
        }
    }

    // bin N/2: only the real part is considered
    excess = FFABS(clip_spectrum[fft_size]) / mask_curve[nyquist];
    if (excess > 1.f)
        clip_spectrum[fft_size] /= excess;
}
376 |
|
|
|
377 |
|
✗ |
/**
 * Expand size real samples in place into size interleaved complex values
 * with a zero imaginary part.
 *
 * @param buffer holds size samples on entry and must have room for 2 * size
 *               floats
 * @param size   number of real samples
 */
static void r2c(float *buffer, int size)
{
    // Walk backwards: the slots written for sample n (2n and 2n + 1) lie
    // above every sample that still has to be read.
    for (int n = size; n-- > 0;) {
        buffer[2 * n]     = buffer[n];
        buffer[2 * n + 1] = 0.f;
    }
}
385 |
|
|
|
386 |
|
✗ |
/**
 * Collapse size interleaved complex values in place to their real parts and
 * clear the upper half of the buffer.
 *
 * @param buffer holds 2 * size floats; on return the first size entries are
 *               the real parts and the last size entries are zero
 * @param size   number of complex values
 */
static void c2r(float *buffer, int size)
{
    int re = 0;

    // Walk forwards: the real part of value n sits at 2n, which is never
    // below the slot n being written.
    for (int n = 0; n < size; n++) {
        buffer[n] = buffer[re];
        re += 2;
    }

    for (int n = size; n < 2 * size; n++)
        buffer[n] = 0.f;
}
394 |
|
|
|
395 |
|
✗ |
/**
 * Process one hop of s->overlap samples for a single channel.
 *
 * The new samples are appended to the sliding analysis frame, a clipping
 * correction that respects the psychoacoustic mask is computed iteratively,
 * and the correction is overlap-added into out_dist_frame. The oldest
 * s->overlap samples are then emitted.
 *
 * @param ctx            filter context
 * @param ch             channel index, selects the transform contexts
 * @param in_samples     s->overlap new input samples
 * @param out_samples    receives s->overlap output samples
 * @param diff_only      non-zero: output only the distortion signal
 * @param in_frame       sliding window holding the last fft_size input samples
 * @param out_dist_frame overlap-add accumulator of the distortion signal
 * @param windowed_frame scratch, 2 * fft_size floats
 * @param clipping_delta scratch, 2 * fft_size floats
 * @param spectrum_buf   scratch, 2 * fft_size floats
 * @param mask_curve     scratch, fft_size / 2 + 1 floats
 */
static void feed(AVFilterContext *ctx, int ch,
                 const float *in_samples, float *out_samples, int diff_only,
                 float *in_frame, float *out_dist_frame,
                 float *windowed_frame, float *clipping_delta,
                 float *spectrum_buf, float *mask_curve)
{
    AudioPsyClipContext *s = ctx->priv;
    const int fft_size = s->fft_size;
    const int hop      = s->overlap;
    const int kept     = fft_size - hop;
    const int nb_iter  = s->iterations;
    // first iteration belonging to the last 1/3 of rounds
    const int boost_start = nb_iter - nb_iter / 3;
    const float clip_level_inv = 1.f / s->clip_level;
    const float level_out = s->level_out;
    float orig_peak = 0;
    float peak;

    // shift in/out buffers by one hop ...
    for (int n = 0; n < kept; n++) {
        in_frame[n]       = in_frame[n + hop];
        out_dist_frame[n] = out_dist_frame[n + hop];
    }

    // ... and append the new samples / clear the freed accumulator tail
    for (int n = 0; n < hop; n++) {
        in_frame[kept + n]       = in_samples[n];
        out_dist_frame[kept + n] = 0.f;
    }

    // spectrum of the windowed input, used to derive the mask curve
    apply_window(s, in_frame, windowed_frame, 0);
    r2c(windowed_frame, fft_size);
    s->tx_fn(s->tx_ctx[ch], spectrum_buf, windowed_frame, sizeof(AVComplexFloat));
    c2r(windowed_frame, fft_size);
    calculate_mask_curve(s, spectrum_buf, mask_curve);

    // It would be easier to calculate the peak from the unwindowed input.
    // This is just for consistency with the clipped peak calculation
    // because the inv_window zeros out samples on the edge of the window.
    for (int n = 0; n < fft_size; n++) {
        const float magnitude = FFABS(windowed_frame[n] * s->inv_window[n]);

        orig_peak = FFMAX(orig_peak, magnitude);
    }
    orig_peak *= clip_level_inv;
    peak = orig_peak;

    // clear clipping_delta
    for (int n = 0; n < fft_size * 2; n++)
        clipping_delta[n] = 0.f;

    // repeat clipping-filtering process a few times to control both the peaks and the spectrum
    for (int pass = 0; pass < nb_iter; pass++) {
        float mask_curve_shift = 1.122f; // 1.122 is 1dB
        // The last 1/3 of rounds have boosted delta to help reach the peak
        // target faster; boosting the delta when large peaks are still
        // present is dangerous, hence the extra peak test.
        const float delta_boost = (pass >= boost_start && peak < 2.f) ? 2.f : 1.f;

        clip_to_window(s, windowed_frame, clipping_delta, delta_boost);

        // limit the distortion spectrum to the mask, then go back to time domain
        r2c(clipping_delta, fft_size);
        s->tx_fn(s->tx_ctx[ch], spectrum_buf, clipping_delta, sizeof(AVComplexFloat));

        limit_clip_spectrum(s, spectrum_buf, mask_curve);

        s->itx_fn(s->itx_ctx[ch], clipping_delta, spectrum_buf, sizeof(AVComplexFloat));
        c2r(clipping_delta, fft_size);

        // undo the transform gain and measure the peak that is left
        peak = 0;
        for (int n = 0; n < fft_size; n++) {
            float magnitude;

            clipping_delta[n] /= fft_size;
            magnitude = FFABS((windowed_frame[n] + clipping_delta[n]) * s->inv_window[n]);
            peak = FFMAX(peak, magnitude);
        }
        peak *= clip_level_inv;

        // Automatically adjust mask_curve as necessary to reach peak target
        if (orig_peak > 1.f && peak > 1.f) {
            const float diff_achieved = orig_peak - peak;

            if (pass + 1 < boost_start && diff_achieved > 0) {
                const float diff_needed = orig_peak - 1.f;
                float diff_ratio = diff_needed / diff_achieved;

                // If a good amount of peak reduction was already achieved,
                // don't shift the mask_curve by the full peak value
                // On the other hand, if only a little peak reduction was achieved,
                // don't shift the mask_curve by the enormous diff_ratio.
                diff_ratio = FFMIN(diff_ratio, peak);
                mask_curve_shift = FFMAX(mask_curve_shift, diff_ratio);
            } else {
                // If the peak got higher than the input or we are in the last 1/3 rounds,
                // go back to the heavy-handed peak heuristic.
                mask_curve_shift = FFMAX(mask_curve_shift, peak);
            }
        }

        mask_curve_shift = 1.f + (mask_curve_shift - 1.f) * s->adaptive;

        // Be less strict in the next iteration.
        // This helps with peak control.
        for (int bin = 0; bin < fft_size / 2 + 1; bin++)
            mask_curve[bin] *= mask_curve_shift;
    }

    // do overlap & add
    apply_window(s, clipping_delta, out_dist_frame, 1);

    if (ctx->is_disabled) {
        // timeline-disabled: pass the (delayed) input through untouched
        for (int n = 0; n < hop; n++)
            out_samples[n] = in_frame[n];
        return;
    }

    for (int n = 0; n < hop; n++) {
        // 4 times overlap with squared hanning window results in 1.5 time increase in amplitude
        out_samples[n] = out_dist_frame[n] / 1.5f;
        if (!diff_only)
            out_samples[n] += in_frame[n];
        if (s->auto_level)
            out_samples[n] *= clip_level_inv;
        out_samples[n] *= level_out;
    }
}
509 |
|
|
|
510 |
|
✗ |
/**
 * Run the clipper on one channel of the current input frame.
 *
 * The input samples are scaled by level_in into the channel's staging buffer
 * and handed to feed(), which writes s->overlap samples to the output frame.
 *
 * @param ctx filter context
 * @param in  input frame; may hold fewer than s->overlap samples
 * @param out output frame with room for s->overlap samples per channel
 * @param ch  channel index
 * @return always 0
 */
static int psy_channel(AVFilterContext *ctx, AVFrame *in, AVFrame *out, int ch)
{
    AudioPsyClipContext *s = ctx->priv;
    const float *src = (const float *)in->extended_data[ch];
    float *in_buffer = (float *)s->in_buffer->extended_data[ch];
    float *dst = (float *)out->extended_data[ch];
    // The final frame before EOF can be shorter than one hop; never read
    // past the samples it actually carries.
    const int nb_valid = FFMIN(in->nb_samples, s->overlap);

    for (int n = 0; n < nb_valid; n++)
        in_buffer[n] = src[n] * s->level_in;

    // feed() always consumes a full hop, so pad a short frame with silence
    for (int n = nb_valid; n < s->overlap; n++)
        in_buffer[n] = 0.f;

    feed(ctx, ch, in_buffer, dst, s->diff_only,
         (float *)(s->in_frame->extended_data[ch]),
         (float *)(s->out_dist_frame->extended_data[ch]),
         (float *)(s->windowed_frame->extended_data[ch]),
         (float *)(s->clipping_delta->extended_data[ch]),
         (float *)(s->spectrum_buf->extended_data[ch]),
         (float *)(s->mask_curve->extended_data[ch]));

    return 0;
}
530 |
|
|
|
531 |
|
✗ |
/**
 * Slice-threading job: process the share of channels assigned to this job.
 *
 * @param ctx     filter context; the input frame is taken from its private
 *                context (s->in)
 * @param arg     the output AVFrame
 * @param jobnr   index of this job
 * @param nb_jobs total number of jobs
 * @return always 0
 */
static int psy_channels(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{
    AudioPsyClipContext *s = ctx->priv;
    AVFrame *out = arg;
    // channels are split into nb_jobs contiguous ranges
    int ch = (out->ch_layout.nb_channels * jobnr) / nb_jobs;
    const int stop = (out->ch_layout.nb_channels * (jobnr+1)) / nb_jobs;

    while (ch < stop) {
        psy_channel(ctx, s->in, out, ch);
        ch++;
    }

    return 0;
}
543 |
|
|
|
544 |
|
✗ |
/**
 * Process one input frame and send the result downstream.
 *
 * Takes ownership of in, which is freed on every path.
 *
 * @param inlink input link the frame arrived on
 * @param in     input frame
 * @return 0 on success, a negative error code otherwise
 */
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
{
    AVFilterContext *ctx = inlink->dst;
    AVFilterLink *outlink = ctx->outputs[0];
    AudioPsyClipContext *s = ctx->priv;
    AVFrame *out = ff_get_audio_buffer(outlink, s->overlap);
    int ret;

    if (out) {
        // the worker jobs pick the input frame up from the context
        s->in = in;
        av_frame_copy_props(out, in);
        ff_filter_execute(ctx, psy_channels, out, NULL,
                          FFMIN(outlink->ch_layout.nb_channels, ff_filter_get_nb_threads(ctx)));

        out->pts = in->pts;
        out->nb_samples = in->nb_samples;
        ret = ff_filter_frame(outlink, out);
    } else {
        ret = AVERROR(ENOMEM);
    }

    av_frame_free(&in);
    s->in = NULL;

    // positive return values are folded into plain success
    return FFMIN(ret, 0);
}
571 |
|
|
|
572 |
|
✗ |
/**
 * Scheduling callback: consume input in chunks of exactly s->overlap samples
 * and forward status changes between the two links.
 *
 * @return 0 or the result of filter_frame() on success, a negative error
 *         code otherwise
 */
static int activate(AVFilterContext *ctx)
{
    AVFilterLink *inlink = ctx->inputs[0];
    AVFilterLink *outlink = ctx->outputs[0];
    AudioPsyClipContext *s = ctx->priv;
    AVFrame *in = NULL;
    int64_t pts;
    int status;
    int ret = 0;

    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);

    ret = ff_inlink_consume_samples(inlink, s->overlap, s->overlap, &in);
    if (ret < 0)
        return ret;

    // a frame was obtained: process it right away
    if (ret > 0)
        return filter_frame(inlink, in);

    // nothing to process: propagate a pending input status downstream
    if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
        ff_outlink_set_status(outlink, status, pts);
        return 0;
    }

    if (ff_inlink_queued_samples(inlink) >= s->overlap) {
        // enough samples are queued for another round
        ff_filter_set_ready(ctx, 10);
    } else if (ff_outlink_frame_wanted(outlink)) {
        ff_inlink_request_frame(inlink);
    }

    return 0;
}
601 |
|
|
|
602 |
|
✗ |
/**
 * Release everything config_input() allocated: tables, work frames and the
 * per-channel transform contexts.
 */
static av_cold void uninit(AVFilterContext *ctx)
{
    AudioPsyClipContext *s = ctx->priv;

    // lookup tables
    av_freep(&s->window);
    av_freep(&s->inv_window);
    av_freep(&s->spread_table);
    av_freep(&s->spread_table_range);
    av_freep(&s->spread_table_index);
    av_freep(&s->margin_curve);

    // per-channel work buffers
    av_frame_free(&s->in_buffer);
    av_frame_free(&s->in_frame);
    av_frame_free(&s->out_dist_frame);
    av_frame_free(&s->windowed_frame);
    av_frame_free(&s->clipping_delta);
    av_frame_free(&s->spectrum_buf);
    av_frame_free(&s->mask_curve);

    // either context array may be missing if configuration failed early
    if (s->tx_ctx) {
        for (int ch = 0; ch < s->channels; ch++)
            av_tx_uninit(&s->tx_ctx[ch]);
    }
    if (s->itx_ctx) {
        for (int ch = 0; ch < s->channels; ch++)
            av_tx_uninit(&s->itx_ctx[ch]);
    }

    av_freep(&s->tx_ctx);
    av_freep(&s->itx_ctx);
}
631 |
|
|
|
632 |
|
|
static const AVFilterPad inputs[] = { |
633 |
|
|
{ |
634 |
|
|
.name = "default", |
635 |
|
|
.type = AVMEDIA_TYPE_AUDIO, |
636 |
|
|
.config_props = config_input, |
637 |
|
|
}, |
638 |
|
|
}; |
639 |
|
|
|
640 |
|
|
const FFFilter ff_af_apsyclip = { |
641 |
|
|
.p.name = "apsyclip", |
642 |
|
|
.p.description = NULL_IF_CONFIG_SMALL("Audio Psychoacoustic Clipper."), |
643 |
|
|
.p.priv_class = &apsyclip_class, |
644 |
|
|
.p.flags = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL | |
645 |
|
|
AVFILTER_FLAG_SLICE_THREADS, |
646 |
|
|
.priv_size = sizeof(AudioPsyClipContext), |
647 |
|
|
.uninit = uninit, |
648 |
|
|
FILTER_INPUTS(inputs), |
649 |
|
|
FILTER_OUTPUTS(ff_audio_default_filterpad), |
650 |
|
|
FILTER_SINGLE_SAMPLEFMT(AV_SAMPLE_FMT_FLTP), |
651 |
|
|
.activate = activate, |
652 |
|
|
.process_command = ff_filter_process_command, |
653 |
|
|
}; |
654 |
|
|
|