Line |
Branch |
Exec |
Source |
1 |
|
|
/* |
2 |
|
|
* Copyright (c) 2020 Paul B Mahol |
3 |
|
|
* |
4 |
|
|
* Speech Normalizer |
5 |
|
|
* |
6 |
|
|
* This file is part of FFmpeg. |
7 |
|
|
* |
8 |
|
|
* FFmpeg is free software; you can redistribute it and/or |
9 |
|
|
* modify it under the terms of the GNU Lesser General Public |
10 |
|
|
* License as published by the Free Software Foundation; either |
11 |
|
|
* version 2.1 of the License, or (at your option) any later version. |
12 |
|
|
* |
13 |
|
|
* FFmpeg is distributed in the hope that it will be useful, |
14 |
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 |
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
16 |
|
|
* Lesser General Public License for more details. |
17 |
|
|
* |
18 |
|
|
* You should have received a copy of the GNU Lesser General Public |
19 |
|
|
* License along with FFmpeg; if not, write to the Free Software |
20 |
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
21 |
|
|
*/ |
22 |
|
|
|
23 |
|
|
/** |
24 |
|
|
* @file |
25 |
|
|
* Speech Normalizer |
26 |
|
|
*/ |
27 |
|
|
|
28 |
|
|
#include <float.h> |
29 |
|
|
|
30 |
|
|
#include "libavutil/avassert.h" |
31 |
|
|
#include "libavutil/channel_layout.h" |
32 |
|
|
#include "libavutil/mem.h" |
33 |
|
|
#include "libavutil/opt.h" |
34 |
|
|
|
35 |
|
|
#define FF_BUFQUEUE_SIZE (1024) |
36 |
|
|
#include "bufferqueue.h" |
37 |
|
|
|
38 |
|
|
#include "audio.h" |
39 |
|
|
#include "avfilter.h" |
40 |
|
|
#include "filters.h" |
41 |
|
|
#include "internal.h" |
42 |
|
|
|
43 |
|
|
#define MAX_ITEMS 882000 |
44 |
|
|
#define MIN_PEAK (1. / 32768.) |
45 |
|
|
|
/**
 * One analysed period of a channel, filled in by analyze_channel_*().
 *
 * A period is closed (type set to 1) on a sign change once its peak has
 * reached the minimum peak, or as soon as it grows past max_period samples.
 */
typedef struct PeriodItem {
    int size;        ///< number of samples accumulated in this period
    int type;        ///< 0 while the period is still being filled, 1 once it is closed
    double max_peak; ///< largest absolute sample value accumulated so far
    double rms_sum;  ///< running sum of squared sample values
} PeriodItem;
52 |
|
|
|
/**
 * Per-channel analysis and gain state.
 *
 * pi[] is used as a ring buffer: analysis appends at pi_end, filtering
 * consumes from pi_start, and both indices wrap at MAX_ITEMS.
 */
typedef struct ChannelContext {
    int state;                ///< -1 until the first sample is analysed, then 1 for a non-negative run, 0 for a negative run
    int bypass;               ///< set in linked mode when the channel is not part of the selected layout
    PeriodItem pi[MAX_ITEMS]; ///< ring buffer of analysed periods
    double gain_state;        ///< gain of the period currently being consumed
    double pi_max_peak;       ///< max_peak of the period currently being consumed
    double pi_rms_sum;        ///< rms_sum of the period currently being consumed
    int pi_start;             ///< index of the next period to consume
    int pi_end;               ///< index of the period currently being filled by analysis
    int pi_size;              ///< samples of the current period not yet consumed
} ChannelContext;
64 |
|
|
|
/**
 * Private context of the speechnorm filter.
 */
typedef struct SpeechNormalizerContext {
    const AVClass *class;

    /* user options, see speechnorm_options[] */
    double rms_value;        ///< target RMS value, only used when above DBL_EPSILON
    double peak_value;       ///< target peak value
    double max_expansion;    ///< upper bound of the gain
    double max_compression;  ///< its reciprocal is the lower bound of the gain
    double threshold_value;  ///< period peak compared against this to pick raise or fall
    double raise_amount;     ///< amount added to the gain per period when raising
    double fall_amount;      ///< amount subtracted from the gain per period when falling
    char *ch_layout_str;     ///< "all" or a channel layout string selecting channels to filter
    AVChannelLayout ch_layout; ///< layout of the channels to filter, rebuilt in activate()
    int invert;              ///< invert the threshold comparison
    int link;                ///< index into filter_channels[]: 0 independent, 1 linked channels

    ChannelContext *cc;      ///< one entry per input channel, allocated in config_input()
    double prev_gain;        ///< gain reached at the end of the previous linked segment

    int max_period;          ///< maximum period length in samples (sample_rate / 10)
    int eof;                 ///< set once the input has signalled EOF
    int64_t pts;             ///< timestamp just past the last frame sent downstream

    struct FFBufQueue queue; ///< input frames that were analysed but not yet filtered

    /** Analyse one channel of a frame, selected by sample format. */
    void (*analyze_channel)(AVFilterContext *ctx, ChannelContext *cc,
                            const uint8_t *srcp, int nb_samples);
    /** Apply the gain to a frame; [0] is per-channel, [1] is linked. */
    void (*filter_channels[2])(AVFilterContext *ctx,
                               AVFrame *in, AVFrame *out, int nb_samples);
} SpeechNormalizerContext;
94 |
|
|
|
/* Byte offset of an option field inside the private context. */
#define OFFSET(x) offsetof(SpeechNormalizerContext, x)
/* All options are audio filtering parameters and are flagged as runtime parameters. */
#define FLAGS AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_RUNTIME_PARAM

/*
 * Option table. Every option is listed twice: once under its long name and
 * once under a one-letter alias that targets the same field with the same
 * default and range.
 */
static const AVOption speechnorm_options[] = {
    { "peak", "set the peak value", OFFSET(peak_value), AV_OPT_TYPE_DOUBLE, {.dbl=0.95}, 0.0, 1.0, FLAGS },
    { "p", "set the peak value", OFFSET(peak_value), AV_OPT_TYPE_DOUBLE, {.dbl=0.95}, 0.0, 1.0, FLAGS },
    { "expansion", "set the max expansion factor", OFFSET(max_expansion), AV_OPT_TYPE_DOUBLE, {.dbl=2.0}, 1.0, 50.0, FLAGS },
    { "e", "set the max expansion factor", OFFSET(max_expansion), AV_OPT_TYPE_DOUBLE, {.dbl=2.0}, 1.0, 50.0, FLAGS },
    { "compression", "set the max compression factor", OFFSET(max_compression), AV_OPT_TYPE_DOUBLE, {.dbl=2.0}, 1.0, 50.0, FLAGS },
    { "c", "set the max compression factor", OFFSET(max_compression), AV_OPT_TYPE_DOUBLE, {.dbl=2.0}, 1.0, 50.0, FLAGS },
    { "threshold", "set the threshold value", OFFSET(threshold_value), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0.0, 1.0, FLAGS },
    { "t", "set the threshold value", OFFSET(threshold_value), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0.0, 1.0, FLAGS },
    { "raise", "set the expansion raising amount", OFFSET(raise_amount), AV_OPT_TYPE_DOUBLE, {.dbl=0.001}, 0.0, 1.0, FLAGS },
    { "r", "set the expansion raising amount", OFFSET(raise_amount), AV_OPT_TYPE_DOUBLE, {.dbl=0.001}, 0.0, 1.0, FLAGS },
    { "fall", "set the compression raising amount", OFFSET(fall_amount), AV_OPT_TYPE_DOUBLE, {.dbl=0.001}, 0.0, 1.0, FLAGS },
    { "f", "set the compression raising amount", OFFSET(fall_amount), AV_OPT_TYPE_DOUBLE, {.dbl=0.001}, 0.0, 1.0, FLAGS },
    { "channels", "set channels to filter", OFFSET(ch_layout_str), AV_OPT_TYPE_STRING, {.str="all"}, 0, 0, FLAGS },
    { "h", "set channels to filter", OFFSET(ch_layout_str), AV_OPT_TYPE_STRING, {.str="all"}, 0, 0, FLAGS },
    { "invert", "set inverted filtering", OFFSET(invert), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
    { "i", "set inverted filtering", OFFSET(invert), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
    { "link", "set linked channels filtering", OFFSET(link), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
    { "l", "set linked channels filtering", OFFSET(link), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
    { "rms", "set the RMS value", OFFSET(rms_value), AV_OPT_TYPE_DOUBLE, {.dbl=0.0}, 0.0, 1.0, FLAGS },
    { "m", "set the RMS value", OFFSET(rms_value), AV_OPT_TYPE_DOUBLE, {.dbl=0.0}, 0.0, 1.0, FLAGS },
    { NULL }
};

AVFILTER_DEFINE_CLASS(speechnorm);
123 |
|
|
|
124 |
|
✗ |
static int get_pi_samples(PeriodItem *pi, int start, int end, int remain) |
125 |
|
|
{ |
126 |
|
|
int sum; |
127 |
|
|
|
128 |
|
✗ |
if (pi[start].type == 0) |
129 |
|
✗ |
return remain; |
130 |
|
|
|
131 |
|
✗ |
sum = remain; |
132 |
|
✗ |
while (start != end) { |
133 |
|
✗ |
start++; |
134 |
|
✗ |
if (start >= MAX_ITEMS) |
135 |
|
✗ |
start = 0; |
136 |
|
✗ |
if (pi[start].type == 0) |
137 |
|
✗ |
break; |
138 |
|
|
av_assert1(pi[start].size > 0); |
139 |
|
✗ |
sum += pi[start].size; |
140 |
|
|
} |
141 |
|
|
|
142 |
|
✗ |
return sum; |
143 |
|
|
} |
144 |
|
|
|
145 |
|
✗ |
static int available_samples(AVFilterContext *ctx) |
146 |
|
|
{ |
147 |
|
✗ |
SpeechNormalizerContext *s = ctx->priv; |
148 |
|
✗ |
AVFilterLink *inlink = ctx->inputs[0]; |
149 |
|
|
int min_pi_nb_samples; |
150 |
|
|
|
151 |
|
✗ |
min_pi_nb_samples = get_pi_samples(s->cc[0].pi, s->cc[0].pi_start, s->cc[0].pi_end, s->cc[0].pi_size); |
152 |
|
✗ |
for (int ch = 1; ch < inlink->ch_layout.nb_channels && min_pi_nb_samples > 0; ch++) { |
153 |
|
✗ |
ChannelContext *cc = &s->cc[ch]; |
154 |
|
|
|
155 |
|
✗ |
min_pi_nb_samples = FFMIN(min_pi_nb_samples, get_pi_samples(cc->pi, cc->pi_start, cc->pi_end, cc->pi_size)); |
156 |
|
|
} |
157 |
|
|
|
158 |
|
✗ |
return min_pi_nb_samples; |
159 |
|
|
} |
160 |
|
|
|
161 |
|
✗ |
static void consume_pi(ChannelContext *cc, int nb_samples) |
162 |
|
|
{ |
163 |
|
✗ |
if (cc->pi_size >= nb_samples) { |
164 |
|
✗ |
cc->pi_size -= nb_samples; |
165 |
|
|
} else { |
166 |
|
|
av_assert1(0); |
167 |
|
|
} |
168 |
|
✗ |
} |
169 |
|
|
|
170 |
|
✗ |
static double next_gain(AVFilterContext *ctx, double pi_max_peak, int bypass, double state, |
171 |
|
|
double pi_rms_sum, int pi_size) |
172 |
|
|
{ |
173 |
|
✗ |
SpeechNormalizerContext *s = ctx->priv; |
174 |
|
✗ |
const double compression = 1. / s->max_compression; |
175 |
|
✗ |
const int type = s->invert ? pi_max_peak <= s->threshold_value : pi_max_peak >= s->threshold_value; |
176 |
|
✗ |
double expansion = FFMIN(s->max_expansion, s->peak_value / pi_max_peak); |
177 |
|
|
|
178 |
|
✗ |
if (s->rms_value > DBL_EPSILON) |
179 |
|
✗ |
expansion = FFMIN(expansion, s->rms_value / sqrt(pi_rms_sum / pi_size)); |
180 |
|
|
|
181 |
|
✗ |
if (bypass) { |
182 |
|
✗ |
return 1.; |
183 |
|
✗ |
} else if (type) { |
184 |
|
✗ |
return FFMIN(expansion, state + s->raise_amount); |
185 |
|
|
} else { |
186 |
|
✗ |
return FFMIN(expansion, FFMAX(compression, state - s->fall_amount)); |
187 |
|
|
} |
188 |
|
|
} |
189 |
|
|
|
190 |
|
✗ |
static void next_pi(AVFilterContext *ctx, ChannelContext *cc, int bypass) |
191 |
|
|
{ |
192 |
|
|
av_assert1(cc->pi_size >= 0); |
193 |
|
✗ |
if (cc->pi_size == 0) { |
194 |
|
✗ |
SpeechNormalizerContext *s = ctx->priv; |
195 |
|
✗ |
int start = cc->pi_start; |
196 |
|
|
|
197 |
|
|
av_assert1(cc->pi[start].size > 0); |
198 |
|
✗ |
av_assert0(cc->pi[start].type > 0 || s->eof); |
199 |
|
✗ |
cc->pi_size = cc->pi[start].size; |
200 |
|
✗ |
cc->pi_rms_sum = cc->pi[start].rms_sum; |
201 |
|
✗ |
cc->pi_max_peak = cc->pi[start].max_peak; |
202 |
|
|
av_assert1(cc->pi_start != cc->pi_end || s->eof); |
203 |
|
✗ |
start++; |
204 |
|
✗ |
if (start >= MAX_ITEMS) |
205 |
|
✗ |
start = 0; |
206 |
|
✗ |
cc->pi_start = start; |
207 |
|
✗ |
cc->gain_state = next_gain(ctx, cc->pi_max_peak, bypass, cc->gain_state, |
208 |
|
|
cc->pi_rms_sum, cc->pi_size); |
209 |
|
|
} |
210 |
|
✗ |
} |
211 |
|
|
|
212 |
|
✗ |
static double min_gain(AVFilterContext *ctx, ChannelContext *cc, int max_size) |
213 |
|
|
{ |
214 |
|
✗ |
SpeechNormalizerContext *s = ctx->priv; |
215 |
|
✗ |
double min_gain = s->max_expansion; |
216 |
|
✗ |
double gain_state = cc->gain_state; |
217 |
|
✗ |
int size = cc->pi_size; |
218 |
|
✗ |
int idx = cc->pi_start; |
219 |
|
|
|
220 |
|
✗ |
min_gain = FFMIN(min_gain, gain_state); |
221 |
|
✗ |
while (size <= max_size) { |
222 |
|
✗ |
if (idx == cc->pi_end) |
223 |
|
✗ |
break; |
224 |
|
✗ |
gain_state = next_gain(ctx, cc->pi[idx].max_peak, 0, gain_state, |
225 |
|
|
cc->pi[idx].rms_sum, cc->pi[idx].size); |
226 |
|
✗ |
min_gain = FFMIN(min_gain, gain_state); |
227 |
|
✗ |
size += cc->pi[idx].size; |
228 |
|
✗ |
idx++; |
229 |
|
✗ |
if (idx >= MAX_ITEMS) |
230 |
|
✗ |
idx = 0; |
231 |
|
|
} |
232 |
|
|
|
233 |
|
✗ |
return min_gain; |
234 |
|
|
} |
235 |
|
|
|
/*
 * Generate analyze_channel_<name>(): scan nb_samples samples of type ptype
 * from srcp and append them to the channel's period ring buffer.
 *
 * - On the first call (state < 0) the sign state is taken from src[0].
 *   NOTE(review): src[0] is read unconditionally there, which assumes
 *   nb_samples > 0 - confirm callers never pass empty frames.
 * - Whenever the sign of the next sample differs from the current state, or
 *   the open period has grown past max_period, the state is updated. The open
 *   period is then closed (type = 1) if its peak reached min_peak or it grew
 *   past max_period, and a new open period is started at the next index
 *   (wrapping at MAX_ITEMS). After a sign change the new period starts with
 *   max_peak = DBL_MIN and rms_sum = 0; otherwise it inherits the peak and
 *   RMS sum of the period just closed. A period that is not closed simply
 *   keeps accumulating under the new sign.
 * - The following run of same-sign samples is then accumulated into the open
 *   period (peak of absolute values, sum of squares, sample count).
 *
 * cc->pi_end is written back once at the end.
 */
#define ANALYZE_CHANNEL(name, ptype, zero, min_peak) \
static void analyze_channel_## name (AVFilterContext *ctx, ChannelContext *cc, \
                                     const uint8_t *srcp, int nb_samples) \
{ \
    SpeechNormalizerContext *s = ctx->priv; \
    const ptype *src = (const ptype *)srcp; \
    const int max_period = s->max_period; \
    PeriodItem *pi = (PeriodItem *)&cc->pi; \
    int pi_end = cc->pi_end; \
    int n = 0; \
    \
    if (cc->state < 0) \
        cc->state = src[0] >= zero; \
    \
    while (n < nb_samples) { \
        ptype new_max_peak; \
        ptype new_rms_sum; \
        int new_size; \
        \
        if ((cc->state != (src[n] >= zero)) || \
            (pi[pi_end].size > max_period)) { \
            ptype max_peak = pi[pi_end].max_peak; \
            ptype rms_sum = pi[pi_end].rms_sum; \
            int state = cc->state; \
            \
            cc->state = src[n] >= zero; \
            av_assert1(pi[pi_end].size > 0); \
            if (max_peak >= min_peak || \
                pi[pi_end].size > max_period) { \
                pi[pi_end].type = 1; \
                pi_end++; \
                if (pi_end >= MAX_ITEMS) \
                    pi_end = 0; \
                if (cc->state != state) { \
                    pi[pi_end].max_peak = DBL_MIN; \
                    pi[pi_end].rms_sum = 0.0; \
                } else { \
                    pi[pi_end].max_peak = max_peak; \
                    pi[pi_end].rms_sum = rms_sum; \
                } \
                pi[pi_end].type = 0; \
                pi[pi_end].size = 0; \
                av_assert1(pi_end != cc->pi_start); \
            } \
        } \
        \
        new_max_peak = pi[pi_end].max_peak; \
        new_rms_sum = pi[pi_end].rms_sum; \
        new_size = pi[pi_end].size; \
        if (cc->state) { \
            while (src[n] >= zero) { \
                new_max_peak = FFMAX(new_max_peak, src[n]); \
                new_rms_sum += src[n] * src[n]; \
                new_size++; \
                n++; \
                if (n >= nb_samples) \
                    break; \
            } \
        } else { \
            while (src[n] < zero) { \
                new_max_peak = FFMAX(new_max_peak, -src[n]); \
                new_rms_sum += src[n] * src[n]; \
                new_size++; \
                n++; \
                if (n >= nb_samples) \
                    break; \
            } \
        } \
        \
        pi[pi_end].max_peak = new_max_peak; \
        pi[pi_end].rms_sum = new_rms_sum; \
        pi[pi_end].size = new_size; \
    } \
    cc->pi_end = pi_end; \
}

ANALYZE_CHANNEL(dbl, double, 0.0, MIN_PEAK)
ANALYZE_CHANNEL(flt, float, 0.f, (float)MIN_PEAK)
314 |
|
|
|
/*
 * Generate filter_channels_<name>(): apply the gain to every channel
 * independently.
 *
 * For each channel the frame is processed in segments that never cross a
 * period boundary: next_pi() loads a new period when the current one is
 * used up, the segment is consumed from it, and the samples are multiplied
 * by the channel's current gain. Channels that are not part of s->ch_layout
 * are passed to next_pi() as bypassed.
 *
 * Period bookkeeping always advances, but while the filter is disabled
 * (ctx->is_disabled) no sample is written to out.
 */
#define FILTER_CHANNELS(name, ptype) \
static void filter_channels_## name (AVFilterContext *ctx, \
                                     AVFrame *in, AVFrame *out, int nb_samples) \
{ \
    SpeechNormalizerContext *s = ctx->priv; \
    AVFilterLink *inlink = ctx->inputs[0]; \
    \
    for (int ch = 0; ch < inlink->ch_layout.nb_channels; ch++) { \
        ChannelContext *cc = &s->cc[ch]; \
        const ptype *src = (const ptype *)in->extended_data[ch]; \
        ptype *dst = (ptype *)out->extended_data[ch]; \
        enum AVChannel channel = av_channel_layout_channel_from_index(&inlink->ch_layout, ch); \
        const int bypass = av_channel_layout_index_from_channel(&s->ch_layout, channel) < 0; \
        int n = 0; \
        \
        while (n < nb_samples) { \
            ptype gain; \
            int size; \
            \
            next_pi(ctx, cc, bypass); \
            size = FFMIN(nb_samples - n, cc->pi_size); \
            av_assert1(size > 0); \
            gain = cc->gain_state; \
            consume_pi(cc, size); \
            for (int i = n; !ctx->is_disabled && i < n + size; i++) \
                dst[i] = src[i] * gain; \
            n += size; \
        } \
    } \
}

FILTER_CHANNELS(dbl, double)
FILTER_CHANNELS(flt, float)
348 |
|
|
|
/**
 * Linear interpolation in double precision.
 *
 * @return min for mix == 0, max for mix == 1, proportionally in between
 */
static double dlerp(double min, double max, double mix)
{
    const double span = max - min;

    return min + span * mix;
}
353 |
|
|
|
/**
 * Linear interpolation in single precision.
 *
 * @return min for mix == 0, max for mix == 1, proportionally in between
 */
static float flerp(float min, float max, float mix)
{
    const float span = max - min;

    return min + span * mix;
}
358 |
|
|
|
/*
 * Generate filter_link_channels_<name>(): apply one common gain to all
 * filtered channels.
 *
 * The frame is processed in segments. For each segment:
 *  1. every channel loads its next period if needed and cc->bypass is
 *     refreshed from s->ch_layout; the segment length is the smallest
 *     remaining period size over all channels;
 *  2. the segment gain is the minimum of min_gain() over the non-bypassed
 *     channels, starting from max_expansion;
 *  3. every channel consumes the segment; non-bypassed channels are scaled
 *     with a gain interpolated (tlerp) from s->prev_gain to the segment gain.
 *
 * Bypassed channels are never written to out, and while the filter is
 * disabled (ctx->is_disabled) no channel is written at all.
 */
#define FILTER_LINK_CHANNELS(name, ptype, tlerp) \
static void filter_link_channels_## name (AVFilterContext *ctx, \
                                          AVFrame *in, AVFrame *out, \
                                          int nb_samples) \
{ \
    SpeechNormalizerContext *s = ctx->priv; \
    AVFilterLink *inlink = ctx->inputs[0]; \
    int n = 0; \
    \
    while (n < nb_samples) { \
        int min_size = nb_samples - n; \
        ptype gain = s->max_expansion; \
        \
        for (int ch = 0; ch < inlink->ch_layout.nb_channels; ch++) { \
            ChannelContext *cc = &s->cc[ch]; \
            \
            enum AVChannel channel = av_channel_layout_channel_from_index(&inlink->ch_layout, ch); \
            cc->bypass = av_channel_layout_index_from_channel(&s->ch_layout, channel) < 0; \
            \
            next_pi(ctx, cc, cc->bypass); \
            min_size = FFMIN(min_size, cc->pi_size); \
        } \
        \
        av_assert1(min_size > 0); \
        for (int ch = 0; ch < inlink->ch_layout.nb_channels; ch++) { \
            ChannelContext *cc = &s->cc[ch]; \
            \
            if (cc->bypass) \
                continue; \
            gain = FFMIN(gain, min_gain(ctx, cc, min_size)); \
        } \
        \
        for (int ch = 0; ch < inlink->ch_layout.nb_channels; ch++) { \
            ChannelContext *cc = &s->cc[ch]; \
            const ptype *src = (const ptype *)in->extended_data[ch]; \
            ptype *dst = (ptype *)out->extended_data[ch]; \
            \
            consume_pi(cc, min_size); \
            if (cc->bypass) \
                continue; \
            \
            for (int i = n; !ctx->is_disabled && i < n + min_size; i++) { \
                ptype g = tlerp(s->prev_gain, gain, (i - n) / (ptype)min_size); \
                dst[i] = src[i] * g; \
            } \
        } \
        \
        s->prev_gain = gain; \
        n += min_size; \
    } \
}

FILTER_LINK_CHANNELS(dbl, double, dlerp)
FILTER_LINK_CHANNELS(flt, float, flerp)
413 |
|
|
|
414 |
|
✗ |
static int filter_frame(AVFilterContext *ctx) |
415 |
|
|
{ |
416 |
|
✗ |
SpeechNormalizerContext *s = ctx->priv; |
417 |
|
✗ |
AVFilterLink *outlink = ctx->outputs[0]; |
418 |
|
✗ |
AVFilterLink *inlink = ctx->inputs[0]; |
419 |
|
|
int ret; |
420 |
|
|
|
421 |
|
✗ |
while (s->queue.available > 0) { |
422 |
|
|
int min_pi_nb_samples; |
423 |
|
|
AVFrame *in, *out; |
424 |
|
|
|
425 |
|
✗ |
in = ff_bufqueue_peek(&s->queue, 0); |
426 |
|
✗ |
if (!in) |
427 |
|
✗ |
break; |
428 |
|
|
|
429 |
|
✗ |
min_pi_nb_samples = available_samples(ctx); |
430 |
|
✗ |
if (min_pi_nb_samples < in->nb_samples && !s->eof) |
431 |
|
✗ |
break; |
432 |
|
|
|
433 |
|
✗ |
in = ff_bufqueue_get(&s->queue); |
434 |
|
|
|
435 |
|
✗ |
if (av_frame_is_writable(in)) { |
436 |
|
✗ |
out = in; |
437 |
|
|
} else { |
438 |
|
✗ |
out = ff_get_audio_buffer(outlink, in->nb_samples); |
439 |
|
✗ |
if (!out) { |
440 |
|
✗ |
av_frame_free(&in); |
441 |
|
✗ |
return AVERROR(ENOMEM); |
442 |
|
|
} |
443 |
|
✗ |
av_frame_copy_props(out, in); |
444 |
|
|
} |
445 |
|
|
|
446 |
|
✗ |
s->filter_channels[s->link](ctx, in, out, in->nb_samples); |
447 |
|
|
|
448 |
|
✗ |
s->pts = in->pts + av_rescale_q(in->nb_samples, av_make_q(1, outlink->sample_rate), |
449 |
|
|
outlink->time_base); |
450 |
|
|
|
451 |
|
✗ |
if (out != in) |
452 |
|
✗ |
av_frame_free(&in); |
453 |
|
✗ |
return ff_filter_frame(outlink, out); |
454 |
|
|
} |
455 |
|
|
|
456 |
|
✗ |
for (int f = 0; f < ff_inlink_queued_frames(inlink); f++) { |
457 |
|
|
AVFrame *in; |
458 |
|
|
|
459 |
|
✗ |
ret = ff_inlink_consume_frame(inlink, &in); |
460 |
|
✗ |
if (ret < 0) |
461 |
|
✗ |
return ret; |
462 |
|
✗ |
if (ret == 0) |
463 |
|
✗ |
break; |
464 |
|
|
|
465 |
|
✗ |
ff_bufqueue_add(ctx, &s->queue, in); |
466 |
|
|
|
467 |
|
✗ |
for (int ch = 0; ch < inlink->ch_layout.nb_channels; ch++) { |
468 |
|
✗ |
ChannelContext *cc = &s->cc[ch]; |
469 |
|
|
|
470 |
|
✗ |
s->analyze_channel(ctx, cc, in->extended_data[ch], in->nb_samples); |
471 |
|
|
} |
472 |
|
|
} |
473 |
|
|
|
474 |
|
✗ |
return 1; |
475 |
|
|
} |
476 |
|
|
|
477 |
|
✗ |
static int activate(AVFilterContext *ctx) |
478 |
|
|
{ |
479 |
|
✗ |
AVFilterLink *inlink = ctx->inputs[0]; |
480 |
|
✗ |
AVFilterLink *outlink = ctx->outputs[0]; |
481 |
|
✗ |
SpeechNormalizerContext *s = ctx->priv; |
482 |
|
|
int ret, status; |
483 |
|
|
int64_t pts; |
484 |
|
|
|
485 |
|
✗ |
ret = av_channel_layout_copy(&s->ch_layout, &inlink->ch_layout); |
486 |
|
✗ |
if (ret < 0) |
487 |
|
✗ |
return ret; |
488 |
|
✗ |
if (strcmp(s->ch_layout_str, "all")) |
489 |
|
✗ |
av_channel_layout_from_string(&s->ch_layout, |
490 |
|
✗ |
s->ch_layout_str); |
491 |
|
|
|
492 |
|
✗ |
FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink); |
493 |
|
|
|
494 |
|
✗ |
ret = filter_frame(ctx); |
495 |
|
✗ |
if (ret <= 0) |
496 |
|
✗ |
return ret; |
497 |
|
|
|
498 |
|
✗ |
if (!s->eof && ff_inlink_acknowledge_status(inlink, &status, &pts)) { |
499 |
|
✗ |
if (status == AVERROR_EOF) |
500 |
|
✗ |
s->eof = 1; |
501 |
|
|
} |
502 |
|
|
|
503 |
|
✗ |
if (s->eof && ff_inlink_queued_samples(inlink) == 0 && |
504 |
|
✗ |
s->queue.available == 0) { |
505 |
|
✗ |
ff_outlink_set_status(outlink, AVERROR_EOF, s->pts); |
506 |
|
✗ |
return 0; |
507 |
|
|
} |
508 |
|
|
|
509 |
|
✗ |
if (s->queue.available > 0) { |
510 |
|
✗ |
AVFrame *in = ff_bufqueue_peek(&s->queue, 0); |
511 |
|
✗ |
const int nb_samples = available_samples(ctx); |
512 |
|
|
|
513 |
|
✗ |
if (nb_samples >= in->nb_samples || s->eof) { |
514 |
|
✗ |
ff_filter_set_ready(ctx, 10); |
515 |
|
✗ |
return 0; |
516 |
|
|
} |
517 |
|
|
} |
518 |
|
|
|
519 |
|
✗ |
FF_FILTER_FORWARD_WANTED(outlink, inlink); |
520 |
|
|
|
521 |
|
✗ |
return FFERROR_NOT_READY; |
522 |
|
|
} |
523 |
|
|
|
524 |
|
✗ |
static int config_input(AVFilterLink *inlink) |
525 |
|
|
{ |
526 |
|
✗ |
AVFilterContext *ctx = inlink->dst; |
527 |
|
✗ |
SpeechNormalizerContext *s = ctx->priv; |
528 |
|
|
|
529 |
|
✗ |
s->max_period = inlink->sample_rate / 10; |
530 |
|
|
|
531 |
|
✗ |
s->prev_gain = 1.; |
532 |
|
✗ |
s->cc = av_calloc(inlink->ch_layout.nb_channels, sizeof(*s->cc)); |
533 |
|
✗ |
if (!s->cc) |
534 |
|
✗ |
return AVERROR(ENOMEM); |
535 |
|
|
|
536 |
|
✗ |
for (int ch = 0; ch < inlink->ch_layout.nb_channels; ch++) { |
537 |
|
✗ |
ChannelContext *cc = &s->cc[ch]; |
538 |
|
|
|
539 |
|
✗ |
cc->state = -1; |
540 |
|
✗ |
cc->gain_state = s->max_expansion; |
541 |
|
|
} |
542 |
|
|
|
543 |
|
✗ |
switch (inlink->format) { |
544 |
|
✗ |
case AV_SAMPLE_FMT_FLTP: |
545 |
|
✗ |
s->analyze_channel = analyze_channel_flt; |
546 |
|
✗ |
s->filter_channels[0] = filter_channels_flt; |
547 |
|
✗ |
s->filter_channels[1] = filter_link_channels_flt; |
548 |
|
✗ |
break; |
549 |
|
✗ |
case AV_SAMPLE_FMT_DBLP: |
550 |
|
✗ |
s->analyze_channel = analyze_channel_dbl; |
551 |
|
✗ |
s->filter_channels[0] = filter_channels_dbl; |
552 |
|
✗ |
s->filter_channels[1] = filter_link_channels_dbl; |
553 |
|
✗ |
break; |
554 |
|
✗ |
default: |
555 |
|
|
av_assert1(0); |
556 |
|
|
} |
557 |
|
|
|
558 |
|
✗ |
return 0; |
559 |
|
|
} |
560 |
|
|
|
561 |
|
✗ |
static int process_command(AVFilterContext *ctx, const char *cmd, const char *args, |
562 |
|
|
char *res, int res_len, int flags) |
563 |
|
|
{ |
564 |
|
✗ |
SpeechNormalizerContext *s = ctx->priv; |
565 |
|
✗ |
int link = s->link; |
566 |
|
|
int ret; |
567 |
|
|
|
568 |
|
✗ |
ret = ff_filter_process_command(ctx, cmd, args, res, res_len, flags); |
569 |
|
✗ |
if (ret < 0) |
570 |
|
✗ |
return ret; |
571 |
|
✗ |
if (link != s->link) |
572 |
|
✗ |
s->prev_gain = 1.; |
573 |
|
|
|
574 |
|
✗ |
return 0; |
575 |
|
|
} |
576 |
|
|
|
577 |
|
✗ |
static av_cold void uninit(AVFilterContext *ctx) |
578 |
|
|
{ |
579 |
|
✗ |
SpeechNormalizerContext *s = ctx->priv; |
580 |
|
|
|
581 |
|
✗ |
ff_bufqueue_discard_all(&s->queue); |
582 |
|
✗ |
av_channel_layout_uninit(&s->ch_layout); |
583 |
|
✗ |
av_freep(&s->cc); |
584 |
|
✗ |
} |
585 |
|
|
|
/* Single audio input pad; config_input() runs when the link is configured. */
static const AVFilterPad inputs[] = {
    {
        .name = "default",
        .type = AVMEDIA_TYPE_AUDIO,
        .config_props = config_input,
    },
};
593 |
|
|
|
/*
 * Filter definition: activate()-driven, planar float and planar double
 * samples only, with timeline support handled by the filter itself
 * (AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL).
 */
const AVFilter ff_af_speechnorm = {
    .name = "speechnorm",
    .description = NULL_IF_CONFIG_SMALL("Speech Normalizer."),
    .priv_size = sizeof(SpeechNormalizerContext),
    .priv_class = &speechnorm_class,
    .activate = activate,
    .uninit = uninit,
    FILTER_INPUTS(inputs),
    FILTER_OUTPUTS(ff_audio_default_filterpad),
    FILTER_SAMPLEFMTS(AV_SAMPLE_FMT_FLTP, AV_SAMPLE_FMT_DBLP),
    .flags = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL,
    .process_command = process_command,
};
607 |
|
|
|