Line |
Branch |
Exec |
Source |
1 |
|
|
/* |
2 |
|
|
* Copyright (c) 2020 Paul B Mahol |
3 |
|
|
* |
4 |
|
|
* Speech Normalizer |
5 |
|
|
* |
6 |
|
|
* This file is part of FFmpeg. |
7 |
|
|
* |
8 |
|
|
* FFmpeg is free software; you can redistribute it and/or |
9 |
|
|
* modify it under the terms of the GNU Lesser General Public |
10 |
|
|
* License as published by the Free Software Foundation; either |
11 |
|
|
* version 2.1 of the License, or (at your option) any later version. |
12 |
|
|
* |
13 |
|
|
* FFmpeg is distributed in the hope that it will be useful, |
14 |
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 |
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
16 |
|
|
* Lesser General Public License for more details. |
17 |
|
|
* |
18 |
|
|
* You should have received a copy of the GNU Lesser General Public |
19 |
|
|
* License along with FFmpeg; if not, write to the Free Software |
20 |
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
21 |
|
|
*/ |
22 |
|
|
|
23 |
|
|
/** |
24 |
|
|
* @file |
25 |
|
|
* Speech Normalizer |
26 |
|
|
*/ |
27 |
|
|
|
28 |
|
|
#include <float.h> |
29 |
|
|
|
30 |
|
|
#include "libavutil/avassert.h" |
31 |
|
|
#include "libavutil/channel_layout.h" |
32 |
|
|
#include "libavutil/mem.h" |
33 |
|
|
#include "libavutil/opt.h" |
34 |
|
|
|
35 |
|
|
#define FF_BUFQUEUE_SIZE (1024) |
36 |
|
|
#include "bufferqueue.h" |
37 |
|
|
|
38 |
|
|
#include "audio.h" |
39 |
|
|
#include "avfilter.h" |
40 |
|
|
#include "filters.h" |
41 |
|
|
|
42 |
|
|
/* Number of slots in each channel's ring of period items. */
#define MAX_ITEMS 882000
/* Peak level an item must reach before a sign change is allowed to close it. */
#define MIN_PEAK (1.0 / 32768.0)
44 |
|
|
|
45 |
|
|
/**
 * Statistics for one run of consecutive samples, as segmented by the
 * per-channel analyzer.
 */
typedef struct PeriodItem {
    int    size;     /* number of samples accumulated in this item */
    int    type;     /* 0 while the item is still being filled, 1 once it is closed */
    double max_peak; /* largest absolute sample value seen in this item */
    double rms_sum;  /* running sum of the squared sample values */
} PeriodItem;
51 |
|
|
|
52 |
|
|
typedef struct ChannelContext { |
53 |
|
|
int state; |
54 |
|
|
int bypass; |
55 |
|
|
PeriodItem pi[MAX_ITEMS]; |
56 |
|
|
double gain_state; |
57 |
|
|
double pi_max_peak; |
58 |
|
|
double pi_rms_sum; |
59 |
|
|
int pi_start; |
60 |
|
|
int pi_end; |
61 |
|
|
int pi_size; |
62 |
|
|
} ChannelContext; |
63 |
|
|
|
64 |
|
|
typedef struct SpeechNormalizerContext { |
65 |
|
|
const AVClass *class; |
66 |
|
|
|
67 |
|
|
double rms_value; |
68 |
|
|
double peak_value; |
69 |
|
|
double max_expansion; |
70 |
|
|
double max_compression; |
71 |
|
|
double threshold_value; |
72 |
|
|
double raise_amount; |
73 |
|
|
double fall_amount; |
74 |
|
|
char *ch_layout_str; |
75 |
|
|
AVChannelLayout ch_layout; |
76 |
|
|
int invert; |
77 |
|
|
int link; |
78 |
|
|
|
79 |
|
|
ChannelContext *cc; |
80 |
|
|
double prev_gain; |
81 |
|
|
|
82 |
|
|
int max_period; |
83 |
|
|
int eof; |
84 |
|
|
int64_t pts; |
85 |
|
|
|
86 |
|
|
struct FFBufQueue queue; |
87 |
|
|
|
88 |
|
|
void (*analyze_channel)(AVFilterContext *ctx, ChannelContext *cc, |
89 |
|
|
const uint8_t *srcp, int nb_samples); |
90 |
|
|
void (*filter_channels[2])(AVFilterContext *ctx, |
91 |
|
|
AVFrame *in, AVFrame *out, int nb_samples); |
92 |
|
|
} SpeechNormalizerContext; |
93 |
|
|
|
94 |
|
|
#define OFFSET(x) offsetof(SpeechNormalizerContext, x) |
95 |
|
|
#define FLAGS AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_RUNTIME_PARAM |
96 |
|
|
|
97 |
|
|
static const AVOption speechnorm_options[] = { |
98 |
|
|
{ "peak", "set the peak value", OFFSET(peak_value), AV_OPT_TYPE_DOUBLE, {.dbl=0.95}, 0.0, 1.0, FLAGS }, |
99 |
|
|
{ "p", "set the peak value", OFFSET(peak_value), AV_OPT_TYPE_DOUBLE, {.dbl=0.95}, 0.0, 1.0, FLAGS }, |
100 |
|
|
{ "expansion", "set the max expansion factor", OFFSET(max_expansion), AV_OPT_TYPE_DOUBLE, {.dbl=2.0}, 1.0, 50.0, FLAGS }, |
101 |
|
|
{ "e", "set the max expansion factor", OFFSET(max_expansion), AV_OPT_TYPE_DOUBLE, {.dbl=2.0}, 1.0, 50.0, FLAGS }, |
102 |
|
|
{ "compression", "set the max compression factor", OFFSET(max_compression), AV_OPT_TYPE_DOUBLE, {.dbl=2.0}, 1.0, 50.0, FLAGS }, |
103 |
|
|
{ "c", "set the max compression factor", OFFSET(max_compression), AV_OPT_TYPE_DOUBLE, {.dbl=2.0}, 1.0, 50.0, FLAGS }, |
104 |
|
|
{ "threshold", "set the threshold value", OFFSET(threshold_value), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0.0, 1.0, FLAGS }, |
105 |
|
|
{ "t", "set the threshold value", OFFSET(threshold_value), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0.0, 1.0, FLAGS }, |
106 |
|
|
{ "raise", "set the expansion raising amount", OFFSET(raise_amount), AV_OPT_TYPE_DOUBLE, {.dbl=0.001}, 0.0, 1.0, FLAGS }, |
107 |
|
|
{ "r", "set the expansion raising amount", OFFSET(raise_amount), AV_OPT_TYPE_DOUBLE, {.dbl=0.001}, 0.0, 1.0, FLAGS }, |
108 |
|
|
{ "fall", "set the compression raising amount", OFFSET(fall_amount), AV_OPT_TYPE_DOUBLE, {.dbl=0.001}, 0.0, 1.0, FLAGS }, |
109 |
|
|
{ "f", "set the compression raising amount", OFFSET(fall_amount), AV_OPT_TYPE_DOUBLE, {.dbl=0.001}, 0.0, 1.0, FLAGS }, |
110 |
|
|
{ "channels", "set channels to filter", OFFSET(ch_layout_str), AV_OPT_TYPE_STRING, {.str="all"}, 0, 0, FLAGS }, |
111 |
|
|
{ "h", "set channels to filter", OFFSET(ch_layout_str), AV_OPT_TYPE_STRING, {.str="all"}, 0, 0, FLAGS }, |
112 |
|
|
{ "invert", "set inverted filtering", OFFSET(invert), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS }, |
113 |
|
|
{ "i", "set inverted filtering", OFFSET(invert), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS }, |
114 |
|
|
{ "link", "set linked channels filtering", OFFSET(link), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS }, |
115 |
|
|
{ "l", "set linked channels filtering", OFFSET(link), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS }, |
116 |
|
|
{ "rms", "set the RMS value", OFFSET(rms_value), AV_OPT_TYPE_DOUBLE, {.dbl=0.0}, 0.0, 1.0, FLAGS }, |
117 |
|
|
{ "m", "set the RMS value", OFFSET(rms_value), AV_OPT_TYPE_DOUBLE, {.dbl=0.0}, 0.0, 1.0, FLAGS }, |
118 |
|
|
{ NULL } |
119 |
|
|
}; |
120 |
|
|
|
121 |
|
|
AVFILTER_DEFINE_CLASS(speechnorm); |
122 |
|
|
|
123 |
|
✗ |
/**
 * Count the samples a channel can currently hand out.
 *
 * @param pi     ring of period items of the channel
 * @param start  index of the next item to consume
 * @param end    index of the item the analyzer is filling
 * @param remain samples left in the item currently being consumed
 * @return remain alone when pi[start] is still open (type 0); otherwise
 *         remain plus the sizes of the items following start, stopping at
 *         the first open item or once the index reaches end
 */
static int get_pi_samples(PeriodItem *pi, int start, int end, int remain)
{
    int total = remain;
    int idx   = start;

    if (!pi[idx].type)
        return total;

    for (;;) {
        if (idx == end)
            break;
        /* step to the following slot, wrapping around the ring */
        if (++idx >= MAX_ITEMS)
            idx = 0;
        if (!pi[idx].type)
            break;
        av_assert1(pi[idx].size > 0);
        total += pi[idx].size;
    }

    return total;
}
143 |
|
|
|
144 |
|
✗ |
/**
 * Return the smallest get_pi_samples() result over all input channels.
 * The scan stops early as soon as the running minimum is no longer positive.
 */
static int available_samples(AVFilterContext *ctx)
{
    SpeechNormalizerContext *s = ctx->priv;
    AVFilterLink *inlink = ctx->inputs[0];
    ChannelContext *cc = &s->cc[0];
    int lowest = get_pi_samples(cc->pi, cc->pi_start, cc->pi_end, cc->pi_size);

    for (int ch = 1; ch < inlink->ch_layout.nb_channels; ch++) {
        int have;

        if (lowest <= 0)
            break;

        cc   = &s->cc[ch];
        have = get_pi_samples(cc->pi, cc->pi_start, cc->pi_end, cc->pi_size);
        if (have < lowest)
            lowest = have;
    }

    return lowest;
}
159 |
|
|
|
160 |
|
✗ |
/**
 * Mark nb_samples of the item currently being consumed as used.
 * Asking for more than the item still holds is a caller error; it only
 * trips an assertion and leaves pi_size untouched.
 */
static void consume_pi(ChannelContext *cc, int nb_samples)
{
    if (cc->pi_size < nb_samples) {
        av_assert1(0);
        return;
    }

    cc->pi_size -= nb_samples;
}
168 |
|
|
|
169 |
|
✗ |
/**
 * Compute the gain for the next period item.
 *
 * @param pi_max_peak peak of the item
 * @param bypass      non-zero to force unity gain
 * @param state       gain used for the previous item
 * @param pi_rms_sum  sum of squared samples of the item
 * @param pi_size     number of samples in the item
 * @return 1.0 when bypassed; otherwise the previous gain moved up by
 *         raise_amount (item selected by the threshold test) or down by
 *         fall_amount but not below 1 / max_compression, and in both cases
 *         capped by the expansion limit derived from the peak and, when
 *         enabled, the RMS target
 */
static double next_gain(AVFilterContext *ctx, double pi_max_peak, int bypass, double state,
                        double pi_rms_sum, int pi_size)
{
    SpeechNormalizerContext *s = ctx->priv;
    double limit, candidate;
    int selected;

    if (bypass)
        return 1.;

    /* largest gain allowed for this item */
    limit = FFMIN(s->max_expansion, s->peak_value / pi_max_peak);
    if (s->rms_value > DBL_EPSILON)
        limit = FFMIN(limit, s->rms_value / sqrt(pi_rms_sum / pi_size));

    /* the invert option flips which side of the threshold raises the gain */
    if (s->invert)
        selected = pi_max_peak <= s->threshold_value;
    else
        selected = pi_max_peak >= s->threshold_value;

    if (selected) {
        candidate = state + s->raise_amount;
    } else {
        const double lowest_gain = 1. / s->max_compression;

        candidate = FFMAX(lowest_gain, state - s->fall_amount);
    }

    return FFMIN(limit, candidate);
}
188 |
|
|
|
189 |
|
✗ |
/**
 * When the item currently being consumed is used up, load the next one:
 * copy its size and statistics into the channel context, advance pi_start
 * (wrapping at MAX_ITEMS) and update gain_state for it.
 * Does nothing while samples of the current item remain.
 */
static void next_pi(AVFilterContext *ctx, ChannelContext *cc, int bypass)
{
    SpeechNormalizerContext *s = ctx->priv;
    const PeriodItem *item;
    int following;

    av_assert1(cc->pi_size >= 0);
    if (cc->pi_size != 0)
        return;

    item = &cc->pi[cc->pi_start];
    av_assert1(item->size > 0);
    /* an item that is still open may only be consumed once EOF was reached */
    av_assert0(item->type > 0 || s->eof);
    av_assert1(cc->pi_start != cc->pi_end || s->eof);

    cc->pi_size     = item->size;
    cc->pi_rms_sum  = item->rms_sum;
    cc->pi_max_peak = item->max_peak;

    following = cc->pi_start + 1;
    if (following >= MAX_ITEMS)
        following = 0;
    cc->pi_start = following;

    cc->gain_state = next_gain(ctx, cc->pi_max_peak, bypass, cc->gain_state,
                               cc->pi_rms_sum, cc->pi_size);
}
210 |
|
|
|
211 |
|
✗ |
/**
 * Look ahead over the queued items of a channel and return the smallest gain
 * among max_expansion, the current gain and the gains next_gain() yields for
 * the following items. The walk continues while the samples covered so far do
 * not exceed max_size and stops when it reaches pi_end. Channel state is not
 * modified.
 */
static double min_gain(AVFilterContext *ctx, ChannelContext *cc, int max_size)
{
    SpeechNormalizerContext *s = ctx->priv;
    double gain   = cc->gain_state;
    double lowest = FFMIN(s->max_expansion, gain);
    int covered   = cc->pi_size;

    for (int idx = cc->pi_start; covered <= max_size && idx != cc->pi_end; ) {
        const PeriodItem *item = &cc->pi[idx];

        gain     = next_gain(ctx, item->max_peak, 0, gain,
                             item->rms_sum, item->size);
        lowest   = FFMIN(lowest, gain);
        covered += item->size;

        if (++idx >= MAX_ITEMS)
            idx = 0;
    }

    return lowest;
}
234 |
|
|
|
235 |
|
|
#define ANALYZE_CHANNEL(name, ptype, zero, min_peak) \ |
236 |
|
|
static void analyze_channel_## name (AVFilterContext *ctx, ChannelContext *cc, \ |
237 |
|
|
const uint8_t *srcp, int nb_samples) \ |
238 |
|
|
{ \ |
239 |
|
|
SpeechNormalizerContext *s = ctx->priv; \ |
240 |
|
|
const ptype *src = (const ptype *)srcp; \ |
241 |
|
|
const int max_period = s->max_period; \ |
242 |
|
|
PeriodItem *pi = (PeriodItem *)&cc->pi; \ |
243 |
|
|
int pi_end = cc->pi_end; \ |
244 |
|
|
int n = 0; \ |
245 |
|
|
\ |
246 |
|
|
if (cc->state < 0) \ |
247 |
|
|
cc->state = src[0] >= zero; \ |
248 |
|
|
\ |
249 |
|
|
while (n < nb_samples) { \ |
250 |
|
|
ptype new_max_peak; \ |
251 |
|
|
ptype new_rms_sum; \ |
252 |
|
|
int new_size; \ |
253 |
|
|
\ |
254 |
|
|
if ((cc->state != (src[n] >= zero)) || \ |
255 |
|
|
(pi[pi_end].size > max_period)) { \ |
256 |
|
|
ptype max_peak = pi[pi_end].max_peak; \ |
257 |
|
|
ptype rms_sum = pi[pi_end].rms_sum; \ |
258 |
|
|
int state = cc->state; \ |
259 |
|
|
\ |
260 |
|
|
cc->state = src[n] >= zero; \ |
261 |
|
|
av_assert1(pi[pi_end].size > 0); \ |
262 |
|
|
if (max_peak >= min_peak || \ |
263 |
|
|
pi[pi_end].size > max_period) { \ |
264 |
|
|
pi[pi_end].type = 1; \ |
265 |
|
|
pi_end++; \ |
266 |
|
|
if (pi_end >= MAX_ITEMS) \ |
267 |
|
|
pi_end = 0; \ |
268 |
|
|
if (cc->state != state) { \ |
269 |
|
|
pi[pi_end].max_peak = DBL_MIN; \ |
270 |
|
|
pi[pi_end].rms_sum = 0.0; \ |
271 |
|
|
} else { \ |
272 |
|
|
pi[pi_end].max_peak = max_peak; \ |
273 |
|
|
pi[pi_end].rms_sum = rms_sum; \ |
274 |
|
|
} \ |
275 |
|
|
pi[pi_end].type = 0; \ |
276 |
|
|
pi[pi_end].size = 0; \ |
277 |
|
|
av_assert1(pi_end != cc->pi_start); \ |
278 |
|
|
} \ |
279 |
|
|
} \ |
280 |
|
|
\ |
281 |
|
|
new_max_peak = pi[pi_end].max_peak; \ |
282 |
|
|
new_rms_sum = pi[pi_end].rms_sum; \ |
283 |
|
|
new_size = pi[pi_end].size; \ |
284 |
|
|
if (cc->state) { \ |
285 |
|
|
while (src[n] >= zero) { \ |
286 |
|
|
new_max_peak = FFMAX(new_max_peak, src[n]); \ |
287 |
|
|
new_rms_sum += src[n] * src[n]; \ |
288 |
|
|
new_size++; \ |
289 |
|
|
n++; \ |
290 |
|
|
if (n >= nb_samples) \ |
291 |
|
|
break; \ |
292 |
|
|
} \ |
293 |
|
|
} else { \ |
294 |
|
|
while (src[n] < zero) { \ |
295 |
|
|
new_max_peak = FFMAX(new_max_peak, -src[n]); \ |
296 |
|
|
new_rms_sum += src[n] * src[n]; \ |
297 |
|
|
new_size++; \ |
298 |
|
|
n++; \ |
299 |
|
|
if (n >= nb_samples) \ |
300 |
|
|
break; \ |
301 |
|
|
} \ |
302 |
|
|
} \ |
303 |
|
|
\ |
304 |
|
|
pi[pi_end].max_peak = new_max_peak; \ |
305 |
|
|
pi[pi_end].rms_sum = new_rms_sum; \ |
306 |
|
|
pi[pi_end].size = new_size; \ |
307 |
|
|
} \ |
308 |
|
|
cc->pi_end = pi_end; \ |
309 |
|
|
} |
310 |
|
|
|
311 |
|
✗ |
ANALYZE_CHANNEL(dbl, double, 0.0, MIN_PEAK) |
312 |
|
✗ |
ANALYZE_CHANNEL(flt, float, 0.f, (float)MIN_PEAK) |
313 |
|
|
|
314 |
|
|
#define FILTER_CHANNELS(name, ptype) \ |
315 |
|
|
static void filter_channels_## name (AVFilterContext *ctx, \ |
316 |
|
|
AVFrame *in, AVFrame *out, int nb_samples) \ |
317 |
|
|
{ \ |
318 |
|
|
SpeechNormalizerContext *s = ctx->priv; \ |
319 |
|
|
AVFilterLink *inlink = ctx->inputs[0]; \ |
320 |
|
|
\ |
321 |
|
|
for (int ch = 0; ch < inlink->ch_layout.nb_channels; ch++) { \ |
322 |
|
|
ChannelContext *cc = &s->cc[ch]; \ |
323 |
|
|
const ptype *src = (const ptype *)in->extended_data[ch]; \ |
324 |
|
|
ptype *dst = (ptype *)out->extended_data[ch]; \ |
325 |
|
|
enum AVChannel channel = av_channel_layout_channel_from_index(&inlink->ch_layout, ch); \ |
326 |
|
|
const int bypass = av_channel_layout_index_from_channel(&s->ch_layout, channel) < 0; \ |
327 |
|
|
int n = 0; \ |
328 |
|
|
\ |
329 |
|
|
while (n < nb_samples) { \ |
330 |
|
|
ptype gain; \ |
331 |
|
|
int size; \ |
332 |
|
|
\ |
333 |
|
|
next_pi(ctx, cc, bypass); \ |
334 |
|
|
size = FFMIN(nb_samples - n, cc->pi_size); \ |
335 |
|
|
av_assert1(size > 0); \ |
336 |
|
|
gain = cc->gain_state; \ |
337 |
|
|
consume_pi(cc, size); \ |
338 |
|
|
for (int i = n; !ctx->is_disabled && i < n + size; i++) \ |
339 |
|
|
dst[i] = src[i] * gain; \ |
340 |
|
|
n += size; \ |
341 |
|
|
} \ |
342 |
|
|
} \ |
343 |
|
|
} |
344 |
|
|
|
345 |
|
✗ |
FILTER_CHANNELS(dbl, double) |
346 |
|
✗ |
FILTER_CHANNELS(flt, float) |
347 |
|
|
|
348 |
|
✗ |
/** Linear interpolation in double precision: min for mix == 0, max for mix == 1. */
static double dlerp(double min, double max, double mix)
{
    const double span = max - min;

    return min + span * mix;
}
352 |
|
|
|
353 |
|
✗ |
/** Linear interpolation in single precision: min for mix == 0, max for mix == 1. */
static float flerp(float min, float max, float mix)
{
    const float span = max - min;

    return min + span * mix;
}
357 |
|
|
|
358 |
|
|
#define FILTER_LINK_CHANNELS(name, ptype, tlerp) \ |
359 |
|
|
static void filter_link_channels_## name (AVFilterContext *ctx, \ |
360 |
|
|
AVFrame *in, AVFrame *out, \ |
361 |
|
|
int nb_samples) \ |
362 |
|
|
{ \ |
363 |
|
|
SpeechNormalizerContext *s = ctx->priv; \ |
364 |
|
|
AVFilterLink *inlink = ctx->inputs[0]; \ |
365 |
|
|
int n = 0; \ |
366 |
|
|
\ |
367 |
|
|
while (n < nb_samples) { \ |
368 |
|
|
int min_size = nb_samples - n; \ |
369 |
|
|
ptype gain = s->max_expansion; \ |
370 |
|
|
\ |
371 |
|
|
for (int ch = 0; ch < inlink->ch_layout.nb_channels; ch++) { \ |
372 |
|
|
ChannelContext *cc = &s->cc[ch]; \ |
373 |
|
|
\ |
374 |
|
|
enum AVChannel channel = av_channel_layout_channel_from_index(&inlink->ch_layout, ch); \ |
375 |
|
|
cc->bypass = av_channel_layout_index_from_channel(&s->ch_layout, channel) < 0; \ |
376 |
|
|
\ |
377 |
|
|
next_pi(ctx, cc, cc->bypass); \ |
378 |
|
|
min_size = FFMIN(min_size, cc->pi_size); \ |
379 |
|
|
} \ |
380 |
|
|
\ |
381 |
|
|
av_assert1(min_size > 0); \ |
382 |
|
|
for (int ch = 0; ch < inlink->ch_layout.nb_channels; ch++) { \ |
383 |
|
|
ChannelContext *cc = &s->cc[ch]; \ |
384 |
|
|
\ |
385 |
|
|
if (cc->bypass) \ |
386 |
|
|
continue; \ |
387 |
|
|
gain = FFMIN(gain, min_gain(ctx, cc, min_size)); \ |
388 |
|
|
} \ |
389 |
|
|
\ |
390 |
|
|
for (int ch = 0; ch < inlink->ch_layout.nb_channels; ch++) { \ |
391 |
|
|
ChannelContext *cc = &s->cc[ch]; \ |
392 |
|
|
const ptype *src = (const ptype *)in->extended_data[ch]; \ |
393 |
|
|
ptype *dst = (ptype *)out->extended_data[ch]; \ |
394 |
|
|
\ |
395 |
|
|
consume_pi(cc, min_size); \ |
396 |
|
|
if (cc->bypass) \ |
397 |
|
|
continue; \ |
398 |
|
|
\ |
399 |
|
|
for (int i = n; !ctx->is_disabled && i < n + min_size; i++) { \ |
400 |
|
|
ptype g = tlerp(s->prev_gain, gain, (i - n) / (ptype)min_size); \ |
401 |
|
|
dst[i] = src[i] * g; \ |
402 |
|
|
} \ |
403 |
|
|
} \ |
404 |
|
|
\ |
405 |
|
|
s->prev_gain = gain; \ |
406 |
|
|
n += min_size; \ |
407 |
|
|
} \ |
408 |
|
|
} |
409 |
|
|
|
410 |
|
✗ |
FILTER_LINK_CHANNELS(dbl, double, dlerp) |
411 |
|
✗ |
FILTER_LINK_CHANNELS(flt, float, flerp) |
412 |
|
|
|
413 |
|
✗ |
/**
 * Filter at most one queued frame, or pull new frames from the input.
 *
 * If the head of the internal queue is covered by enough analyzed samples
 * (or EOF was reached), it is filtered and sent downstream and the result of
 * ff_filter_frame() is returned. Otherwise frames queued on the input link
 * are consumed, appended to the internal queue and analyzed per channel.
 *
 * @return the ff_filter_frame() result when a frame was output, a negative
 *         error code on failure, 1 when no frame was output
 */
static int filter_frame(AVFilterContext *ctx)
{
    SpeechNormalizerContext *s = ctx->priv;
    AVFilterLink *outlink = ctx->outputs[0];
    AVFilterLink *inlink = ctx->inputs[0];
    int ret;

    while (s->queue.available > 0) {
        int min_pi_nb_samples;
        AVFrame *in, *out;

        in = ff_bufqueue_peek(&s->queue, 0);
        if (!in)
            break;

        /* wait until the analysis covers the whole frame, unless at EOF */
        min_pi_nb_samples = available_samples(ctx);
        if (min_pi_nb_samples < in->nb_samples && !s->eof)
            break;

        in = ff_bufqueue_get(&s->queue);

        if (av_frame_is_writable(in)) {
            out = in;
        } else {
            out = ff_get_audio_buffer(outlink, in->nb_samples);
            if (!out) {
                av_frame_free(&in);
                return AVERROR(ENOMEM);
            }
            ret = av_frame_copy_props(out, in);
            if (ret < 0) {
                av_frame_free(&out);
                av_frame_free(&in);
                return ret;
            }
            /* The filter routines leave samples untouched while the filter
             * is disabled and, in linked mode, for bypassed channels. Start
             * from a copy of the input so those samples pass through instead
             * of carrying the contents of the fresh buffer. */
            av_samples_copy(out->extended_data, in->extended_data, 0, 0,
                            in->nb_samples, inlink->ch_layout.nb_channels,
                            in->format);
        }

        s->filter_channels[s->link](ctx, in, out, in->nb_samples);

        /* remember where this frame ends; used as the EOF timestamp */
        s->pts = in->pts + av_rescale_q(in->nb_samples, av_make_q(1, outlink->sample_rate),
                                        outlink->time_base);

        if (out != in)
            av_frame_free(&in);
        return ff_filter_frame(outlink, out);
    }

    for (int f = 0; f < ff_inlink_queued_frames(inlink); f++) {
        AVFrame *in;

        ret = ff_inlink_consume_frame(inlink, &in);
        if (ret < 0)
            return ret;
        if (ret == 0)
            break;

        ff_bufqueue_add(ctx, &s->queue, in);

        for (int ch = 0; ch < inlink->ch_layout.nb_channels; ch++) {
            ChannelContext *cc = &s->cc[ch];

            s->analyze_channel(ctx, cc, in->extended_data[ch], in->nb_samples);
        }
    }

    return 1;
}
475 |
|
|
|
476 |
|
✗ |
/**
 * Activation callback: refresh the layout of channels to filter, run
 * filter_frame(), track EOF on the input and schedule further work.
 */
static int activate(AVFilterContext *ctx)
{
    SpeechNormalizerContext *s = ctx->priv;
    AVFilterLink *outlink = ctx->outputs[0];
    AVFilterLink *inlink = ctx->inputs[0];
    int64_t pts;
    int status;
    int ret;

    /* start from the input layout; a "channels" value other than "all"
     * is then parsed on top of it */
    ret = av_channel_layout_copy(&s->ch_layout, &inlink->ch_layout);
    if (ret < 0)
        return ret;
    if (strcmp(s->ch_layout_str, "all") != 0)
        av_channel_layout_from_string(&s->ch_layout,
                                      s->ch_layout_str);

    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);

    ret = filter_frame(ctx);
    if (ret <= 0)
        return ret;

    if (!s->eof && ff_inlink_acknowledge_status(inlink, &status, &pts) &&
        status == AVERROR_EOF)
        s->eof = 1;

    /* everything has been drained: signal EOF downstream */
    if (s->eof && ff_inlink_queued_samples(inlink) == 0 &&
        s->queue.available == 0) {
        ff_outlink_set_status(outlink, AVERROR_EOF, s->pts);
        return 0;
    }

    if (s->queue.available > 0) {
        const AVFrame *head = ff_bufqueue_peek(&s->queue, 0);
        const int ready = available_samples(ctx);

        /* the head frame can be filtered right away: ask to be run again */
        if (s->eof || ready >= head->nb_samples) {
            ff_filter_set_ready(ctx, 10);
            return 0;
        }
    }

    FF_FILTER_FORWARD_WANTED(outlink, inlink);

    return FFERROR_NOT_READY;
}
522 |
|
|
|
523 |
|
✗ |
/**
 * Input configuration: derive max_period from the sample rate, allocate and
 * initialize the per-channel state and select the analyzer and filter
 * routines matching the sample format.
 *
 * @return 0 on success, AVERROR(ENOMEM) if the channel state cannot be allocated
 */
static int config_input(AVFilterLink *inlink)
{
    AVFilterContext *ctx = inlink->dst;
    SpeechNormalizerContext *s = ctx->priv;
    const int nb_channels = inlink->ch_layout.nb_channels;

    s->max_period = inlink->sample_rate / 10;
    s->prev_gain  = 1.;

    s->cc = av_calloc(nb_channels, sizeof(*s->cc));
    if (!s->cc)
        return AVERROR(ENOMEM);

    for (int ch = 0; ch < nb_channels; ch++) {
        s->cc[ch].state      = -1; /* sign state not known yet */
        s->cc[ch].gain_state = s->max_expansion;
    }

    if (inlink->format == AV_SAMPLE_FMT_FLTP) {
        s->analyze_channel    = analyze_channel_flt;
        s->filter_channels[0] = filter_channels_flt;
        s->filter_channels[1] = filter_link_channels_flt;
    } else if (inlink->format == AV_SAMPLE_FMT_DBLP) {
        s->analyze_channel    = analyze_channel_dbl;
        s->filter_channels[0] = filter_channels_dbl;
        s->filter_channels[1] = filter_link_channels_dbl;
    } else {
        av_assert1(0);
    }

    return 0;
}
559 |
|
|
|
560 |
|
✗ |
/**
 * Runtime command handler: apply the command through the generic option
 * handler and reset the interpolation start gain when "link" was toggled.
 *
 * @return 0 on success, the negative error code of the generic handler otherwise
 */
static int process_command(AVFilterContext *ctx, const char *cmd, const char *args,
                           char *res, int res_len, int flags)
{
    SpeechNormalizerContext *s = ctx->priv;
    const int prev_link = s->link;
    const int ret = ff_filter_process_command(ctx, cmd, args, res, res_len, flags);

    if (ret < 0)
        return ret;

    if (s->link != prev_link)
        s->prev_gain = 1.;

    return 0;
}
575 |
|
|
|
576 |
|
✗ |
/** Release the queued frames, the channel layout and the per-channel state. */
static av_cold void uninit(AVFilterContext *ctx)
{
    SpeechNormalizerContext *priv = ctx->priv;

    ff_bufqueue_discard_all(&priv->queue);
    av_channel_layout_uninit(&priv->ch_layout);
    av_freep(&priv->cc);
}
584 |
|
|
|
585 |
|
|
static const AVFilterPad inputs[] = { |
586 |
|
|
{ |
587 |
|
|
.name = "default", |
588 |
|
|
.type = AVMEDIA_TYPE_AUDIO, |
589 |
|
|
.config_props = config_input, |
590 |
|
|
}, |
591 |
|
|
}; |
592 |
|
|
|
593 |
|
|
const FFFilter ff_af_speechnorm = { |
594 |
|
|
.p.name = "speechnorm", |
595 |
|
|
.p.description = NULL_IF_CONFIG_SMALL("Speech Normalizer."), |
596 |
|
|
.p.priv_class = &speechnorm_class, |
597 |
|
|
.p.flags = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL, |
598 |
|
|
.priv_size = sizeof(SpeechNormalizerContext), |
599 |
|
|
.activate = activate, |
600 |
|
|
.uninit = uninit, |
601 |
|
|
FILTER_INPUTS(inputs), |
602 |
|
|
FILTER_OUTPUTS(ff_audio_default_filterpad), |
603 |
|
|
FILTER_SAMPLEFMTS(AV_SAMPLE_FMT_FLTP, AV_SAMPLE_FMT_DBLP), |
604 |
|
|
.process_command = process_command, |
605 |
|
|
}; |
606 |
|
|
|