GCC Code Coverage Report
Directory: ../../../ffmpeg/ Exec Total Coverage
File: src/libavfilter/af_speechnorm.c Lines: 0 181 0.0 %
Date: 2020-11-28 20:53:16 Branches: 0 241 0.0 %

Line Branch Exec Source
1
/*
2
 * Copyright (c) 2020 Paul B Mahol
3
 *
4
 * Speech Normalizer
5
 *
6
 * This file is part of FFmpeg.
7
 *
8
 * FFmpeg is free software; you can redistribute it and/or
9
 * modify it under the terms of the GNU Lesser General Public
10
 * License as published by the Free Software Foundation; either
11
 * version 2.1 of the License, or (at your option) any later version.
12
 *
13
 * FFmpeg is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
 * Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public
19
 * License along with FFmpeg; if not, write to the Free Software
20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
 */
22
23
/**
24
 * @file
25
 * Speech Normalizer
26
 */
27
28
#include <float.h>
29
30
#include "libavutil/avassert.h"
31
#include "libavutil/opt.h"
32
33
#define FF_BUFQUEUE_SIZE (1024)
34
#include "bufferqueue.h"
35
36
#include "audio.h"
37
#include "avfilter.h"
38
#include "filters.h"
39
#include "internal.h"
40
41
/* Capacity of each channel's half-period ring buffer (ChannelContext.pi). */
#define MAX_ITEMS  882000
/* Smallest peak magnitude treated as signal: one LSB of 16-bit PCM. */
#define MIN_PEAK (1. / 32768.)
43
44
/**
 * One detected half-period: a contiguous run of samples that stay on a
 * single side of the zero line.
 */
typedef struct PeriodItem {
    int size;        /* number of samples in this half-period            */
    int type;        /* 0 while still being filled, 1 once finalized     */
    double max_peak; /* largest absolute sample value seen in the run    */
} PeriodItem;
49
50
typedef struct ChannelContext {
51
    int state;
52
    int bypass;
53
    PeriodItem pi[MAX_ITEMS];
54
    double gain_state;
55
    double pi_max_peak;
56
    int pi_start;
57
    int pi_end;
58
    int pi_size;
59
} ChannelContext;
60
61
typedef struct SpeechNormalizerContext {
62
    const AVClass *class;
63
64
    double peak_value;
65
    double max_expansion;
66
    double max_compression;
67
    double threshold_value;
68
    double raise_amount;
69
    double fall_amount;
70
    uint64_t channels;
71
    int invert;
72
    int link;
73
74
    ChannelContext *cc;
75
    double prev_gain;
76
77
    int max_period;
78
    int eof;
79
    int64_t pts;
80
81
    struct FFBufQueue queue;
82
83
    void (*analyze_channel)(AVFilterContext *ctx, ChannelContext *cc,
84
                            const uint8_t *srcp, int nb_samples);
85
    void (*filter_channels[2])(AVFilterContext *ctx,
86
                               AVFrame *in, int nb_samples);
87
} SpeechNormalizerContext;
88
89
#define OFFSET(x) offsetof(SpeechNormalizerContext, x)
90
#define FLAGS AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_RUNTIME_PARAM
91
92
static const AVOption speechnorm_options[] = {
93
    { "peak", "set the peak value", OFFSET(peak_value), AV_OPT_TYPE_DOUBLE, {.dbl=0.95}, 0.0, 1.0, FLAGS },
94
    { "p",    "set the peak value", OFFSET(peak_value), AV_OPT_TYPE_DOUBLE, {.dbl=0.95}, 0.0, 1.0, FLAGS },
95
    { "expansion", "set the max expansion factor", OFFSET(max_expansion), AV_OPT_TYPE_DOUBLE, {.dbl=2.0}, 1.0, 50.0, FLAGS },
96
    { "e",         "set the max expansion factor", OFFSET(max_expansion), AV_OPT_TYPE_DOUBLE, {.dbl=2.0}, 1.0, 50.0, FLAGS },
97
    { "compression", "set the max compression factor", OFFSET(max_compression), AV_OPT_TYPE_DOUBLE, {.dbl=2.0}, 1.0, 50.0, FLAGS },
98
    { "c",           "set the max compression factor", OFFSET(max_compression), AV_OPT_TYPE_DOUBLE, {.dbl=2.0}, 1.0, 50.0, FLAGS },
99
    { "threshold", "set the threshold value", OFFSET(threshold_value), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0.0, 1.0, FLAGS },
100
    { "t",         "set the threshold value", OFFSET(threshold_value), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0.0, 1.0, FLAGS },
101
    { "raise", "set the expansion raising amount", OFFSET(raise_amount), AV_OPT_TYPE_DOUBLE, {.dbl=0.001}, 0.0, 1.0, FLAGS },
102
    { "r",     "set the expansion raising amount", OFFSET(raise_amount), AV_OPT_TYPE_DOUBLE, {.dbl=0.001}, 0.0, 1.0, FLAGS },
103
    { "fall", "set the compression raising amount", OFFSET(fall_amount), AV_OPT_TYPE_DOUBLE, {.dbl=0.001}, 0.0, 1.0, FLAGS },
104
    { "f",    "set the compression raising amount", OFFSET(fall_amount), AV_OPT_TYPE_DOUBLE, {.dbl=0.001}, 0.0, 1.0, FLAGS },
105
    { "channels", "set channels to filter", OFFSET(channels), AV_OPT_TYPE_CHANNEL_LAYOUT, {.i64=-1}, INT64_MIN, INT64_MAX, FLAGS },
106
    { "h",        "set channels to filter", OFFSET(channels), AV_OPT_TYPE_CHANNEL_LAYOUT, {.i64=-1}, INT64_MIN, INT64_MAX, FLAGS },
107
    { "invert", "set inverted filtering", OFFSET(invert), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
108
    { "i",      "set inverted filtering", OFFSET(invert), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
109
    { "link", "set linked channels filtering", OFFSET(link), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
110
    { "l",    "set linked channels filtering", OFFSET(link), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
111
    { NULL }
112
};
113
114
AVFILTER_DEFINE_CLASS(speechnorm);
115
116
/*
 * Advertise supported formats: any channel count and layout, any sample
 * rate, planar float or planar double samples.
 */
static int query_formats(AVFilterContext *ctx)
{
    static const enum AVSampleFormat sample_fmts[] = {
        AV_SAMPLE_FMT_FLTP, AV_SAMPLE_FMT_DBLP,
        AV_SAMPLE_FMT_NONE
    };
    AVFilterChannelLayouts *layouts = ff_all_channel_counts();
    AVFilterFormats *formats;
    int ret;

    if (!layouts)
        return AVERROR(ENOMEM);
    if ((ret = ff_set_common_channel_layouts(ctx, layouts)) < 0)
        return ret;

    formats = ff_make_format_list(sample_fmts);
    if (!formats)
        return AVERROR(ENOMEM);
    if ((ret = ff_set_common_formats(ctx, formats)) < 0)
        return ret;

    formats = ff_all_samplerates();
    if (!formats)
        return AVERROR(ENOMEM);
    return ff_set_common_samplerates(ctx, formats);
}
145
146
/*
 * Count the samples available in finalized (type != 0) period items,
 * walking the ring buffer from just after `start` towards `end`.
 * `remain` is what is left of the item currently being consumed and
 * seeds the sum; it is returned unchanged if `start` is unfinalized.
 */
static int get_pi_samples(PeriodItem *pi, int start, int end, int remain)
{
    int sum = remain;

    if (pi[start].type == 0)
        return remain;

    while (start != end) {
        if (++start >= MAX_ITEMS)
            start = 0;
        if (pi[start].type == 0)
            break;
        av_assert0(pi[start].size > 0);
        sum += pi[start].size;
    }

    return sum;
}
166
167
/*
 * Number of samples ready to be filtered right now: the minimum, over all
 * channels, of the samples covered by finalized period items.
 */
static int available_samples(AVFilterContext *ctx)
{
    SpeechNormalizerContext *s = ctx->priv;
    AVFilterLink *inlink = ctx->inputs[0];
    int nb_samples;

    nb_samples = get_pi_samples(s->cc[0].pi, s->cc[0].pi_start, s->cc[0].pi_end, s->cc[0].pi_size);
    for (int ch = 1; ch < inlink->channels && nb_samples > 0; ch++) {
        ChannelContext *cc = &s->cc[ch];

        nb_samples = FFMIN(nb_samples, get_pi_samples(cc->pi, cc->pi_start, cc->pi_end, cc->pi_size));
    }

    return nb_samples;
}
182
183
/* Mark nb_samples of the current period item as consumed; it is a hard
 * error to consume more samples than the item still holds. */
static void consume_pi(ChannelContext *cc, int nb_samples)
{
    av_assert0(cc->pi_size >= nb_samples);
    cc->pi_size -= nb_samples;
}
191
192
/*
 * Gain for the next period item.  Expansion is capped so the period peak
 * cannot exceed the target peak (and never exceeds max_expansion); the
 * gain moves from `state` by at most raise_amount upwards when the peak
 * passes the threshold test, or fall_amount downwards (floored at
 * 1 / max_compression) otherwise.  Bypassed channels get unity gain.
 */
static double next_gain(AVFilterContext *ctx, double pi_max_peak, int bypass, double state)
{
    SpeechNormalizerContext *s = ctx->priv;
    const double expansion = FFMIN(s->max_expansion, s->peak_value / pi_max_peak);
    const double compression = 1. / s->max_compression;
    const int type = s->invert ? pi_max_peak <= s->threshold_value : pi_max_peak >= s->threshold_value;

    if (bypass)
        return 1.;
    if (type)
        return FFMIN(expansion, state + s->raise_amount);
    return FFMIN(expansion, FFMAX(compression, state - s->fall_amount));
}
207
208
/*
 * When the current period item is exhausted, advance the ring-buffer read
 * index to the next item and update the channel gain for it.
 */
static void next_pi(AVFilterContext *ctx, ChannelContext *cc, int bypass)
{
    av_assert0(cc->pi_size >= 0);
    if (cc->pi_size == 0) {
        SpeechNormalizerContext *s = ctx->priv;
        int idx = cc->pi_start;

        av_assert0(cc->pi[idx].size > 0);
        av_assert0(cc->pi[idx].type > 0 || s->eof);
        cc->pi_size = cc->pi[idx].size;
        cc->pi_max_peak = cc->pi[idx].max_peak;
        av_assert0(cc->pi_start != cc->pi_end || s->eof);
        if (++idx >= MAX_ITEMS)
            idx = 0;
        cc->pi_start = idx;
        cc->gain_state = next_gain(ctx, cc->pi_max_peak, bypass, cc->gain_state);
    }
}
227
228
/*
 * Smallest gain this channel would apply over (at least) the next
 * max_size samples, simulated from the current gain state without
 * consuming anything.  Used by the linked path so all channels can share
 * a single common gain.
 */
static double min_gain(AVFilterContext *ctx, ChannelContext *cc, int max_size)
{
    SpeechNormalizerContext *s = ctx->priv;
    double gain_state = cc->gain_state;
    double gain = FFMIN(s->max_expansion, gain_state);
    int size = cc->pi_size;
    int idx = cc->pi_start;

    while (size <= max_size) {
        if (idx == cc->pi_end)
            break;
        gain_state = next_gain(ctx, cc->pi[idx].max_peak, 0, gain_state);
        gain = FFMIN(gain, gain_state);
        size += cc->pi[idx].size;
        if (++idx >= MAX_ITEMS)
            idx = 0;
    }

    return gain;
}
250
251
#define ANALYZE_CHANNEL(name, ptype, zero)                                                 \
252
static void analyze_channel_## name (AVFilterContext *ctx, ChannelContext *cc,             \
253
                                     const uint8_t *srcp, int nb_samples)                  \
254
{                                                                                          \
255
    SpeechNormalizerContext *s = ctx->priv;                                                \
256
    const ptype *src = (const ptype *)srcp;                                                \
257
    int n = 0;                                                                             \
258
                                                                                           \
259
    if (cc->state < 0)                                                                     \
260
        cc->state = src[0] >= zero;                                                        \
261
                                                                                           \
262
    while (n < nb_samples) {                                                               \
263
        if ((cc->state != (src[n] >= zero)) ||                                             \
264
            (cc->pi[cc->pi_end].size > s->max_period)) {                                   \
265
            double max_peak = cc->pi[cc->pi_end].max_peak;                                 \
266
            int state = cc->state;                                                         \
267
            cc->state = src[n] >= zero;                                                    \
268
            av_assert0(cc->pi[cc->pi_end].size > 0);                                       \
269
            if (cc->pi[cc->pi_end].max_peak >= MIN_PEAK ||                                 \
270
                cc->pi[cc->pi_end].size > s->max_period) {                                 \
271
                cc->pi[cc->pi_end].type = 1;                                               \
272
                cc->pi_end++;                                                              \
273
                if (cc->pi_end >= MAX_ITEMS)                                               \
274
                    cc->pi_end = 0;                                                        \
275
                if (cc->state != state)                                                    \
276
                    cc->pi[cc->pi_end].max_peak = DBL_MIN;                                 \
277
                else                                                                       \
278
                    cc->pi[cc->pi_end].max_peak = max_peak;                                \
279
                cc->pi[cc->pi_end].type = 0;                                               \
280
                cc->pi[cc->pi_end].size = 0;                                               \
281
                av_assert0(cc->pi_end != cc->pi_start);                                    \
282
            }                                                                              \
283
        }                                                                                  \
284
                                                                                           \
285
        if (cc->state) {                                                                   \
286
            while (src[n] >= zero) {                                                       \
287
                cc->pi[cc->pi_end].max_peak = FFMAX(cc->pi[cc->pi_end].max_peak,  src[n]); \
288
                cc->pi[cc->pi_end].size++;                                                 \
289
                n++;                                                                       \
290
                if (n >= nb_samples)                                                       \
291
                    break;                                                                 \
292
            }                                                                              \
293
        } else {                                                                           \
294
            while (src[n] < zero) {                                                        \
295
                cc->pi[cc->pi_end].max_peak = FFMAX(cc->pi[cc->pi_end].max_peak, -src[n]); \
296
                cc->pi[cc->pi_end].size++;                                                 \
297
                n++;                                                                       \
298
                if (n >= nb_samples)                                                       \
299
                    break;                                                                 \
300
            }                                                                              \
301
        }                                                                                  \
302
    }                                                                                      \
303
}
304
305
ANALYZE_CHANNEL(dbl, double, 0.0)
306
ANALYZE_CHANNEL(flt, float,  0.f)
307
308
#define FILTER_CHANNELS(name, ptype)                                            \
309
static void filter_channels_## name (AVFilterContext *ctx,                      \
310
                                     AVFrame *in, int nb_samples)               \
311
{                                                                               \
312
    SpeechNormalizerContext *s = ctx->priv;                                     \
313
    AVFilterLink *inlink = ctx->inputs[0];                                      \
314
                                                                                \
315
    for (int ch = 0; ch < inlink->channels; ch++) {                             \
316
        ChannelContext *cc = &s->cc[ch];                                        \
317
        ptype *dst = (ptype *)in->extended_data[ch];                            \
318
        const int bypass = !(av_channel_layout_extract_channel(inlink->channel_layout, ch) & s->channels); \
319
        int n = 0;                                                              \
320
                                                                                \
321
        while (n < nb_samples) {                                                \
322
            ptype gain;                                                         \
323
            int size;                                                           \
324
                                                                                \
325
            next_pi(ctx, cc, bypass);                                           \
326
            size = FFMIN(nb_samples - n, cc->pi_size);                          \
327
            av_assert0(size > 0);                                               \
328
            gain = cc->gain_state;                                              \
329
            consume_pi(cc, size);                                               \
330
            for (int i = n; i < n + size; i++)                                  \
331
                dst[i] *= gain;                                                 \
332
            n += size;                                                          \
333
        }                                                                       \
334
    }                                                                           \
335
}
336
337
FILTER_CHANNELS(dbl, double)
338
FILTER_CHANNELS(flt, float)
339
340
/* Linear interpolation between a and b by fraction t (t = 0 -> a, t = 1 -> b). */
static double lerp(double a, double b, double t)
{
    return a + (b - a) * t;
}
344
345
#define FILTER_LINK_CHANNELS(name, ptype)                                       \
346
static void filter_link_channels_## name (AVFilterContext *ctx,                 \
347
                                          AVFrame *in, int nb_samples)          \
348
{                                                                               \
349
    SpeechNormalizerContext *s = ctx->priv;                                     \
350
    AVFilterLink *inlink = ctx->inputs[0];                                      \
351
    int n = 0;                                                                  \
352
                                                                                \
353
    while (n < nb_samples) {                                                    \
354
        int min_size = nb_samples - n;                                          \
355
        int max_size = 1;                                                       \
356
        ptype gain = s->max_expansion;                                          \
357
                                                                                \
358
        for (int ch = 0; ch < inlink->channels; ch++) {                         \
359
            ChannelContext *cc = &s->cc[ch];                                    \
360
                                                                                \
361
            cc->bypass = !(av_channel_layout_extract_channel(inlink->channel_layout, ch) & s->channels); \
362
                                                                                \
363
            next_pi(ctx, cc, cc->bypass);                                       \
364
            min_size = FFMIN(min_size, cc->pi_size);                            \
365
            max_size = FFMAX(max_size, cc->pi_size);                            \
366
        }                                                                       \
367
                                                                                \
368
        av_assert0(min_size > 0);                                               \
369
        for (int ch = 0; ch < inlink->channels; ch++) {                         \
370
            ChannelContext *cc = &s->cc[ch];                                    \
371
                                                                                \
372
            if (cc->bypass)                                                     \
373
                continue;                                                       \
374
            gain = FFMIN(gain, min_gain(ctx, cc, max_size));                    \
375
        }                                                                       \
376
                                                                                \
377
        for (int ch = 0; ch < inlink->channels; ch++) {                         \
378
            ChannelContext *cc = &s->cc[ch];                                    \
379
            ptype *dst = (ptype *)in->extended_data[ch];                        \
380
                                                                                \
381
            consume_pi(cc, min_size);                                           \
382
            if (cc->bypass)                                                     \
383
                continue;                                                       \
384
                                                                                \
385
            for (int i = n; i < n + min_size; i++) {                            \
386
                ptype g = lerp(s->prev_gain, gain, (i - n) / (double)min_size); \
387
                dst[i] *= g;                                                    \
388
            }                                                                   \
389
        }                                                                       \
390
                                                                                \
391
        s->prev_gain = gain;                                                    \
392
        n += min_size;                                                          \
393
    }                                                                           \
394
}
395
396
FILTER_LINK_CHANNELS(dbl, double)
397
FILTER_LINK_CHANNELS(flt, float)
398
399
/*
 * Emit one filtered frame if enough analyzed samples are available, then
 * pull any newly arrived input frames into the queue and analyze them.
 * Returns ff_filter_frame()'s result (<= 0 possible) when a frame was
 * emitted, a negative error code on failure, or 1 when nothing was ready.
 */
static int filter_frame(AVFilterContext *ctx)
{
    SpeechNormalizerContext *s = ctx->priv;
    AVFilterLink *outlink = ctx->outputs[0];
    AVFilterLink *inlink = ctx->inputs[0];
    int ret;

    while (s->queue.available > 0) {
        int min_pi_nb_samples;
        AVFrame *in;

        in = ff_bufqueue_peek(&s->queue, 0);
        if (!in)
            break;

        /* Wait until the whole frame is covered by finalized periods,
         * unless we are flushing at EOF. */
        min_pi_nb_samples = available_samples(ctx);
        if (min_pi_nb_samples < in->nb_samples && !s->eof)
            break;

        in = ff_bufqueue_get(&s->queue);

        /* Fix: the result of av_frame_make_writable() was ignored; on
         * failure we must not modify the (shared) frame data. */
        ret = av_frame_make_writable(in);
        if (ret < 0) {
            av_frame_free(&in);
            return ret;
        }

        s->filter_channels[s->link](ctx, in, in->nb_samples);

        s->pts = in->pts + in->nb_samples;

        return ff_filter_frame(outlink, in);
    }

    for (int f = 0; f < ff_inlink_queued_frames(inlink); f++) {
        AVFrame *in;

        ret = ff_inlink_consume_frame(inlink, &in);
        if (ret < 0)
            return ret;
        if (ret == 0)
            break;

        ff_bufqueue_add(ctx, &s->queue, in);

        for (int ch = 0; ch < inlink->channels; ch++) {
            ChannelContext *cc = &s->cc[ch];

            s->analyze_channel(ctx, cc, in->extended_data[ch], in->nb_samples);
        }
    }

    return 1;
}
449
450
/* Activation callback: run filtering, latch EOF, and manage scheduling. */
static int activate(AVFilterContext *ctx)
{
    AVFilterLink *inlink = ctx->inputs[0];
    AVFilterLink *outlink = ctx->outputs[0];
    SpeechNormalizerContext *s = ctx->priv;
    int ret, status;
    int64_t pts;

    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);

    ret = filter_frame(ctx);
    if (ret <= 0)
        return ret;

    /* Remember upstream EOF so queued frames can be flushed. */
    if (!s->eof && ff_inlink_acknowledge_status(inlink, &status, &pts)) {
        if (status == AVERROR_EOF)
            s->eof = 1;
    }

    /* Everything consumed and flushed: propagate EOF downstream. */
    if (s->eof && ff_inlink_queued_samples(inlink) == 0 &&
        s->queue.available == 0) {
        ff_outlink_set_status(outlink, AVERROR_EOF, s->pts);
        return 0;
    }

    /* Another frame is ready (or we are flushing): reschedule ourselves. */
    if (s->queue.available > 0) {
        AVFrame *in = ff_bufqueue_peek(&s->queue, 0);
        const int nb_samples = available_samples(ctx);

        if (nb_samples >= in->nb_samples || s->eof) {
            ff_filter_set_ready(ctx, 10);
            return 0;
        }
    }

    FF_FILTER_FORWARD_WANTED(outlink, inlink);

    return FFERROR_NOT_READY;
}
489
490
/* Input configuration: allocate per-channel state and select the
 * sample-format specific analyze/filter implementations. */
static int config_input(AVFilterLink *inlink)
{
    AVFilterContext *ctx = inlink->dst;
    SpeechNormalizerContext *s = ctx->priv;

    s->max_period = inlink->sample_rate / 10; /* cap half-periods at 100 ms */

    s->prev_gain = 1.;
    s->cc = av_calloc(inlink->channels, sizeof(*s->cc));
    if (!s->cc)
        return AVERROR(ENOMEM);

    for (int ch = 0; ch < inlink->channels; ch++) {
        ChannelContext *cc = &s->cc[ch];

        cc->state = -1; /* waveform sign unknown until the first sample */
        cc->gain_state = 1.;
    }

    switch (inlink->format) {
    case AV_SAMPLE_FMT_FLTP:
        s->analyze_channel = analyze_channel_flt;
        s->filter_channels[0] = filter_channels_flt;
        s->filter_channels[1] = filter_link_channels_flt;
        break;
    case AV_SAMPLE_FMT_DBLP:
        s->analyze_channel = analyze_channel_dbl;
        s->filter_channels[0] = filter_channels_dbl;
        s->filter_channels[1] = filter_link_channels_dbl;
        break;
    default:
        av_assert0(0); /* query_formats() only negotiates FLTP/DBLP */
    }

    return 0;
}
526
527
/* Handle runtime option changes; reset the shared gain whenever the
 * "link" mode is toggled so the linked path restarts from unity. */
static int process_command(AVFilterContext *ctx, const char *cmd, const char *args,
                           char *res, int res_len, int flags)
{
    SpeechNormalizerContext *s = ctx->priv;
    const int prev_link = s->link;
    int ret;

    ret = ff_filter_process_command(ctx, cmd, args, res, res_len, flags);
    if (ret < 0)
        return ret;
    if (prev_link != s->link)
        s->prev_gain = 1.;

    return 0;
}
542
543
/* Release any frames still buffered and the per-channel contexts. */
static av_cold void uninit(AVFilterContext *ctx)
{
    SpeechNormalizerContext *s = ctx->priv;

    ff_bufqueue_discard_all(&s->queue);
    av_freep(&s->cc);
}
550
551
static const AVFilterPad inputs[] = {
552
    {
553
        .name         = "default",
554
        .type         = AVMEDIA_TYPE_AUDIO,
555
        .config_props = config_input,
556
    },
557
    { NULL }
558
};
559
560
static const AVFilterPad outputs[] = {
561
    {
562
        .name = "default",
563
        .type = AVMEDIA_TYPE_AUDIO,
564
    },
565
    { NULL }
566
};
567
568
AVFilter ff_af_speechnorm = {
569
    .name            = "speechnorm",
570
    .description     = NULL_IF_CONFIG_SMALL("Speech Normalizer."),
571
    .query_formats   = query_formats,
572
    .priv_size       = sizeof(SpeechNormalizerContext),
573
    .priv_class      = &speechnorm_class,
574
    .activate        = activate,
575
    .uninit          = uninit,
576
    .inputs          = inputs,
577
    .outputs         = outputs,
578
    .process_command = process_command,
579
};