FFmpeg coverage


Directory: ../../../ffmpeg/
File: src/libavfilter/af_arnndn.c
Date: 2025-01-20 09:27:23
Exec Total Coverage
Lines: 0 759 0.0%
Functions: 0 41 0.0%
Branches: 0 698 0.0%

Line Branch Exec Source
1 /*
2 * Copyright (c) 2018 Gregor Richards
3 * Copyright (c) 2017 Mozilla
4 * Copyright (c) 2005-2009 Xiph.Org Foundation
5 * Copyright (c) 2007-2008 CSIRO
6 * Copyright (c) 2008-2011 Octasic Inc.
7 * Copyright (c) Jean-Marc Valin
8 * Copyright (c) 2019 Paul B Mahol
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * - Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * - Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
25 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #include "libavutil/avassert.h"
35 #include "libavutil/file_open.h"
36 #include "libavutil/float_dsp.h"
37 #include "libavutil/mem.h"
38 #include "libavutil/mem_internal.h"
39 #include "libavutil/opt.h"
40 #include "libavutil/tx.h"
41 #include "avfilter.h"
42 #include "audio.h"
43 #include "filters.h"
44 #include "formats.h"
45
/* Frame geometry: one frame is 120 samples scaled by 2^FRAME_SIZE_SHIFT. */
#define FRAME_SIZE_SHIFT   2
#define FRAME_SIZE         (120<<FRAME_SIZE_SHIFT)
#define WINDOW_SIZE        (2*FRAME_SIZE)      /* two frames per analysis window */
#define FREQ_SIZE          (FRAME_SIZE + 1)    /* spectrum bins kept per window  */

/* Pitch analysis buffer layout (in samples). */
#define PITCH_MIN_PERIOD   60
#define PITCH_MAX_PERIOD   768
#define PITCH_FRAME_SIZE   960
#define PITCH_BUF_SIZE     (PITCH_MAX_PERIOD+PITCH_FRAME_SIZE)

/* NOTE: evaluates its argument twice; do not pass expressions with side effects. */
#define SQUARE(x)          ((x)*(x))

/* Number of entries in the band edge table. */
#define NB_BANDS           22

/* Depth of the cepstral history ring and number of delta coefficients. */
#define CEPS_MEM           8
#define NB_DELTA_CEPS      6

/* Length of the feature vector handed to the network. */
#define NB_FEATURES        (NB_BANDS+3*NB_DELTA_CEPS+2)

#define WEIGHTS_SCALE      (1.f/256)

/* Upper bound accepted for any layer dimension read from a model file. */
#define MAX_NEURONS        128

/* In-memory activation identifiers. */
#define ACTIVATION_TANH    0
#define ACTIVATION_SIGMOID 1
#define ACTIVATION_RELU    2

#define Q15ONE             1.0f
74
/* Fully connected layer as filled in by the model loader. */
typedef struct DenseLayer {
    const float *bias;          /* nb_neurons entries                */
    const float *input_weights; /* nb_inputs * nb_neurons entries    */
    int          nb_inputs;
    int          nb_neurons;
    int          activation;    /* one of the ACTIVATION_* constants */
} DenseLayer;
82
/* Gated recurrent layer as filled in by the model loader. */
typedef struct GRULayer {
    const float *bias;              /* 3 * nb_neurons entries            */
    const float *input_weights;     /* padded [neuron][3][input] layout  */
    const float *recurrent_weights; /* padded [neuron][3][neuron] layout */
    int          nb_inputs;
    int          nb_neurons;
    int          activation;        /* one of the ACTIVATION_* constants */
} GRULayer;
91
92 typedef struct RNNModel {
93 int input_dense_size;
94 const DenseLayer *input_dense;
95
96 int vad_gru_size;
97 const GRULayer *vad_gru;
98
99 int noise_gru_size;
100 const GRULayer *noise_gru;
101
102 int denoise_gru_size;
103 const GRULayer *denoise_gru;
104
105 int denoise_output_size;
106 const DenseLayer *denoise_output;
107
108 int vad_output_size;
109 const DenseLayer *vad_output;
110 } RNNModel;
111
112 typedef struct RNNState {
113 float *vad_gru_state;
114 float *noise_gru_state;
115 float *denoise_gru_state;
116 RNNModel *model;
117 } RNNState;
118
119 typedef struct DenoiseState {
120 float analysis_mem[FRAME_SIZE];
121 float cepstral_mem[CEPS_MEM][NB_BANDS];
122 int memid;
123 DECLARE_ALIGNED(32, float, synthesis_mem)[FRAME_SIZE];
124 float pitch_buf[PITCH_BUF_SIZE];
125 float pitch_enh_buf[PITCH_BUF_SIZE];
126 float last_gain;
127 int last_period;
128 float mem_hp_x[2];
129 float lastg[NB_BANDS];
130 float history[FRAME_SIZE];
131 RNNState rnn[2];
132 AVTXContext *tx, *txi;
133 av_tx_fn tx_fn, txi_fn;
134 } DenoiseState;
135
136 typedef struct AudioRNNContext {
137 const AVClass *class;
138
139 char *model_name;
140 float mix;
141
142 int channels;
143 DenoiseState *st;
144
145 DECLARE_ALIGNED(32, float, window)[WINDOW_SIZE];
146 DECLARE_ALIGNED(32, float, dct_table)[FFALIGN(NB_BANDS, 4)][FFALIGN(NB_BANDS, 4)];
147
148 RNNModel *model[2];
149
150 AVFloatDSPContext *fdsp;
151 } AudioRNNContext;
152
/* Activation identifiers as they appear in a model file. */
#define F_ACTIVATION_TANH    0
#define F_ACTIVATION_SIGMOID 1
#define F_ACTIVATION_RELU    2
156
157 static void rnnoise_model_free(RNNModel *model)
158 {
159 #define FREE_MAYBE(ptr) do { if (ptr) free(ptr); } while (0)
160 #define FREE_DENSE(name) do { \
161 if (model->name) { \
162 av_free((void *) model->name->input_weights); \
163 av_free((void *) model->name->bias); \
164 av_free((void *) model->name); \
165 } \
166 } while (0)
167 #define FREE_GRU(name) do { \
168 if (model->name) { \
169 av_free((void *) model->name->input_weights); \
170 av_free((void *) model->name->recurrent_weights); \
171 av_free((void *) model->name->bias); \
172 av_free((void *) model->name); \
173 } \
174 } while (0)
175
176 if (!model)
177 return;
178 FREE_DENSE(input_dense);
179 FREE_GRU(vad_gru);
180 FREE_GRU(noise_gru);
181 FREE_GRU(denoise_gru);
182 FREE_DENSE(denoise_output);
183 FREE_DENSE(vad_output);
184 av_free(model);
185 }
186
187 static int rnnoise_model_from_file(FILE *f, RNNModel **rnn)
188 {
189 RNNModel *ret = NULL;
190 DenseLayer *input_dense;
191 GRULayer *vad_gru;
192 GRULayer *noise_gru;
193 GRULayer *denoise_gru;
194 DenseLayer *denoise_output;
195 DenseLayer *vad_output;
196 int in;
197
198 if (fscanf(f, "rnnoise-nu model file version %d\n", &in) != 1 || in != 1)
199 return AVERROR_INVALIDDATA;
200
201 ret = av_calloc(1, sizeof(RNNModel));
202 if (!ret)
203 return AVERROR(ENOMEM);
204
205 #define ALLOC_LAYER(type, name) \
206 name = av_calloc(1, sizeof(type)); \
207 if (!name) { \
208 rnnoise_model_free(ret); \
209 return AVERROR(ENOMEM); \
210 } \
211 ret->name = name
212
213 ALLOC_LAYER(DenseLayer, input_dense);
214 ALLOC_LAYER(GRULayer, vad_gru);
215 ALLOC_LAYER(GRULayer, noise_gru);
216 ALLOC_LAYER(GRULayer, denoise_gru);
217 ALLOC_LAYER(DenseLayer, denoise_output);
218 ALLOC_LAYER(DenseLayer, vad_output);
219
220 #define INPUT_VAL(name) do { \
221 if (fscanf(f, "%d", &in) != 1 || in < 0 || in > 128) { \
222 rnnoise_model_free(ret); \
223 return AVERROR(EINVAL); \
224 } \
225 name = in; \
226 } while (0)
227
228 #define INPUT_ACTIVATION(name) do { \
229 int activation; \
230 INPUT_VAL(activation); \
231 switch (activation) { \
232 case F_ACTIVATION_SIGMOID: \
233 name = ACTIVATION_SIGMOID; \
234 break; \
235 case F_ACTIVATION_RELU: \
236 name = ACTIVATION_RELU; \
237 break; \
238 default: \
239 name = ACTIVATION_TANH; \
240 } \
241 } while (0)
242
243 #define INPUT_ARRAY(name, len) do { \
244 float *values = av_calloc((len), sizeof(float)); \
245 if (!values) { \
246 rnnoise_model_free(ret); \
247 return AVERROR(ENOMEM); \
248 } \
249 name = values; \
250 for (int i = 0; i < (len); i++) { \
251 if (fscanf(f, "%d", &in) != 1) { \
252 rnnoise_model_free(ret); \
253 return AVERROR(EINVAL); \
254 } \
255 values[i] = in; \
256 } \
257 } while (0)
258
259 #define INPUT_ARRAY3(name, len0, len1, len2) do { \
260 float *values = av_calloc(FFALIGN((len0), 4) * FFALIGN((len1), 4) * (len2), sizeof(float)); \
261 if (!values) { \
262 rnnoise_model_free(ret); \
263 return AVERROR(ENOMEM); \
264 } \
265 name = values; \
266 for (int k = 0; k < (len0); k++) { \
267 for (int i = 0; i < (len2); i++) { \
268 for (int j = 0; j < (len1); j++) { \
269 if (fscanf(f, "%d", &in) != 1) { \
270 rnnoise_model_free(ret); \
271 return AVERROR(EINVAL); \
272 } \
273 values[j * (len2) * FFALIGN((len0), 4) + i * FFALIGN((len0), 4) + k] = in; \
274 } \
275 } \
276 } \
277 } while (0)
278
279 #define NEW_LINE() do { \
280 int c; \
281 while ((c = fgetc(f)) != EOF) { \
282 if (c == '\n') \
283 break; \
284 } \
285 } while (0)
286
287 #define INPUT_DENSE(name) do { \
288 INPUT_VAL(name->nb_inputs); \
289 INPUT_VAL(name->nb_neurons); \
290 ret->name ## _size = name->nb_neurons; \
291 INPUT_ACTIVATION(name->activation); \
292 NEW_LINE(); \
293 INPUT_ARRAY(name->input_weights, name->nb_inputs * name->nb_neurons); \
294 NEW_LINE(); \
295 INPUT_ARRAY(name->bias, name->nb_neurons); \
296 NEW_LINE(); \
297 } while (0)
298
299 #define INPUT_GRU(name) do { \
300 INPUT_VAL(name->nb_inputs); \
301 INPUT_VAL(name->nb_neurons); \
302 ret->name ## _size = name->nb_neurons; \
303 INPUT_ACTIVATION(name->activation); \
304 NEW_LINE(); \
305 INPUT_ARRAY3(name->input_weights, name->nb_inputs, name->nb_neurons, 3); \
306 NEW_LINE(); \
307 INPUT_ARRAY3(name->recurrent_weights, name->nb_neurons, name->nb_neurons, 3); \
308 NEW_LINE(); \
309 INPUT_ARRAY(name->bias, name->nb_neurons * 3); \
310 NEW_LINE(); \
311 } while (0)
312
313 INPUT_DENSE(input_dense);
314 INPUT_GRU(vad_gru);
315 INPUT_GRU(noise_gru);
316 INPUT_GRU(denoise_gru);
317 INPUT_DENSE(denoise_output);
318 INPUT_DENSE(vad_output);
319
320 if (vad_output->nb_neurons != 1) {
321 rnnoise_model_free(ret);
322 return AVERROR(EINVAL);
323 }
324
325 *rnn = ret;
326
327 return 0;
328 }
329
330 static int query_formats(const AVFilterContext *ctx,
331 AVFilterFormatsConfig **cfg_in,
332 AVFilterFormatsConfig **cfg_out)
333 {
334 static const enum AVSampleFormat sample_fmts[] = {
335 AV_SAMPLE_FMT_FLTP,
336 AV_SAMPLE_FMT_NONE
337 };
338 int ret, sample_rates[] = { 48000, -1 };
339
340 ret = ff_set_common_formats_from_list2(ctx, cfg_in, cfg_out, sample_fmts);
341 if (ret < 0)
342 return ret;
343
344 return ff_set_common_samplerates_from_list2(ctx, cfg_in, cfg_out, sample_rates);
345 }
346
347 static int config_input(AVFilterLink *inlink)
348 {
349 AVFilterContext *ctx = inlink->dst;
350 AudioRNNContext *s = ctx->priv;
351 int ret = 0;
352
353 s->channels = inlink->ch_layout.nb_channels;
354
355 if (!s->st)
356 s->st = av_calloc(s->channels, sizeof(DenoiseState));
357 if (!s->st)
358 return AVERROR(ENOMEM);
359
360 for (int i = 0; i < s->channels; i++) {
361 DenoiseState *st = &s->st[i];
362
363 st->rnn[0].model = s->model[0];
364 st->rnn[0].vad_gru_state = av_calloc(sizeof(float), FFALIGN(s->model[0]->vad_gru_size, 16));
365 st->rnn[0].noise_gru_state = av_calloc(sizeof(float), FFALIGN(s->model[0]->noise_gru_size, 16));
366 st->rnn[0].denoise_gru_state = av_calloc(sizeof(float), FFALIGN(s->model[0]->denoise_gru_size, 16));
367 if (!st->rnn[0].vad_gru_state ||
368 !st->rnn[0].noise_gru_state ||
369 !st->rnn[0].denoise_gru_state)
370 return AVERROR(ENOMEM);
371 }
372
373 for (int i = 0; i < s->channels; i++) {
374 DenoiseState *st = &s->st[i];
375 float scale = 1.f;
376
377 if (!st->tx)
378 ret = av_tx_init(&st->tx, &st->tx_fn, AV_TX_FLOAT_FFT, 0, WINDOW_SIZE, &scale, 0);
379 if (ret < 0)
380 return ret;
381
382 if (!st->txi)
383 ret = av_tx_init(&st->txi, &st->txi_fn, AV_TX_FLOAT_FFT, 1, WINDOW_SIZE, &scale, 0);
384 if (ret < 0)
385 return ret;
386 }
387
388 return ret;
389 }
390
/**
 * Run a second-order recursive filter over N samples.
 *
 * @param y    output samples (may be the same buffer as x)
 * @param mem  two-element filter state, updated in place
 * @param x    input samples
 * @param b    two feed-forward coefficients
 * @param a    two feedback coefficients
 * @param N    number of samples to process
 */
static void biquad(float *y, float mem[2], const float *x,
                   const float *b, const float *a, int N)
{
    for (int n = 0; n < N; n++) {
        const float in  = x[n];
        const float out = x[n] + mem[0];

        mem[0] = mem[1] + (b[0]*in - a[0]*out);
        mem[1] = (b[1]*in - a[1]*out);
        y[n]   = out;
    }
}
404
/*
 * Element-counted wrappers around memmove/memset/memcpy; the byte count is
 * n elements of *dst. The "0*((dst)-(src))" term contributes nothing to the
 * size: subtracting the pointers only forces the compiler to check that dst
 * and src have compatible pointer types.
 */
#define RNN_MOVE(dst, src, n)  (memmove((dst), (src), 0*((dst)-(src)) + (n)*sizeof(*(dst)) ))
#define RNN_CLEAR(dst, n)      (memset((dst), 0, sizeof(*(dst))*(n)))
#define RNN_COPY(dst, src, n)  (memcpy((dst), (src), 0*((dst)-(src)) + (n)*sizeof(*(dst)) ))
408
409 static void forward_transform(DenoiseState *st, AVComplexFloat *out, const float *in)
410 {
411 AVComplexFloat x[WINDOW_SIZE];
412 AVComplexFloat y[WINDOW_SIZE];
413
414 for (int i = 0; i < WINDOW_SIZE; i++) {
415 x[i].re = in[i];
416 x[i].im = 0;
417 }
418
419 st->tx_fn(st->tx, y, x, sizeof(AVComplexFloat));
420
421 RNN_COPY(out, y, FREQ_SIZE);
422 }
423
424 static void inverse_transform(DenoiseState *st, float *out, const AVComplexFloat *in)
425 {
426 AVComplexFloat x[WINDOW_SIZE];
427 AVComplexFloat y[WINDOW_SIZE];
428
429 RNN_COPY(x, in, FREQ_SIZE);
430
431 for (int i = FREQ_SIZE; i < WINDOW_SIZE; i++) {
432 x[i].re = x[WINDOW_SIZE - i].re;
433 x[i].im = -x[WINDOW_SIZE - i].im;
434 }
435
436 st->txi_fn(st->txi, y, x, sizeof(AVComplexFloat));
437
438 for (int i = 0; i < WINDOW_SIZE; i++)
439 out[i] = y[i].re / WINDOW_SIZE;
440 }
441
/*
 * Band edges; consumers shift each entry left by FRAME_SIZE_SHIFT to obtain
 * a bin index. The label after each row is the approximate frequency of the
 * row's entries.
 */
static const uint8_t eband5ms[] = {
      0,   1,   2,   3,   4,   /* 0    200  400  600  800  */
      5,   6,   7,   8,  10,   /* 1k   1.2  1.4  1.6  2k   */
     12,  14,  16,  20,  24,   /* 2.4  2.8  3.2  4k   4.8  */
     28,  34,  40,  48,  60,   /* 5.6  6.8  8k   9.6  12k  */
     78, 100                   /* 15.6 20k                 */
};
446
447 static void compute_band_energy(float *bandE, const AVComplexFloat *X)
448 {
449 float sum[NB_BANDS] = {0};
450
451 for (int i = 0; i < NB_BANDS - 1; i++) {
452 int band_size;
453
454 band_size = (eband5ms[i + 1] - eband5ms[i]) << FRAME_SIZE_SHIFT;
455 for (int j = 0; j < band_size; j++) {
456 float tmp, frac = (float)j / band_size;
457
458 tmp = SQUARE(X[(eband5ms[i] << FRAME_SIZE_SHIFT) + j].re);
459 tmp += SQUARE(X[(eband5ms[i] << FRAME_SIZE_SHIFT) + j].im);
460 sum[i] += (1.f - frac) * tmp;
461 sum[i + 1] += frac * tmp;
462 }
463 }
464
465 sum[0] *= 2;
466 sum[NB_BANDS - 1] *= 2;
467
468 for (int i = 0; i < NB_BANDS; i++)
469 bandE[i] = sum[i];
470 }
471
472 static void compute_band_corr(float *bandE, const AVComplexFloat *X, const AVComplexFloat *P)
473 {
474 float sum[NB_BANDS] = { 0 };
475
476 for (int i = 0; i < NB_BANDS - 1; i++) {
477 int band_size;
478
479 band_size = (eband5ms[i + 1] - eband5ms[i]) << FRAME_SIZE_SHIFT;
480 for (int j = 0; j < band_size; j++) {
481 float tmp, frac = (float)j / band_size;
482
483 tmp = X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].re * P[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].re;
484 tmp += X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].im * P[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].im;
485 sum[i] += (1 - frac) * tmp;
486 sum[i + 1] += frac * tmp;
487 }
488 }
489
490 sum[0] *= 2;
491 sum[NB_BANDS-1] *= 2;
492
493 for (int i = 0; i < NB_BANDS; i++)
494 bandE[i] = sum[i];
495 }
496
497 static void frame_analysis(AudioRNNContext *s, DenoiseState *st, AVComplexFloat *X, float *Ex, const float *in)
498 {
499 LOCAL_ALIGNED_32(float, x, [WINDOW_SIZE]);
500
501 RNN_COPY(x, st->analysis_mem, FRAME_SIZE);
502 RNN_COPY(x + FRAME_SIZE, in, FRAME_SIZE);
503 RNN_COPY(st->analysis_mem, in, FRAME_SIZE);
504 s->fdsp->vector_fmul(x, x, s->window, WINDOW_SIZE);
505 forward_transform(st, X, x);
506 compute_band_energy(Ex, X);
507 }
508
509 static void frame_synthesis(AudioRNNContext *s, DenoiseState *st, float *out, const AVComplexFloat *y)
510 {
511 LOCAL_ALIGNED_32(float, x, [WINDOW_SIZE]);
512 const float *src = st->history;
513 const float mix = s->mix;
514 const float imix = 1.f - FFMAX(mix, 0.f);
515
516 inverse_transform(st, x, y);
517 s->fdsp->vector_fmul(x, x, s->window, WINDOW_SIZE);
518 s->fdsp->vector_fmac_scalar(x, st->synthesis_mem, 1.f, FRAME_SIZE);
519 RNN_COPY(out, x, FRAME_SIZE);
520 RNN_COPY(st->synthesis_mem, &x[FRAME_SIZE], FRAME_SIZE);
521
522 for (int n = 0; n < FRAME_SIZE; n++)
523 out[n] = out[n] * mix + src[n] * imix;
524 }
525
/**
 * Accumulate four neighbouring cross-correlation lags at once:
 *
 *     sum[k] += x[n] * y[n + k]   for n in [0, len), k in [0, 4)
 *
 * A four-value window of y rotates through w0..w3 so that every y sample is
 * loaded once. The main loop handles four x samples per iteration and up to
 * three remaining samples are handled afterwards. y[0..2] are always read,
 * and y is read up to index len + 2.
 */
static inline void xcorr_kernel(const float *x, const float *y, float sum[4], int len)
{
    float w0 = y[0];
    float w1 = y[1];
    float w2 = y[2];
    float w3 = 0;
    int n = 0;
    int left;

    for (; n < len - 3; n += 4) {
        float xv;

        xv = x[n];
        w3 = y[n + 3];
        sum[0] += xv * w0;
        sum[1] += xv * w1;
        sum[2] += xv * w2;
        sum[3] += xv * w3;

        xv = x[n + 1];
        w0 = y[n + 4];
        sum[0] += xv * w1;
        sum[1] += xv * w2;
        sum[2] += xv * w3;
        sum[3] += xv * w0;

        xv = x[n + 2];
        w1 = y[n + 5];
        sum[0] += xv * w2;
        sum[1] += xv * w3;
        sum[2] += xv * w0;
        sum[3] += xv * w1;

        xv = x[n + 3];
        w2 = y[n + 6];
        sum[0] += xv * w3;
        sum[1] += xv * w0;
        sum[2] += xv * w1;
        sum[3] += xv * w2;
    }

    /* 0..3 samples remain; continue the same window rotation for each. */
    left = len - n;

    if (left > 0) {
        const float xv = x[n];

        w3 = y[n + 3];
        sum[0] += xv * w0;
        sum[1] += xv * w1;
        sum[2] += xv * w2;
        sum[3] += xv * w3;
    }

    if (left > 1) {
        const float xv = x[n + 1];

        w0 = y[n + 4];
        sum[0] += xv * w1;
        sum[1] += xv * w2;
        sum[2] += xv * w3;
        sum[3] += xv * w0;
    }

    if (left > 2) {
        const float xv = x[n + 2];

        w1 = y[n + 5];
        sum[0] += xv * w2;
        sum[1] += xv * w3;
        sum[2] += xv * w0;
        sum[3] += xv * w1;
    }
}
594
/**
 * Dot product of two float vectors, accumulated in index order.
 *
 * @return sum of x[i] * y[i] for i in [0, N); 0 when N <= 0
 */
static inline float celt_inner_prod(const float *x,
                                    const float *y, int N)
{
    float acc = 0.f;
    int i = 0;

    while (i < N) {
        acc += x[i] * y[i];
        i++;
    }

    return acc;
}
605
/**
 * Cross-correlate x against y for every lag in [0, max_pitch):
 *
 *     xcorr[lag] = sum over n in [0, len) of x[n] * y[n + lag]
 *
 * Lags are processed four at a time with xcorr_kernel(); any lags left over
 * when max_pitch is not a multiple of four use celt_inner_prod().
 */
static void celt_pitch_xcorr(const float *x, const float *y,
                             float *xcorr, int len, int max_pitch)
{
    int lag = 0;

    while (lag + 3 < max_pitch) {
        float quad[4] = { 0, 0, 0, 0 };

        xcorr_kernel(x, y + lag, quad, len);

        for (int k = 0; k < 4; k++)
            xcorr[lag + k] = quad[k];

        lag += 4;
    }

    while (lag < max_pitch) {
        xcorr[lag] = celt_inner_prod(x, y + lag, len);
        lag++;
    }
}
626
627 static int celt_autocorr(const float *x, /* in: [0...n-1] samples x */
628 float *ac, /* out: [0...lag-1] ac values */
629 const float *window,
630 int overlap,
631 int lag,
632 int n)
633 {
634 int fastN = n - lag;
635 int shift;
636 const float *xptr;
637 float xx[PITCH_BUF_SIZE>>1];
638
639 if (overlap == 0) {
640 xptr = x;
641 } else {
642 for (int i = 0; i < n; i++)
643 xx[i] = x[i];
644 for (int i = 0; i < overlap; i++) {
645 xx[i] = x[i] * window[i];
646 xx[n-i-1] = x[n-i-1] * window[i];
647 }
648 xptr = xx;
649 }
650
651 shift = 0;
652 celt_pitch_xcorr(xptr, xptr, ac, fastN, lag+1);
653
654 for (int k = 0; k <= lag; k++) {
655 float d = 0.f;
656
657 for (int i = k + fastN; i < n; i++)
658 d += xptr[i] * xptr[i-k];
659 ac[k] += d;
660 }
661
662 return shift;
663 }
664
/**
 * Derive p LPC coefficients from autocorrelation values by recursion on the
 * reflection coefficients.
 *
 * The output is zeroed first and left at zero when ac[0] is 0. The recursion
 * stops early once the remaining error falls below 0.001 * ac[0].
 *
 * @param lpc  out: p coefficients, indices [0, p-1]
 * @param ac   in:  p + 1 autocorrelation values, indices [0, p]
 * @param p    prediction order
 */
static void celt_lpc(float *lpc,      /* out: [0...p-1] LPC coefficients      */
                     const float *ac, /* in:  [0...p] autocorrelation values  */
                     int p)
{
    float err = ac[0];

    RNN_CLEAR(lpc, p);

    if (!(ac[0] != 0))
        return;

    for (int order = 0; order < p; order++) {
        float acc = 0;
        float refl;

        /* Sum up this iteration's reflection coefficient */
        for (int j = 0; j < order; j++)
            acc += (lpc[j] * ac[order - j]);
        acc += ac[order + 1];
        refl = -acc/err;

        /* Update LPC coefficients and total error */
        lpc[order] = refl;
        for (int j = 0; j < (order + 1) >> 1; j++) {
            const int   mirror = order - 1 - j;
            const float lo = lpc[j];
            const float hi = lpc[mirror];

            lpc[j]      = lo + (refl*hi);
            lpc[mirror] = hi + (refl*lo);
        }

        err = err - (refl * refl *err);

        /* Bail out once we get 30 dB gain */
        if (err < .001f * ac[0])
            break;
    }
}
697
/**
 * Five-tap FIR filter with persistent state:
 *
 *     y[i] = x[i] + num[0]*x[i-1] + ... + num[4]*x[i-5]
 *
 * where samples before the start of x come from mem (mem[0] most recent).
 * x[i] is read before y[i] is written, so y may be the same buffer as x.
 *
 * @param x    input samples
 * @param num  five tap coefficients
 * @param y    output samples
 * @param N    number of samples
 * @param mem  five previous inputs, updated to the last five inputs seen
 */
static void celt_fir5(const float *x,
                      const float *num,
                      float *y,
                      int N,
                      float *mem)
{
    float tap[5];
    float hist[5];

    for (int k = 0; k < 5; k++)
        tap[k] = num[k];
    for (int k = 0; k < 5; k++)
        hist[k] = mem[k];

    for (int i = 0; i < N; i++) {
        const float cur = x[i];
        float acc = cur;

        for (int k = 0; k < 5; k++)
            acc += (tap[k]*hist[k]);

        /* Age the history by one sample. */
        for (int k = 4; k > 0; k--)
            hist[k] = hist[k - 1];
        hist[0] = cur;

        y[i] = acc;
    }

    for (int k = 0; k < 5; k++)
        mem[k] = hist[k];
}
740
741 static void pitch_downsample(float *x[], float *x_lp,
742 int len, int C)
743 {
744 float ac[5];
745 float tmp=Q15ONE;
746 float lpc[4], mem[5]={0,0,0,0,0};
747 float lpc2[5];
748 float c1 = .8f;
749
750 for (int i = 1; i < len >> 1; i++)
751 x_lp[i] = .5f * (.5f * (x[0][(2*i-1)]+x[0][(2*i+1)])+x[0][2*i]);
752 x_lp[0] = .5f * (.5f * (x[0][1])+x[0][0]);
753 if (C==2) {
754 for (int i = 1; i < len >> 1; i++)
755 x_lp[i] += (.5f * (.5f * (x[1][(2*i-1)]+x[1][(2*i+1)])+x[1][2*i]));
756 x_lp[0] += .5f * (.5f * (x[1][1])+x[1][0]);
757 }
758
759 celt_autocorr(x_lp, ac, NULL, 0, 4, len>>1);
760
761 /* Noise floor -40 dB */
762 ac[0] *= 1.0001f;
763 /* Lag windowing */
764 for (int i = 1; i <= 4; i++) {
765 /*ac[i] *= exp(-.5*(2*M_PI*.002*i)*(2*M_PI*.002*i));*/
766 ac[i] -= ac[i]*(.008f*i)*(.008f*i);
767 }
768
769 celt_lpc(lpc, ac, 4);
770 for (int i = 0; i < 4; i++) {
771 tmp = .9f * tmp;
772 lpc[i] = (lpc[i] * tmp);
773 }
774 /* Add a zero */
775 lpc2[0] = lpc[0] + .8f;
776 lpc2[1] = lpc[1] + (c1 * lpc[0]);
777 lpc2[2] = lpc[2] + (c1 * lpc[1]);
778 lpc2[3] = lpc[3] + (c1 * lpc[2]);
779 lpc2[4] = (c1 * lpc[3]);
780 celt_fir5(x_lp, lpc2, x_lp, len>>1, mem);
781 }
782
/**
 * Compute two dot products that share the same left operand in one pass.
 *
 * @param x    common vector
 * @param y01  first right-hand vector
 * @param y02  second right-hand vector
 * @param N    vector length
 * @param xy1  receives sum of x[i] * y01[i]
 * @param xy2  receives sum of x[i] * y02[i]
 */
static inline void dual_inner_prod(const float *x, const float *y01, const float *y02,
                                   int N, float *xy1, float *xy2)
{
    float acc1 = 0;
    float acc2 = 0;
    int i = 0;

    while (i < N) {
        acc1 += (x[i] * y01[i]);
        acc2 += (x[i] * y02[i]);
        i++;
    }

    *xy1 = acc1;
    *xy2 = acc2;
}
796
797 static float compute_pitch_gain(float xy, float xx, float yy)
798 {
799 return xy / sqrtf(1.f + xx * yy);
800 }
801
802 static const uint8_t second_check[16] = {0, 0, 3, 2, 3, 2, 5, 2, 3, 2, 3, 2, 5, 2, 3, 2};
803 static float remove_doubling(float *x, int maxperiod, int minperiod, int N,
804 int *T0_, int prev_period, float prev_gain)
805 {
806 int k, i, T, T0;
807 float g, g0;
808 float pg;
809 float xy,xx,yy,xy2;
810 float xcorr[3];
811 float best_xy, best_yy;
812 int offset;
813 int minperiod0;
814 float yy_lookup[PITCH_MAX_PERIOD+1];
815
816 minperiod0 = minperiod;
817 maxperiod /= 2;
818 minperiod /= 2;
819 *T0_ /= 2;
820 prev_period /= 2;
821 N /= 2;
822 x += maxperiod;
823 if (*T0_>=maxperiod)
824 *T0_=maxperiod-1;
825
826 T = T0 = *T0_;
827 dual_inner_prod(x, x, x-T0, N, &xx, &xy);
828 yy_lookup[0] = xx;
829 yy=xx;
830 for (i = 1; i <= maxperiod; i++) {
831 yy = yy+(x[-i] * x[-i])-(x[N-i] * x[N-i]);
832 yy_lookup[i] = FFMAX(0, yy);
833 }
834 yy = yy_lookup[T0];
835 best_xy = xy;
836 best_yy = yy;
837 g = g0 = compute_pitch_gain(xy, xx, yy);
838 /* Look for any pitch at T/k */
839 for (k = 2; k <= 15; k++) {
840 int T1, T1b;
841 float g1;
842 float cont=0;
843 float thresh;
844 T1 = (2*T0+k)/(2*k);
845 if (T1 < minperiod)
846 break;
847 /* Look for another strong correlation at T1b */
848 if (k==2)
849 {
850 if (T1+T0>maxperiod)
851 T1b = T0;
852 else
853 T1b = T0+T1;
854 } else
855 {
856 T1b = (2*second_check[k]*T0+k)/(2*k);
857 }
858 dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2);
859 xy = .5f * (xy + xy2);
860 yy = .5f * (yy_lookup[T1] + yy_lookup[T1b]);
861 g1 = compute_pitch_gain(xy, xx, yy);
862 if (FFABS(T1-prev_period)<=1)
863 cont = prev_gain;
864 else if (FFABS(T1-prev_period)<=2 && 5 * k * k < T0)
865 cont = prev_gain * .5f;
866 else
867 cont = 0;
868 thresh = FFMAX(.3f, (.7f * g0) - cont);
869 /* Bias against very high pitch (very short period) to avoid false-positives
870 due to short-term correlation */
871 if (T1<3*minperiod)
872 thresh = FFMAX(.4f, (.85f * g0) - cont);
873 else if (T1<2*minperiod)
874 thresh = FFMAX(.5f, (.9f * g0) - cont);
875 if (g1 > thresh)
876 {
877 best_xy = xy;
878 best_yy = yy;
879 T = T1;
880 g = g1;
881 }
882 }
883 best_xy = FFMAX(0, best_xy);
884 if (best_yy <= best_xy)
885 pg = Q15ONE;
886 else
887 pg = best_xy/(best_yy + 1);
888
889 for (k = 0; k < 3; k++)
890 xcorr[k] = celt_inner_prod(x, x-(T+k-1), N);
891 if ((xcorr[2]-xcorr[0]) > .7f * (xcorr[1]-xcorr[0]))
892 offset = 1;
893 else if ((xcorr[0]-xcorr[2]) > (.7f * (xcorr[1] - xcorr[2])))
894 offset = -1;
895 else
896 offset = 0;
897 if (pg > g)
898 pg = g;
899 *T0_ = 2*T+offset;
900
901 if (*T0_<minperiod0)
902 *T0_=minperiod0;
903 return pg;
904 }
905
/**
 * Find the two lags with the highest normalised correlation.
 *
 * For every lag with positive correlation the score xcorr^2 / Syy is
 * compared against the two best so far using cross-multiplication, where Syy
 * is the energy of the len-sample segment of y starting at that lag
 * (maintained incrementally and never allowed below 1).
 *
 * @param xcorr       correlation for each lag in [0, max_pitch)
 * @param y           signal; read up to index len + max_pitch - 1
 * @param len         segment length
 * @param max_pitch   number of lags
 * @param best_pitch  out: [0] best lag, [1] second best lag
 */
static void find_best_pitch(float *xcorr, float *y, int len,
                            int max_pitch, int *best_pitch)
{
    float best_num[2] = { -1, -1 };
    float best_den[2] = {  0,  0 };
    float Syy = 1.f;

    best_pitch[0] = 0;
    best_pitch[1] = 1;

    for (int j = 0; j < len; j++)
        Syy += y[j] * y[j];

    for (int lag = 0; lag < max_pitch; lag++) {
        if (xcorr[lag] > 0) {
            /* Considering the range of the correlation, scaling first should
               avoid both underflows and overflows (inf) when squaring */
            const float scaled = xcorr[lag] * 1e-12f;
            const float num    = scaled * scaled;

            if ((num * best_den[1]) > (best_num[1] * Syy)) {
                int slot = 1;

                if ((num * best_den[0]) > (best_num[0] * Syy)) {
                    /* New overall best: the old best becomes second. */
                    best_num[1]   = best_num[0];
                    best_den[1]   = best_den[0];
                    best_pitch[1] = best_pitch[0];
                    slot = 0;
                }
                best_num[slot]   = num;
                best_den[slot]   = Syy;
                best_pitch[slot] = lag;
            }
        }

        /* Slide the energy window by one sample. */
        Syy += y[lag+len]*y[lag+len] - y[lag] * y[lag];
        Syy = FFMAX(1, Syy);
    }
}
952
953 static void pitch_search(const float *x_lp, float *y,
954 int len, int max_pitch, int *pitch)
955 {
956 int lag;
957 int best_pitch[2]={0,0};
958 int offset;
959
960 float x_lp4[WINDOW_SIZE];
961 float y_lp4[WINDOW_SIZE];
962 float xcorr[WINDOW_SIZE];
963
964 lag = len+max_pitch;
965
966 /* Downsample by 2 again */
967 for (int j = 0; j < len >> 2; j++)
968 x_lp4[j] = x_lp[2*j];
969 for (int j = 0; j < lag >> 2; j++)
970 y_lp4[j] = y[2*j];
971
972 /* Coarse search with 4x decimation */
973
974 celt_pitch_xcorr(x_lp4, y_lp4, xcorr, len>>2, max_pitch>>2);
975
976 find_best_pitch(xcorr, y_lp4, len>>2, max_pitch>>2, best_pitch);
977
978 /* Finer search with 2x decimation */
979 for (int i = 0; i < max_pitch >> 1; i++) {
980 float sum;
981 xcorr[i] = 0;
982 if (FFABS(i-2*best_pitch[0])>2 && FFABS(i-2*best_pitch[1])>2)
983 continue;
984 sum = celt_inner_prod(x_lp, y+i, len>>1);
985 xcorr[i] = FFMAX(-1, sum);
986 }
987
988 find_best_pitch(xcorr, y, len>>1, max_pitch>>1, best_pitch);
989
990 /* Refine by pseudo-interpolation */
991 if (best_pitch[0] > 0 && best_pitch[0] < (max_pitch >> 1) - 1) {
992 float a, b, c;
993
994 a = xcorr[best_pitch[0] - 1];
995 b = xcorr[best_pitch[0]];
996 c = xcorr[best_pitch[0] + 1];
997 if (c - a > .7f * (b - a))
998 offset = 1;
999 else if (a - c > .7f * (b-c))
1000 offset = -1;
1001 else
1002 offset = 0;
1003 } else {
1004 offset = 0;
1005 }
1006
1007 *pitch = 2 * best_pitch[0] - offset;
1008 }
1009
1010 static void dct(AudioRNNContext *s, float *out, const float *in)
1011 {
1012 for (int i = 0; i < NB_BANDS; i++) {
1013 float sum;
1014
1015 sum = s->fdsp->scalarproduct_float(in, s->dct_table[i], FFALIGN(NB_BANDS, 4));
1016 out[i] = sum * sqrtf(2.f / 22);
1017 }
1018 }
1019
1020 static int compute_frame_features(AudioRNNContext *s, DenoiseState *st, AVComplexFloat *X, AVComplexFloat *P,
1021 float *Ex, float *Ep, float *Exp, float *features, const float *in)
1022 {
1023 float E = 0;
1024 float *ceps_0, *ceps_1, *ceps_2;
1025 float spec_variability = 0;
1026 LOCAL_ALIGNED_32(float, Ly, [NB_BANDS]);
1027 LOCAL_ALIGNED_32(float, p, [WINDOW_SIZE]);
1028 float pitch_buf[PITCH_BUF_SIZE>>1];
1029 int pitch_index;
1030 float gain;
1031 float *(pre[1]);
1032 float tmp[NB_BANDS];
1033 float follow, logMax;
1034
1035 frame_analysis(s, st, X, Ex, in);
1036 RNN_MOVE(st->pitch_buf, &st->pitch_buf[FRAME_SIZE], PITCH_BUF_SIZE-FRAME_SIZE);
1037 RNN_COPY(&st->pitch_buf[PITCH_BUF_SIZE-FRAME_SIZE], in, FRAME_SIZE);
1038 pre[0] = &st->pitch_buf[0];
1039 pitch_downsample(pre, pitch_buf, PITCH_BUF_SIZE, 1);
1040 pitch_search(pitch_buf+(PITCH_MAX_PERIOD>>1), pitch_buf, PITCH_FRAME_SIZE,
1041 PITCH_MAX_PERIOD-3*PITCH_MIN_PERIOD, &pitch_index);
1042 pitch_index = PITCH_MAX_PERIOD-pitch_index;
1043
1044 gain = remove_doubling(pitch_buf, PITCH_MAX_PERIOD, PITCH_MIN_PERIOD,
1045 PITCH_FRAME_SIZE, &pitch_index, st->last_period, st->last_gain);
1046 st->last_period = pitch_index;
1047 st->last_gain = gain;
1048
1049 for (int i = 0; i < WINDOW_SIZE; i++)
1050 p[i] = st->pitch_buf[PITCH_BUF_SIZE-WINDOW_SIZE-pitch_index+i];
1051
1052 s->fdsp->vector_fmul(p, p, s->window, WINDOW_SIZE);
1053 forward_transform(st, P, p);
1054 compute_band_energy(Ep, P);
1055 compute_band_corr(Exp, X, P);
1056
1057 for (int i = 0; i < NB_BANDS; i++)
1058 Exp[i] = Exp[i] / sqrtf(.001f+Ex[i]*Ep[i]);
1059
1060 dct(s, tmp, Exp);
1061
1062 for (int i = 0; i < NB_DELTA_CEPS; i++)
1063 features[NB_BANDS+2*NB_DELTA_CEPS+i] = tmp[i];
1064
1065 features[NB_BANDS+2*NB_DELTA_CEPS] -= 1.3;
1066 features[NB_BANDS+2*NB_DELTA_CEPS+1] -= 0.9;
1067 features[NB_BANDS+3*NB_DELTA_CEPS] = .01*(pitch_index-300);
1068 logMax = -2;
1069 follow = -2;
1070
1071 for (int i = 0; i < NB_BANDS; i++) {
1072 Ly[i] = log10f(1e-2f + Ex[i]);
1073 Ly[i] = FFMAX(logMax-7, FFMAX(follow-1.5, Ly[i]));
1074 logMax = FFMAX(logMax, Ly[i]);
1075 follow = FFMAX(follow-1.5, Ly[i]);
1076 E += Ex[i];
1077 }
1078
1079 if (E < 0.04f) {
1080 /* If there's no audio, avoid messing up the state. */
1081 RNN_CLEAR(features, NB_FEATURES);
1082 return 1;
1083 }
1084
1085 dct(s, features, Ly);
1086 features[0] -= 12;
1087 features[1] -= 4;
1088 ceps_0 = st->cepstral_mem[st->memid];
1089 ceps_1 = (st->memid < 1) ? st->cepstral_mem[CEPS_MEM+st->memid-1] : st->cepstral_mem[st->memid-1];
1090 ceps_2 = (st->memid < 2) ? st->cepstral_mem[CEPS_MEM+st->memid-2] : st->cepstral_mem[st->memid-2];
1091
1092 for (int i = 0; i < NB_BANDS; i++)
1093 ceps_0[i] = features[i];
1094
1095 st->memid++;
1096 for (int i = 0; i < NB_DELTA_CEPS; i++) {
1097 features[i] = ceps_0[i] + ceps_1[i] + ceps_2[i];
1098 features[NB_BANDS+i] = ceps_0[i] - ceps_2[i];
1099 features[NB_BANDS+NB_DELTA_CEPS+i] = ceps_0[i] - 2*ceps_1[i] + ceps_2[i];
1100 }
1101 /* Spectral variability features. */
1102 if (st->memid == CEPS_MEM)
1103 st->memid = 0;
1104
1105 for (int i = 0; i < CEPS_MEM; i++) {
1106 float mindist = 1e15f;
1107 for (int j = 0; j < CEPS_MEM; j++) {
1108 float dist = 0.f;
1109 for (int k = 0; k < NB_BANDS; k++) {
1110 float tmp;
1111
1112 tmp = st->cepstral_mem[i][k] - st->cepstral_mem[j][k];
1113 dist += tmp*tmp;
1114 }
1115
1116 if (j != i)
1117 mindist = FFMIN(mindist, dist);
1118 }
1119
1120 spec_variability += mindist;
1121 }
1122
1123 features[NB_BANDS+3*NB_DELTA_CEPS+1] = spec_variability/CEPS_MEM-2.1;
1124
1125 return 0;
1126 }
1127
1128 static void interp_band_gain(float *g, const float *bandE)
1129 {
1130 memset(g, 0, sizeof(*g) * FREQ_SIZE);
1131
1132 for (int i = 0; i < NB_BANDS - 1; i++) {
1133 const int band_size = (eband5ms[i + 1] - eband5ms[i]) << FRAME_SIZE_SHIFT;
1134
1135 for (int j = 0; j < band_size; j++) {
1136 float frac = (float)j / band_size;
1137
1138 g[(eband5ms[i] << FRAME_SIZE_SHIFT) + j] = (1.f - frac) * bandE[i] + frac * bandE[i + 1];
1139 }
1140 }
1141 }
1142
1143 static void pitch_filter(AVComplexFloat *X, const AVComplexFloat *P, const float *Ex, const float *Ep,
1144 const float *Exp, const float *g)
1145 {
1146 float newE[NB_BANDS];
1147 float r[NB_BANDS];
1148 float norm[NB_BANDS];
1149 float rf[FREQ_SIZE] = {0};
1150 float normf[FREQ_SIZE]={0};
1151
1152 for (int i = 0; i < NB_BANDS; i++) {
1153 if (Exp[i]>g[i]) r[i] = 1;
1154 else r[i] = SQUARE(Exp[i])*(1-SQUARE(g[i]))/(.001 + SQUARE(g[i])*(1-SQUARE(Exp[i])));
1155 r[i] = sqrtf(av_clipf(r[i], 0, 1));
1156 r[i] *= sqrtf(Ex[i]/(1e-8+Ep[i]));
1157 }
1158 interp_band_gain(rf, r);
1159 for (int i = 0; i < FREQ_SIZE; i++) {
1160 X[i].re += rf[i]*P[i].re;
1161 X[i].im += rf[i]*P[i].im;
1162 }
1163 compute_band_energy(newE, X);
1164 for (int i = 0; i < NB_BANDS; i++) {
1165 norm[i] = sqrtf(Ex[i] / (1e-8+newE[i]));
1166 }
1167 interp_band_gain(normf, norm);
1168 for (int i = 0; i < FREQ_SIZE; i++) {
1169 X[i].re *= normf[i];
1170 X[i].im *= normf[i];
1171 }
1172 }
1173
/*
 * Lookup table read by tansig_approx(): that function rounds 25 * |x| to the
 * nearest integer i (0 .. 200 for |x| < 8) and uses entry i as the starting
 * value, so consecutive entries are 0.04 apart in x.
 * NOTE(review): the values look like tanh(0.04 * i) (entry 25 is 0.761594);
 * confirm against the table's generator before relying on that.
 */
1174 static const float tansig_table[201] = {
1175 0.000000f, 0.039979f, 0.079830f, 0.119427f, 0.158649f,
1176 0.197375f, 0.235496f, 0.272905f, 0.309507f, 0.345214f,
1177 0.379949f, 0.413644f, 0.446244f, 0.477700f, 0.507977f,
1178 0.537050f, 0.564900f, 0.591519f, 0.616909f, 0.641077f,
1179 0.664037f, 0.685809f, 0.706419f, 0.725897f, 0.744277f,
1180 0.761594f, 0.777888f, 0.793199f, 0.807569f, 0.821040f,
1181 0.833655f, 0.845456f, 0.856485f, 0.866784f, 0.876393f,
1182 0.885352f, 0.893698f, 0.901468f, 0.908698f, 0.915420f,
1183 0.921669f, 0.927473f, 0.932862f, 0.937863f, 0.942503f,
1184 0.946806f, 0.950795f, 0.954492f, 0.957917f, 0.961090f,
1185 0.964028f, 0.966747f, 0.969265f, 0.971594f, 0.973749f,
1186 0.975743f, 0.977587f, 0.979293f, 0.980869f, 0.982327f,
1187 0.983675f, 0.984921f, 0.986072f, 0.987136f, 0.988119f,
1188 0.989027f, 0.989867f, 0.990642f, 0.991359f, 0.992020f,
1189 0.992631f, 0.993196f, 0.993718f, 0.994199f, 0.994644f,
1190 0.995055f, 0.995434f, 0.995784f, 0.996108f, 0.996407f,
1191 0.996682f, 0.996937f, 0.997172f, 0.997389f, 0.997590f,
1192 0.997775f, 0.997946f, 0.998104f, 0.998249f, 0.998384f,
1193 0.998508f, 0.998623f, 0.998728f, 0.998826f, 0.998916f,
1194 0.999000f, 0.999076f, 0.999147f, 0.999213f, 0.999273f,
1195 0.999329f, 0.999381f, 0.999428f, 0.999472f, 0.999513f,
1196 0.999550f, 0.999585f, 0.999617f, 0.999646f, 0.999673f,
1197 0.999699f, 0.999722f, 0.999743f, 0.999763f, 0.999781f,
1198 0.999798f, 0.999813f, 0.999828f, 0.999841f, 0.999853f,
1199 0.999865f, 0.999875f, 0.999885f, 0.999893f, 0.999902f,
1200 0.999909f, 0.999916f, 0.999923f, 0.999929f, 0.999934f,
1201 0.999939f, 0.999944f, 0.999948f, 0.999952f, 0.999956f,
1202 0.999959f, 0.999962f, 0.999965f, 0.999968f, 0.999970f,
1203 0.999973f, 0.999975f, 0.999977f, 0.999978f, 0.999980f,
1204 0.999982f, 0.999983f, 0.999984f, 0.999986f, 0.999987f,
1205 0.999988f, 0.999989f, 0.999990f, 0.999990f, 0.999991f,
1206 0.999992f, 0.999992f, 0.999993f, 0.999994f, 0.999994f,
1207 0.999994f, 0.999995f, 0.999995f, 0.999996f, 0.999996f,
1208 0.999996f, 0.999997f, 0.999997f, 0.999997f, 0.999997f,
1209 0.999997f, 0.999998f, 0.999998f, 0.999998f, 0.999998f,
1210 0.999998f, 0.999998f, 0.999999f, 0.999999f, 0.999999f,
1211 0.999999f, 0.999999f, 0.999999f, 0.999999f, 0.999999f,
1212 0.999999f, 0.999999f, 0.999999f, 0.999999f, 0.999999f,
1213 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1214 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1215 1.000000f,
1216 };
1217
1218 static inline float tansig_approx(float x)
1219 {
1220 float y, dy;
1221 float sign=1;
1222 int i;
1223
1224 /* Tests are reversed to catch NaNs */
1225 if (!(x<8))
1226 return 1;
1227 if (!(x>-8))
1228 return -1;
1229 /* Another check in case of -ffast-math */
1230
1231 if (isnan(x))
1232 return 0;
1233
1234 if (x < 0) {
1235 x=-x;
1236 sign=-1;
1237 }
1238 i = (int)floor(.5f+25*x);
1239 x -= .04f*i;
1240 y = tansig_table[i];
1241 dy = 1-y*y;
1242 y = y + x*dy*(1 - y*x);
1243 return sign*y;
1244 }
1245
/**
 * Sigmoid approximation expressed through tansig_approx():
 * 0.5 + 0.5 * tansig_approx(x / 2).
 */
static inline float sigmoid_approx(float x)
{
    const float t = tansig_approx(.5f*x);

    return .5f + .5f*t;
}
1250
1251 static void compute_dense(const DenseLayer *layer, float *output, const float *input)
1252 {
1253 const int N = layer->nb_neurons, M = layer->nb_inputs, stride = N;
1254
1255 for (int i = 0; i < N; i++) {
1256 /* Compute update gate. */
1257 float sum = layer->bias[i];
1258
1259 for (int j = 0; j < M; j++)
1260 sum += layer->input_weights[j * stride + i] * input[j];
1261
1262 output[i] = WEIGHTS_SCALE * sum;
1263 }
1264
1265 if (layer->activation == ACTIVATION_SIGMOID) {
1266 for (int i = 0; i < N; i++)
1267 output[i] = sigmoid_approx(output[i]);
1268 } else if (layer->activation == ACTIVATION_TANH) {
1269 for (int i = 0; i < N; i++)
1270 output[i] = tansig_approx(output[i]);
1271 } else if (layer->activation == ACTIVATION_RELU) {
1272 for (int i = 0; i < N; i++)
1273 output[i] = FFMAX(0, output[i]);
1274 } else {
1275 av_assert0(0);
1276 }
1277 }
1278
1279 static void compute_gru(AudioRNNContext *s, const GRULayer *gru, float *state, const float *input)
1280 {
1281 LOCAL_ALIGNED_32(float, z, [MAX_NEURONS]);
1282 LOCAL_ALIGNED_32(float, r, [MAX_NEURONS]);
1283 LOCAL_ALIGNED_32(float, h, [MAX_NEURONS]);
1284 const int M = gru->nb_inputs;
1285 const int N = gru->nb_neurons;
1286 const int AN = FFALIGN(N, 4);
1287 const int AM = FFALIGN(M, 4);
1288 const int stride = 3 * AN, istride = 3 * AM;
1289
1290 for (int i = 0; i < N; i++) {
1291 /* Compute update gate. */
1292 float sum = gru->bias[i];
1293
1294 sum += s->fdsp->scalarproduct_float(gru->input_weights + i * istride, input, AM);
1295 sum += s->fdsp->scalarproduct_float(gru->recurrent_weights + i * stride, state, AN);
1296 z[i] = sigmoid_approx(WEIGHTS_SCALE * sum);
1297 }
1298
1299 for (int i = 0; i < N; i++) {
1300 /* Compute reset gate. */
1301 float sum = gru->bias[N + i];
1302
1303 sum += s->fdsp->scalarproduct_float(gru->input_weights + AM + i * istride, input, AM);
1304 sum += s->fdsp->scalarproduct_float(gru->recurrent_weights + AN + i * stride, state, AN);
1305 r[i] = sigmoid_approx(WEIGHTS_SCALE * sum);
1306 }
1307
1308 for (int i = 0; i < N; i++) {
1309 /* Compute output. */
1310 float sum = gru->bias[2 * N + i];
1311
1312 sum += s->fdsp->scalarproduct_float(gru->input_weights + 2 * AM + i * istride, input, AM);
1313 for (int j = 0; j < N; j++)
1314 sum += gru->recurrent_weights[2 * AN + i * stride + j] * state[j] * r[j];
1315
1316 if (gru->activation == ACTIVATION_SIGMOID)
1317 sum = sigmoid_approx(WEIGHTS_SCALE * sum);
1318 else if (gru->activation == ACTIVATION_TANH)
1319 sum = tansig_approx(WEIGHTS_SCALE * sum);
1320 else if (gru->activation == ACTIVATION_RELU)
1321 sum = FFMAX(0, WEIGHTS_SCALE * sum);
1322 else
1323 av_assert0(0);
1324 h[i] = z[i] * state[i] + (1.f - z[i]) * sum;
1325 }
1326
1327 RNN_COPY(state, h, N);
1328 }
1329
1330 #define INPUT_SIZE 42
1331
1332 static void compute_rnn(AudioRNNContext *s, RNNState *rnn, float *gains, float *vad, const float *input)
1333 {
1334 LOCAL_ALIGNED_32(float, dense_out, [MAX_NEURONS]);
1335 LOCAL_ALIGNED_32(float, noise_input, [MAX_NEURONS * 3]);
1336 LOCAL_ALIGNED_32(float, denoise_input, [MAX_NEURONS * 3]);
1337
1338 compute_dense(rnn->model->input_dense, dense_out, input);
1339 compute_gru(s, rnn->model->vad_gru, rnn->vad_gru_state, dense_out);
1340 compute_dense(rnn->model->vad_output, vad, rnn->vad_gru_state);
1341
1342 memcpy(noise_input, dense_out, rnn->model->input_dense_size * sizeof(float));
1343 memcpy(noise_input + rnn->model->input_dense_size,
1344 rnn->vad_gru_state, rnn->model->vad_gru_size * sizeof(float));
1345 memcpy(noise_input + rnn->model->input_dense_size + rnn->model->vad_gru_size,
1346 input, INPUT_SIZE * sizeof(float));
1347
1348 compute_gru(s, rnn->model->noise_gru, rnn->noise_gru_state, noise_input);
1349
1350 memcpy(denoise_input, rnn->vad_gru_state, rnn->model->vad_gru_size * sizeof(float));
1351 memcpy(denoise_input + rnn->model->vad_gru_size,
1352 rnn->noise_gru_state, rnn->model->noise_gru_size * sizeof(float));
1353 memcpy(denoise_input + rnn->model->vad_gru_size + rnn->model->noise_gru_size,
1354 input, INPUT_SIZE * sizeof(float));
1355
1356 compute_gru(s, rnn->model->denoise_gru, rnn->denoise_gru_state, denoise_input);
1357 compute_dense(rnn->model->denoise_output, gains, rnn->denoise_gru_state);
1358 }
1359
1360 static float rnnoise_channel(AudioRNNContext *s, DenoiseState *st, float *out, const float *in,
1361 int disabled)
1362 {
1363 AVComplexFloat X[FREQ_SIZE];
1364 AVComplexFloat P[WINDOW_SIZE];
1365 float x[FRAME_SIZE];
1366 float Ex[NB_BANDS], Ep[NB_BANDS];
1367 LOCAL_ALIGNED_32(float, Exp, [NB_BANDS]);
1368 float features[NB_FEATURES];
1369 float g[NB_BANDS];
1370 float gf[FREQ_SIZE];
1371 float vad_prob = 0;
1372 float *history = st->history;
1373 static const float a_hp[2] = {-1.99599, 0.99600};
1374 static const float b_hp[2] = {-2, 1};
1375 int silence;
1376
1377 biquad(x, st->mem_hp_x, in, b_hp, a_hp, FRAME_SIZE);
1378 silence = compute_frame_features(s, st, X, P, Ex, Ep, Exp, features, x);
1379
1380 if (!silence && !disabled) {
1381 compute_rnn(s, &st->rnn[0], g, &vad_prob, features);
1382 pitch_filter(X, P, Ex, Ep, Exp, g);
1383 for (int i = 0; i < NB_BANDS; i++) {
1384 float alpha = .6f;
1385
1386 g[i] = FFMAX(g[i], alpha * st->lastg[i]);
1387 st->lastg[i] = g[i];
1388 }
1389
1390 interp_band_gain(gf, g);
1391
1392 for (int i = 0; i < FREQ_SIZE; i++) {
1393 X[i].re *= gf[i];
1394 X[i].im *= gf[i];
1395 }
1396 }
1397
1398 frame_synthesis(s, st, out, X);
1399 memcpy(history, in, FRAME_SIZE * sizeof(*history));
1400
1401 return vad_prob;
1402 }
1403
1404 typedef struct ThreadData {
1405 AVFrame *in, *out;
1406 } ThreadData;
1407
1408 static int rnnoise_channels(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
1409 {
1410 AudioRNNContext *s = ctx->priv;
1411 ThreadData *td = arg;
1412 AVFrame *in = td->in;
1413 AVFrame *out = td->out;
1414 const int start = (out->ch_layout.nb_channels * jobnr) / nb_jobs;
1415 const int end = (out->ch_layout.nb_channels * (jobnr+1)) / nb_jobs;
1416
1417 for (int ch = start; ch < end; ch++) {
1418 rnnoise_channel(s, &s->st[ch],
1419 (float *)out->extended_data[ch],
1420 (const float *)in->extended_data[ch],
1421 ctx->is_disabled);
1422 }
1423
1424 return 0;
1425 }
1426
1427 static int filter_frame(AVFilterLink *inlink, AVFrame *in)
1428 {
1429 AVFilterContext *ctx = inlink->dst;
1430 AVFilterLink *outlink = ctx->outputs[0];
1431 AVFrame *out = NULL;
1432 ThreadData td;
1433
1434 out = ff_get_audio_buffer(outlink, FRAME_SIZE);
1435 if (!out) {
1436 av_frame_free(&in);
1437 return AVERROR(ENOMEM);
1438 }
1439 av_frame_copy_props(out, in);
1440
1441 td.in = in; td.out = out;
1442 ff_filter_execute(ctx, rnnoise_channels, &td, NULL,
1443 FFMIN(outlink->ch_layout.nb_channels, ff_filter_get_nb_threads(ctx)));
1444
1445 av_frame_free(&in);
1446 return ff_filter_frame(outlink, out);
1447 }
1448
1449 static int activate(AVFilterContext *ctx)
1450 {
1451 AVFilterLink *inlink = ctx->inputs[0];
1452 AVFilterLink *outlink = ctx->outputs[0];
1453 AVFrame *in = NULL;
1454 int ret;
1455
1456 FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);
1457
1458 ret = ff_inlink_consume_samples(inlink, FRAME_SIZE, FRAME_SIZE, &in);
1459 if (ret < 0)
1460 return ret;
1461
1462 if (ret > 0)
1463 return filter_frame(inlink, in);
1464
1465 FF_FILTER_FORWARD_STATUS(inlink, outlink);
1466 FF_FILTER_FORWARD_WANTED(outlink, inlink);
1467
1468 return FFERROR_NOT_READY;
1469 }
1470
1471 static int open_model(AVFilterContext *ctx, RNNModel **model)
1472 {
1473 AudioRNNContext *s = ctx->priv;
1474 int ret;
1475 FILE *f;
1476
1477 if (!s->model_name)
1478 return AVERROR(EINVAL);
1479 f = avpriv_fopen_utf8(s->model_name, "r");
1480 if (!f) {
1481 av_log(ctx, AV_LOG_ERROR, "Failed to open model file: %s\n", s->model_name);
1482 return AVERROR(EINVAL);
1483 }
1484
1485 ret = rnnoise_model_from_file(f, model);
1486 fclose(f);
1487 if (!*model || ret < 0)
1488 return ret;
1489
1490 return 0;
1491 }
1492
1493 static av_cold int init(AVFilterContext *ctx)
1494 {
1495 AudioRNNContext *s = ctx->priv;
1496 int ret;
1497
1498 s->fdsp = avpriv_float_dsp_alloc(0);
1499 if (!s->fdsp)
1500 return AVERROR(ENOMEM);
1501
1502 ret = open_model(ctx, &s->model[0]);
1503 if (ret < 0)
1504 return ret;
1505
1506 for (int i = 0; i < FRAME_SIZE; i++) {
1507 s->window[i] = sin(.5*M_PI*sin(.5*M_PI*(i+.5)/FRAME_SIZE) * sin(.5*M_PI*(i+.5)/FRAME_SIZE));
1508 s->window[WINDOW_SIZE - 1 - i] = s->window[i];
1509 }
1510
1511 for (int i = 0; i < NB_BANDS; i++) {
1512 for (int j = 0; j < NB_BANDS; j++) {
1513 s->dct_table[j][i] = cosf((i + .5f) * j * M_PI / NB_BANDS);
1514 if (j == 0)
1515 s->dct_table[j][i] *= sqrtf(.5);
1516 }
1517 }
1518
1519 return 0;
1520 }
1521
1522 static void free_model(AVFilterContext *ctx, int n)
1523 {
1524 AudioRNNContext *s = ctx->priv;
1525
1526 rnnoise_model_free(s->model[n]);
1527 s->model[n] = NULL;
1528
1529 for (int ch = 0; ch < s->channels && s->st; ch++) {
1530 av_freep(&s->st[ch].rnn[n].vad_gru_state);
1531 av_freep(&s->st[ch].rnn[n].noise_gru_state);
1532 av_freep(&s->st[ch].rnn[n].denoise_gru_state);
1533 }
1534 }
1535
1536 static int process_command(AVFilterContext *ctx, const char *cmd, const char *args,
1537 char *res, int res_len, int flags)
1538 {
1539 AudioRNNContext *s = ctx->priv;
1540 int ret;
1541
1542 ret = ff_filter_process_command(ctx, cmd, args, res, res_len, flags);
1543 if (ret < 0)
1544 return ret;
1545
1546 ret = open_model(ctx, &s->model[1]);
1547 if (ret < 0)
1548 return ret;
1549
1550 FFSWAP(RNNModel *, s->model[0], s->model[1]);
1551 for (int ch = 0; ch < s->channels; ch++)
1552 FFSWAP(RNNState, s->st[ch].rnn[0], s->st[ch].rnn[1]);
1553
1554 ret = config_input(ctx->inputs[0]);
1555 if (ret < 0) {
1556 for (int ch = 0; ch < s->channels; ch++)
1557 FFSWAP(RNNState, s->st[ch].rnn[0], s->st[ch].rnn[1]);
1558 FFSWAP(RNNModel *, s->model[0], s->model[1]);
1559 return ret;
1560 }
1561
1562 free_model(ctx, 1);
1563 return 0;
1564 }
1565
1566 static av_cold void uninit(AVFilterContext *ctx)
1567 {
1568 AudioRNNContext *s = ctx->priv;
1569
1570 av_freep(&s->fdsp);
1571 free_model(ctx, 0);
1572 for (int ch = 0; ch < s->channels && s->st; ch++) {
1573 av_tx_uninit(&s->st[ch].tx);
1574 av_tx_uninit(&s->st[ch].txi);
1575 }
1576 av_freep(&s->st);
1577 }
1578
/* Single audio input pad named "default"; config_input() is its config_props callback. */
1579 static const AVFilterPad inputs[] = {
1580 {
1581 .name = "default",
1582 .type = AVMEDIA_TYPE_AUDIO,
1583 .config_props = config_input,
1584 },
1585 };
1586
/* OFFSET(x): byte offset of field x inside AudioRNNContext, used by the option table. */
1587 #define OFFSET(x) offsetof(AudioRNNContext, x)
/* AF: flag set shared by all options below (audio, filtering, runtime-changeable). */
1588 #define AF AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_RUNTIME_PARAM
1589
/*
 * Filter options:
 *   model / m - string stored in model_name (both names write the same field),
 *               no default
 *   mix       - float stored in mix, default 1.0, allowed range [-1, 1]
 * The table is terminated by the { NULL } entry.
 */
1590 static const AVOption arnndn_options[] = {
1591 { "model", "set model name", OFFSET(model_name), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, AF },
1592 { "m", "set model name", OFFSET(model_name), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, AF },
1593 { "mix", "set output vs input mix", OFFSET(mix), AV_OPT_TYPE_FLOAT, {.dbl=1.0},-1, 1, AF },
1594 { NULL }
1595 };
1596
/* Defines arnndn_class, referenced as .p.priv_class in the filter definition. */
1597 AVFILTER_DEFINE_CLASS(arnndn);
1598
/*
 * Filter definition for "arnndn". It wires up the callbacks defined in this
 * file (activate, init, uninit, process_command), the single input pad from
 * inputs[], the default audio output pad and query_formats. The flags
 * advertise slice threading and filter-internal timeline handling.
 */
1599 const FFFilter ff_af_arnndn = {
1600 .p.name = "arnndn",
1601 .p.description = NULL_IF_CONFIG_SMALL("Reduce noise from speech using Recurrent Neural Networks."),
1602 .p.priv_class = &arnndn_class,
1603 .p.flags = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL |
1604 AVFILTER_FLAG_SLICE_THREADS,
1605 .priv_size = sizeof(AudioRNNContext),
1606 .activate = activate,
1607 .init = init,
1608 .uninit = uninit,
1609 FILTER_INPUTS(inputs),
1610 FILTER_OUTPUTS(ff_audio_default_filterpad),
1611 FILTER_QUERY_FUNC2(query_formats),
1612 .process_command = process_command,
1613 };
1614