GCC Code Coverage Report
Directory: ../../../ffmpeg/
File:      src/libavcodec/wmavoice.c
Date:      2020-09-25 23:16:12

            Exec   Total   Coverage
Lines:       664     752     88.3 %
Branches:    301     382     78.8 %

Line Branch Exec Source
1
/*
2
 * Windows Media Audio Voice decoder.
3
 * Copyright (c) 2009 Ronald S. Bultje
4
 *
5
 * This file is part of FFmpeg.
6
 *
7
 * FFmpeg is free software; you can redistribute it and/or
8
 * modify it under the terms of the GNU Lesser General Public
9
 * License as published by the Free Software Foundation; either
10
 * version 2.1 of the License, or (at your option) any later version.
11
 *
12
 * FFmpeg is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
 * Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public
18
 * License along with FFmpeg; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
 */
21
22
/**
23
 * @file
24
 * @brief Windows Media Audio Voice compatible decoder
25
 * @author Ronald S. Bultje <rsbultje@gmail.com>
26
 */
27
28
#include <math.h>
29
30
#include "libavutil/channel_layout.h"
31
#include "libavutil/float_dsp.h"
32
#include "libavutil/mem.h"
33
#include "libavutil/thread.h"
34
#include "avcodec.h"
35
#include "internal.h"
36
#include "get_bits.h"
37
#include "put_bits.h"
38
#include "wmavoice_data.h"
39
#include "celp_filters.h"
40
#include "acelp_vectors.h"
41
#include "acelp_filters.h"
42
#include "lsp.h"
43
#include "dct.h"
44
#include "rdft.h"
45
#include "sinewin.h"
46
47
#define MAX_BLOCKS           8   ///< maximum number of blocks per frame
48
#define MAX_LSPS             16  ///< maximum filter order
49
#define MAX_LSPS_ALIGN16     16  ///< same as #MAX_LSPS; needs to be a multiple
50
                                 ///< of 16 for ASM input buffer alignment
51
#define MAX_FRAMES           3   ///< maximum number of frames per superframe
52
#define MAX_FRAMESIZE        160 ///< maximum number of samples per frame
53
#define MAX_SIGNAL_HISTORY   416 ///< maximum excitation signal history
54
#define MAX_SFRAMESIZE       (MAX_FRAMESIZE * MAX_FRAMES)
55
                                 ///< maximum number of samples per superframe
56
#define SFRAME_CACHE_MAXSIZE 256 ///< maximum cache size for frame data that
57
                                 ///< was split over two packets
58
#define VLC_NBITS            6   ///< number of bits to read per VLC iteration
59
60
/**
61
 * Frame type VLC coding.
62
 */
63
static VLC frame_type_vlc;
64
65
/**
66
 * Adaptive codebook types.
67
 */
68
enum {
69
    ACB_TYPE_NONE       = 0, ///< no adaptive codebook (only hardcoded fixed)
70
    ACB_TYPE_ASYMMETRIC = 1, ///< adaptive codebook with per-frame pitch, which
71
                             ///< we interpolate to get a per-sample pitch.
72
                             ///< Signal is generated using an asymmetric sinc
73
                             ///< window function
74
                             ///< @note see #wmavoice_ipol1_coeffs
75
    ACB_TYPE_HAMMING    = 2  ///< Per-block pitch with signal generation using
76
                             ///< a Hamming sinc window function
77
                             ///< @note see #wmavoice_ipol2_coeffs
78
};
79
80
/**
81
 * Fixed codebook types.
82
 */
83
enum {
84
    FCB_TYPE_SILENCE    = 0, ///< comfort noise during silence
85
                             ///< generated from a hardcoded (fixed) codebook
86
                             ///< with per-frame (low) gain values
87
    FCB_TYPE_HARDCODED  = 1, ///< hardcoded (fixed) codebook with per-block
88
                             ///< gain values
89
    FCB_TYPE_AW_PULSES  = 2, ///< Pitch-adaptive window (AW) pulse signals,
90
                             ///< used in particular for low-bitrate streams
91
    FCB_TYPE_EXC_PULSES = 3, ///< Innovation (fixed) codebook pulse sets in
92
                             ///< combinations of either single pulses or
93
                             ///< pulse pairs
94
};
95
96
/**
97
 * Description of frame types.
98
 */
99
static const struct frame_type_desc {
100
    uint8_t n_blocks;     ///< number of blocks per frame (each block
101
                          ///< contains 160/#n_blocks samples)
102
    uint8_t log_n_blocks; ///< log2(#n_blocks)
103
    uint8_t acb_type;     ///< Adaptive codebook type (ACB_TYPE_*)
104
    uint8_t fcb_type;     ///< Fixed codebook type (FCB_TYPE_*)
105
    uint8_t dbl_pulses;   ///< how many pulse vectors have pulse pairs
106
                          ///< (rather than just one single pulse)
107
                          ///< only if #fcb_type == #FCB_TYPE_EXC_PULSES
108
} frame_descs[17] = {
109
    { 1, 0, ACB_TYPE_NONE,       FCB_TYPE_SILENCE,    0 },
110
    { 2, 1, ACB_TYPE_NONE,       FCB_TYPE_HARDCODED,  0 },
111
    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES,  0 },
112
    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2 },
113
    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5 },
114
    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0 },
115
    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2 },
116
    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5 },
117
    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0 },
118
    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2 },
119
    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5 },
120
    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0 },
121
    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2 },
122
    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5 },
123
    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0 },
124
    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2 },
125
    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5 }
126
};
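For example, per the field descriptions above, frame_descs[3] = { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2 } describes a frame split into 2 blocks of 160/2 = 80 samples each, using a per-frame pitch interpolated to per-sample values with an asymmetric sinc window, and a fixed codebook of excitation pulse sets in which 2 of the pulse vectors carry pulse pairs rather than single pulses.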
127
128
/**
129
 * WMA Voice decoding context.
130
 */
131
typedef struct WMAVoiceContext {
132
    /**
133
     * @name Global values specified in the stream header / extradata or used all over.
134
     * @{
135
     */
136
    GetBitContext gb;             ///< packet bitreader. During decoder init,
137
                                  ///< it contains the extradata from the
138
                                  ///< demuxer. During decoding, it contains
139
                                  ///< packet data.
140
    int8_t vbm_tree[25];          ///< converts VLC codes to frame type
141
142
    int spillover_bitsize;        ///< number of bits used to specify
143
                                  ///< #spillover_nbits in the packet header
144
                                  ///< = ceil(log2(ctx->block_align << 3))
145
    int history_nsamples;         ///< number of samples in history for signal
146
                                  ///< prediction (through ACB)
147
148
    /* postfilter specific values */
149
    int do_apf;                   ///< whether to apply the averaged
150
                                  ///< projection filter (APF)
151
    int denoise_strength;         ///< strength of denoising in Wiener filter
152
                                  ///< [0-11]
153
    int denoise_tilt_corr;        ///< Whether to apply tilt correction to the
154
                                  ///< Wiener filter coefficients (postfilter)
155
    int dc_level;                 ///< Predicted amount of DC noise, based
156
                                  ///< on which a DC removal filter is used
157
158
    int lsps;                     ///< number of LSPs per frame [10 or 16]
159
    int lsp_q_mode;               ///< defines quantizer defaults [0, 1]
160
    int lsp_def_mode;             ///< defines different sets of LSP defaults
161
                                  ///< [0, 1]
162
163
    int min_pitch_val;            ///< base value for pitch parsing code
164
    int max_pitch_val;            ///< max value + 1 for pitch parsing
165
    int pitch_nbits;              ///< number of bits used to specify the
166
                                  ///< pitch value in the frame header
167
    int block_pitch_nbits;        ///< number of bits used to specify the
168
                                  ///< first block's pitch value
169
    int block_pitch_range;        ///< range of the block pitch
170
    int block_delta_pitch_nbits;  ///< number of bits used to specify the
171
                                  ///< delta pitch between this and the last
172
                                  ///< block's pitch value, used in all but
173
                                  ///< first block
174
    int block_delta_pitch_hrange; ///< 1/2 range of the delta (full range is
175
                                  ///< from -this to +this-1)
176
    uint16_t block_conv_table[4]; ///< boundaries for block pitch unit/scale
177
                                  ///< conversion
178
179
    /**
180
     * @}
181
     *
182
     * @name Packet values specified in the packet header or related to a packet.
183
     *
184
     * A packet is considered to be a single unit of data provided to this
185
     * decoder by the demuxer.
186
     * @{
187
     */
188
    int spillover_nbits;          ///< number of bits of the previous packet's
189
                                  ///< last superframe preceding this
190
                                  ///< packet's first full superframe (useful
191
                                  ///< for re-synchronization also)
192
    int has_residual_lsps;        ///< if set, superframes contain one set of
193
                                  ///< LSPs that cover all frames, encoded as
194
                                  ///< independent and residual LSPs; if not
195
                                  ///< set, each frame contains its own, fully
196
                                  ///< independent, LSPs
197
    int skip_bits_next;           ///< number of bits to skip at the next call
198
                                  ///< to #wmavoice_decode_packet() (since
199
                                  ///< they're part of the previous superframe)
200
201
    uint8_t sframe_cache[SFRAME_CACHE_MAXSIZE + AV_INPUT_BUFFER_PADDING_SIZE];
202
                                  ///< cache for superframe data split over
203
                                  ///< multiple packets
204
    int sframe_cache_size;        ///< set to >0 if we have data from an
205
                                  ///< (incomplete) superframe from a previous
206
                                  ///< packet that spilled over in the current
207
                                  ///< packet; specifies the amount of bits in
208
                                  ///< #sframe_cache
209
    PutBitContext pb;             ///< bitstream writer for #sframe_cache
210
211
    /**
212
     * @}
213
     *
214
     * @name Frame and superframe values
215
     * Superframe and frame data - these can change from frame to frame,
216
     * although some of them also serve as a cache / history for
217
     * the next frame or superframe.
218
     * @{
219
     */
220
    double prev_lsps[MAX_LSPS];   ///< LSPs of the last frame of the previous
221
                                  ///< superframe
222
    int last_pitch_val;           ///< pitch value of the previous frame
223
    int last_acb_type;            ///< frame type [0-2] of the previous frame
224
    int pitch_diff_sh16;          ///< ((cur_pitch_val - #last_pitch_val)
225
                                  ///< << 16) / #MAX_FRAMESIZE
226
    float silence_gain;           ///< set for use in blocks if #ACB_TYPE_NONE
227
228
    int aw_idx_is_ext;            ///< whether the AW index was encoded in
229
                                  ///< 8 bits (instead of 6)
230
    int aw_pulse_range;           ///< the range over which #aw_pulse_set1()
231
                                  ///< can apply the pulse, relative to the
232
                                  ///< value in aw_first_pulse_off. The exact
233
                                  ///< position of the first AW-pulse is within
234
                                  ///< [pulse_off, pulse_off + this], and
235
                                  ///< depends on bitstream values; [16 or 24]
236
    int aw_n_pulses[2];           ///< number of AW-pulses in each block; note
237
                                  ///< that this number can be negative (in
238
                                  ///< which case it basically means "zero")
239
    int aw_first_pulse_off[2];    ///< index of first sample to which to
240
                                  ///< apply AW-pulses, or -0xff if unset
241
    int aw_next_pulse_off_cache;  ///< the position (relative to start of the
242
                                  ///< second block) at which pulses should
243
                                  ///< start to be positioned, serves as a
244
                                  ///< cache for pitch-adaptive window pulses
245
                                  ///< between blocks
246
247
    int frame_cntr;               ///< current frame index [0 - 0xFFFE]; is
248
                                  ///< only used for comfort noise in #pRNG()
249
    int nb_superframes;           ///< number of superframes in current packet
250
    float gain_pred_err[6];       ///< cache for gain prediction
251
    float excitation_history[MAX_SIGNAL_HISTORY];
252
                                  ///< cache of the signal of previous
253
                                  ///< superframes, used as a history for
254
                                  ///< signal generation
255
    float synth_history[MAX_LSPS]; ///< see #excitation_history
256
    /**
257
     * @}
258
     *
259
     * @name Postfilter values
260
     *
261
     * Variables used for postfilter implementation, mostly history for
262
     * smoothing and so on, and context variables for FFT/iFFT.
263
     * @{
264
     */
265
    RDFTContext rdft, irdft;      ///< contexts for FFT-calculation in the
266
                                  ///< postfilter (for denoise filter)
267
    DCTContext dct, dst;          ///< contexts for phase shift (in Hilbert
268
                                  ///< transform, part of postfilter)
269
    float sin[511], cos[511];     ///< 8-bit cosine/sine windows over [-pi,pi]
270
                                  ///< range
271
    float postfilter_agc;         ///< gain control memory, used in
272
                                  ///< #adaptive_gain_control()
273
    float dcf_mem[2];             ///< DC filter history
274
    float zero_exc_pf[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];
275
                                  ///< zero filter output (i.e. excitation)
276
                                  ///< by postfilter
277
    float denoise_filter_cache[MAX_FRAMESIZE];
278
    int   denoise_filter_cache_size; ///< samples in #denoise_filter_cache
279
    DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80];
280
                                  ///< aligned buffer for LPC tilting
281
    DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x80];
282
                                  ///< aligned buffer for denoise coefficients
283
    DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
284
                                  ///< aligned buffer for postfilter speech
285
                                  ///< synthesis
286
    /**
287
     * @}
288
     */
289
} WMAVoiceContext;
290
291
/**
292
 * Set up the variable bit mode (VBM) tree from container extradata.
293
 * @param gb bit I/O context.
294
 *           The bit context (s->gb) should be loaded with byte 23-46 of the
295
 *           container extradata (i.e. the ones containing the VBM tree).
296
 * @param vbm_tree pointer to array to which the decoded VBM tree will be
297
 *                 written.
298
 * @return 0 on success, <0 on error.
299
 */
300
8
static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
301
{
302
8
    int cntr[8] = { 0 }, n, res;
303
304
8
    memset(vbm_tree, 0xff, sizeof(vbm_tree[0]) * 25);
305
144
    for (n = 0; n < 17; n++) {
306
136
        res = get_bits(gb, 3);
307
136
        if (cntr[res] > 3) // should be >= 3 + (res == 7))
308
            return -1;
309
136
        vbm_tree[res * 3 + cntr[res]++] = n;
310
    }
311
8
    return 0;
312
}
313
314
5
static av_cold void wmavoice_init_static_data(void)
315
{
316
    static const uint8_t bits[] = {
317
         2,  2,  2,  4,  4,  4,
318
         6,  6,  6,  8,  8,  8,
319
        10, 10, 10, 12, 12, 12,
320
        14, 14, 14, 14
321
    };
322
    static const uint16_t codes[] = {
323
          0x0000, 0x0001, 0x0002,        //              00/01/10
324
          0x000c, 0x000d, 0x000e,        //           11+00/01/10
325
          0x003c, 0x003d, 0x003e,        //         1111+00/01/10
326
          0x00fc, 0x00fd, 0x00fe,        //       111111+00/01/10
327
          0x03fc, 0x03fd, 0x03fe,        //     11111111+00/01/10
328
          0x0ffc, 0x0ffd, 0x0ffe,        //   1111111111+00/01/10
329
          0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx
330
    };
331
332
5
    INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),
333
                    bits, 1, 1, codes, 2, 2, 132);
334
5
}
335
336
static av_cold void wmavoice_flush(AVCodecContext *ctx)
337
{
338
    WMAVoiceContext *s = ctx->priv_data;
339
    int n;
340
341
    s->postfilter_agc    = 0;
342
    s->sframe_cache_size = 0;
343
    s->skip_bits_next    = 0;
344
    for (n = 0; n < s->lsps; n++)
345
        s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
346
    memset(s->excitation_history, 0,
347
           sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
348
    memset(s->synth_history,      0,
349
           sizeof(*s->synth_history)      * MAX_LSPS);
350
    memset(s->gain_pred_err,      0,
351
           sizeof(s->gain_pred_err));
352
353
    if (s->do_apf) {
354
        memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
355
               sizeof(*s->synth_filter_out_buf) * s->lsps);
356
        memset(s->dcf_mem,              0,
357
               sizeof(*s->dcf_mem)              * 2);
358
        memset(s->zero_exc_pf,          0,
359
               sizeof(*s->zero_exc_pf)          * s->history_nsamples);
360
        memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
361
    }
362
}
363
364
/**
365
 * Set up decoder with parameters from demuxer (extradata etc.).
366
 */
367
8
static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
368
{
369
    static AVOnce init_static_once = AV_ONCE_INIT;
370
    int n, flags, pitch_range, lsp16_flag;
371
8
    WMAVoiceContext *s = ctx->priv_data;
372
373
8
    ff_thread_once(&init_static_once, wmavoice_init_static_data);
374
375
    /**
376
     * Extradata layout:
377
     * - byte  0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
378
     * - byte 19-22: flags field (annoyingly in LE; see below for known
379
     *               values),
380
     * - byte 23-46: variable bitmode tree (really just 17 * 3 bits,
381
     *               rest is 0).
382
     */
383
8
    if (ctx->extradata_size != 46) {
384
        av_log(ctx, AV_LOG_ERROR,
385
               "Invalid extradata size %d (should be 46)\n",
386
               ctx->extradata_size);
387
        return AVERROR_INVALIDDATA;
388
    }
389

8
    if (ctx->block_align <= 0 || ctx->block_align > (1<<22)) {
390
        av_log(ctx, AV_LOG_ERROR, "Invalid block alignment %d.\n", ctx->block_align);
391
        return AVERROR_INVALIDDATA;
392
    }
393
394
8
    flags                = AV_RL32(ctx->extradata + 18);
395
8
    s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
396
8
    s->do_apf            =    flags & 0x1;
397
8
    if (s->do_apf) {
398
8
        ff_rdft_init(&s->rdft,  7, DFT_R2C);
399
8
        ff_rdft_init(&s->irdft, 7, IDFT_C2R);
400
8
        ff_dct_init(&s->dct,  6, DCT_I);
401
8
        ff_dct_init(&s->dst,  6, DST_I);
402
403
8
        ff_sine_window_init(s->cos, 256);
404
8
        memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0]));
405
2048
        for (n = 0; n < 255; n++) {
406
2040
            s->sin[n]       = -s->sin[510 - n];
407
2040
            s->cos[510 - n] =  s->cos[n];
408
        }
409
    }
410
8
    s->denoise_strength  =   (flags >> 2) & 0xF;
411
8
    if (s->denoise_strength >= 12) {
412
        av_log(ctx, AV_LOG_ERROR,
413
               "Invalid denoise filter strength %d (max=11)\n",
414
               s->denoise_strength);
415
        return AVERROR_INVALIDDATA;
416
    }
417
8
    s->denoise_tilt_corr = !!(flags & 0x40);
418
8
    s->dc_level          =   (flags >> 7) & 0xF;
419
8
    s->lsp_q_mode        = !!(flags & 0x2000);
420
8
    s->lsp_def_mode      = !!(flags & 0x4000);
421
8
    lsp16_flag           =    flags & 0x1000;
422
8
    if (lsp16_flag) {
423
4
        s->lsps               = 16;
424
    } else {
425
4
        s->lsps               = 10;
426
    }
427
112
    for (n = 0; n < s->lsps; n++)
428
104
        s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
429
430
8
    init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
431
8
    if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
432
        av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
433
        return AVERROR_INVALIDDATA;
434
    }
435
436
8
    if (ctx->sample_rate >= INT_MAX / (256 * 37))
437
        return AVERROR_INVALIDDATA;
438
439
8
    s->min_pitch_val    = ((ctx->sample_rate << 8)      /  400 + 50) >> 8;
440
8
    s->max_pitch_val    = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
441
8
    pitch_range         = s->max_pitch_val - s->min_pitch_val;
442
8
    if (pitch_range <= 0) {
443
        av_log(ctx, AV_LOG_ERROR, "Invalid pitch range; broken extradata?\n");
444
        return AVERROR_INVALIDDATA;
445
    }
446
8
    s->pitch_nbits      = av_ceil_log2(pitch_range);
447
8
    s->last_pitch_val   = 40;
448
8
    s->last_acb_type    = ACB_TYPE_NONE;
449
8
    s->history_nsamples = s->max_pitch_val + 8;
450
451

8
    if (s->min_pitch_val < 1 || s->history_nsamples > MAX_SIGNAL_HISTORY) {
452
        int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
453
            max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;
454
455
        av_log(ctx, AV_LOG_ERROR,
456
               "Unsupported samplerate %d (min=%d, max=%d)\n",
457
               ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz
458
459
        return AVERROR(ENOSYS);
460
    }
461
462
8
    s->block_conv_table[0]      = s->min_pitch_val;
463
8
    s->block_conv_table[1]      = (pitch_range * 25) >> 6;
464
8
    s->block_conv_table[2]      = (pitch_range * 44) >> 6;
465
8
    s->block_conv_table[3]      = s->max_pitch_val - 1;
466
8
    s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
467
8
    if (s->block_delta_pitch_hrange <= 0) {
468
        av_log(ctx, AV_LOG_ERROR, "Invalid delta pitch hrange; broken extradata?\n");
469
        return AVERROR_INVALIDDATA;
470
    }
471
8
    s->block_delta_pitch_nbits  = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
472
8
    s->block_pitch_range        = s->block_conv_table[2] +
473
8
                                  s->block_conv_table[3] + 1 +
474
8
                                  2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
475
8
    s->block_pitch_nbits        = av_ceil_log2(s->block_pitch_range);
476
477
8
    ctx->channels               = 1;
478
8
    ctx->channel_layout         = AV_CH_LAYOUT_MONO;
479
8
    ctx->sample_fmt             = AV_SAMPLE_FMT_FLT;
480
481
8
    return 0;
482
}
483
484
/**
485
 * @name Postfilter functions
486
 * Postfilter functions (gain control, Wiener denoise filter, DC filter,
487
 * Kalman smoothing, plus surrounding code to wrap it)
488
 * @{
489
 */
490
/**
491
 * Adaptive gain control (as used in postfilter).
492
 *
493
 * Identical to #ff_adaptive_gain_control() in acelp_vectors.c, except
494
 * that the energy here is calculated using sum(abs(...)), whereas the
495
 * other codecs (e.g. AMR-NB, SIPRO) use sqrt(dotproduct(...)).
496
 *
497
 * @param out output buffer for filtered samples
498
 * @param in input buffer containing the samples as they are after the
499
 *           postfilter steps so far
500
 * @param speech_synth input buffer containing speech synth before postfilter
501
 * @param size input buffer size
502
 * @param alpha exponential filter factor
503
 * @param gain_mem pointer to filter memory (single float)
504
 */
505
6612
static void adaptive_gain_control(float *out, const float *in,
506
                                  const float *speech_synth,
507
                                  int size, float alpha, float *gain_mem)
508
{
509
    int i;
510
6612
    float speech_energy = 0.0, postfilter_energy = 0.0, gain_scale_factor;
511
6612
    float mem = *gain_mem;
512
513
535572
    for (i = 0; i < size; i++) {
514
528960
        speech_energy     += fabsf(speech_synth[i]);
515
528960
        postfilter_energy += fabsf(in[i]);
516
    }
517
6612
    gain_scale_factor = postfilter_energy == 0.0 ? 0.0 :
518
6612
                        (1.0 - alpha) * speech_energy / postfilter_energy;
519
520
535572
    for (i = 0; i < size; i++) {
521
528960
        mem = alpha * mem + gain_scale_factor;
522
528960
        out[i] = in[i] * mem;
523
    }
524
525
6612
    *gain_mem = mem;
526
6612
}
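Read as a worked equation (a reading of the loop above, not a comment from the original source): gain_scale_factor = (1 - alpha) * E_synth / E_post, with E_synth = sum(|speech_synth[i]|) and E_post = sum(|in[i]|), and the gain memory follows mem = alpha * mem + gain_scale_factor. With a steady input ratio this one-pole filter converges to mem = E_synth / E_post, so out[i] = in[i] * mem rescales the postfiltered signal back towards the energy of the unfiltered synthesis; the call site below uses alpha = 0.99, i.e. a very slow gain tracker.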
527
528
/**
529
 * Kalman smoothing function.
530
 *
531
 * This function looks back pitch +/- 3 samples into the signal history to find
532
 * the best-fitting curve (the one giving the optimal gain between the two
533
 * signals, i.e. the highest dot product between the two), and then
534
 * uses that signal history to smooth the output of the speech synthesis
535
 * filter.
536
 *
537
 * @param s WMA Voice decoding context
538
 * @param pitch pitch of the speech signal
539
 * @param in input speech signal
540
 * @param out output pointer for smoothed signal
541
 * @param size input/output buffer size
542
 *
543
 * @returns -1 if no smoothing took place, e.g. because no optimal
544
 *          fit could be found, or 0 on success.
545
 */
546
5070
static int kalman_smoothen(WMAVoiceContext *s, int pitch,
547
                           const float *in, float *out, int size)
548
{
549
    int n;
550
5070
    float optimal_gain = 0, dot;
551
5070
    const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
552
5070
                *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
553
5070
                *best_hist_ptr = NULL;
554
555
    /* find best fitting point in history */
556
    do {
557
35388
        dot = avpriv_scalarproduct_float_c(in, ptr, size);
558
35388
        if (dot > optimal_gain) {
559
12328
            optimal_gain  = dot;
560
12328
            best_hist_ptr = ptr;
561
        }
562
35388
    } while (--ptr >= end);
563
564
5070
    if (optimal_gain <= 0)
565
26
        return -1;
566
5044
    dot = avpriv_scalarproduct_float_c(best_hist_ptr, best_hist_ptr, size);
567
5044
    if (dot <= 0) // would be 1.0
568
        return -1;
569
570
5044
    if (optimal_gain <= dot) {
571
4872
        dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000
572
    } else
573
172
        dot = 0.625;
574
575
    /* actual smoothing */
576
408564
    for (n = 0; n < size; n++)
577
403520
        out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]);
578
579
5044
    return 0;
580
}
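In other words (a reading of the code above): with g = <in, hist> the best correlation found in the history search and e = <hist, hist> the energy of that history segment, the blend factor is dot = e / (e + 0.6 * g) when g <= e, and 0.625 otherwise, and the output is out[n] = hist[n] + dot * (in[n] - hist[n]). A weak correlation relative to the history energy leaves the signal nearly untouched (dot close to 1), while a strong correlation pulls the output up to 37.5% of the way towards the matching history segment.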
581
582
/**
583
 * Get the tilt factor of a formant filter from its transfer function
584
 * @see #tilt_factor() in amrnbdec.c, which does essentially the same,
585
 *      except that it additionally runs a speech synthesis filter in the
586
 *      middle, which is omitted here
587
 *
588
 * @param lpcs LPC coefficients
589
 * @param n_lpcs Size of LPC buffer
590
 * @returns the tilt factor
591
 */
592
7098
static float tilt_factor(const float *lpcs, int n_lpcs)
593
{
594
    float rh0, rh1;
595
596
7098
    rh0 = 1.0     + avpriv_scalarproduct_float_c(lpcs,  lpcs,    n_lpcs);
597
7098
    rh1 = lpcs[0] + avpriv_scalarproduct_float_c(lpcs, &lpcs[1], n_lpcs - 1);
598
599
7098
    return rh1 / rh0;
600
}
601
602
/**
603
 * Derive denoise filter coefficients (in real domain) from the LPCs.
604
 */
605
5614
static void calc_input_response(WMAVoiceContext *s, float *lpcs,
606
                                int fcb_type, float *coeffs, int remainder)
607
{
608
5614
    float last_coeff, min = 15.0, max = -15.0;
609
    float irange, angle_mul, gain_mul, range, sq;
610
    int n, idx;
611
612
    /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */
613
5614
    s->rdft.rdft_calc(&s->rdft, lpcs);
614
#define log_range(var, assign) do { \
615
        float tmp = log10f(assign);  var = tmp; \
616
        max       = FFMAX(max, tmp); min = FFMIN(min, tmp); \
617
    } while (0)
618

5614
    log_range(last_coeff,  lpcs[1]         * lpcs[1]);
619
359296
    for (n = 1; n < 64; n++)
620

353682
        log_range(lpcs[n], lpcs[n * 2]     * lpcs[n * 2] +
621
                           lpcs[n * 2 + 1] * lpcs[n * 2 + 1]);
622

5614
    log_range(lpcs[0],     lpcs[0]         * lpcs[0]);
623
#undef log_range
624
5614
    range    = max - min;
625
5614
    lpcs[64] = last_coeff;
626
627
    /* Now, use this spectrum to pick out those frequencies with higher
628
     * (relative) power/energy (which we then take to be "not noise"),
629
     * and set up a table (still in lpcs[]) of (relative) gains per frequency.
630
     * These frequencies will be maintained, while others ("noise") will be
631
     * decreased in the filter output. */
632
5614
    irange    = 64.0 / range; // so irange*(max-value) is in the range [0, 63]
633
5614
    gain_mul  = range * (fcb_type == FCB_TYPE_HARDCODED ? (5.0 / 13.0) :
634
                                                          (5.0 / 14.7));
635
5614
    angle_mul = gain_mul * (8.0 * M_LN10 / M_PI);
636
370524
    for (n = 0; n <= 64; n++) {
637
        float pwr;
638
639
364910
        idx = lrint((max - lpcs[n]) * irange - 1);
640
364910
        idx = FFMAX(0, idx);
641
364910
        pwr = wmavoice_denoise_power_table[s->denoise_strength][idx];
642
364910
        lpcs[n] = angle_mul * pwr;
643
644
        /* 70.57 =~ 1/log10(1.0331663) */
645
364910
        idx = av_clipf((pwr * gain_mul - 0.0295) * 70.570526123, 0, INT_MAX / 2);
646
647
364910
        if (idx > 127) { // fall back if index falls outside table range
648
8557
            coeffs[n] = wmavoice_energy_table[127] *
649
8557
                        powf(1.0331663, idx - 127);
650
        } else
651
356353
            coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];
652
    }
653
654
    /* calculate the Hilbert transform of the gains, which we do (since this
655
     * is a sine input) by doing a phase shift (in theory, H(sin())=cos()).
656
     * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the
657
     * "moment" of the LPCs in this filter. */
658
5614
    s->dct.dct_calc(&s->dct, lpcs);
659
5614
    s->dst.dct_calc(&s->dst, lpcs);
660
661
    /* Split out the coefficient indexes into phase/magnitude pairs */
662
5614
    idx = 255 + av_clip(lpcs[64],               -255, 255);
663
5614
    coeffs[0]  = coeffs[0]  * s->cos[idx];
664
5614
    idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255);
665
5614
    last_coeff = coeffs[64] * s->cos[idx];
666
5614
    for (n = 63;; n--) {
667
179648
        idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255);
668
179648
        coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
669
179648
        coeffs[n * 2]     = coeffs[n] * s->cos[idx];
670
671
179648
        if (!--n) break;
672
673
174034
        idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255);
674
174034
        coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
675
174034
        coeffs[n * 2]     = coeffs[n] * s->cos[idx];
676
    }
677
5614
    coeffs[1] = last_coeff;
678
679
    /* move into real domain */
680
5614
    s->irdft.rdft_calc(&s->irdft, coeffs);
681
682
    /* tilt correction and normalize scale */
683
5614
    memset(&coeffs[remainder], 0, sizeof(coeffs[0]) * (128 - remainder));
684
5614
    if (s->denoise_tilt_corr) {
685
1484
        float tilt_mem = 0;
686
687
1484
        coeffs[remainder - 1] = 0;
688
1484
        ff_tilt_compensation(&tilt_mem,
689
1484
                             -1.8 * tilt_factor(coeffs, remainder - 1),
690
                             coeffs, remainder);
691
    }
692
5614
    sq = (1.0 / 64.0) * sqrtf(1 / avpriv_scalarproduct_float_c(coeffs, coeffs,
693
                                                               remainder));
694
269472
    for (n = 0; n < remainder; n++)
695
263858
        coeffs[n] *= sq;
696
5614
}
697
698
/**
699
 * This function applies a Wiener filter on the (noisy) speech signal as
700
 * a means to denoise it.
701
 *
702
 * - take RDFT of LPCs to get the power spectrum of the noise + speech;
703
 * - using this power spectrum, calculate (for each frequency) the Wiener
704
 *    filter gain, which depends on the frequency power and desired level
705
 *    of noise subtraction (when set too high, this leads to artifacts)
706
 *    We can do this symmetrically over the X-axis (so 0-4kHz is the inverse
707
 *    of 4-8kHz);
708
 * - by doing a phase shift, calculate the Hilbert transform of this array
709
 *    of per-frequency filter-gains to get the filtering coefficients;
710
 * - smoothen/normalize/de-tilt these filter coefficients as desired;
711
 * - take RDFT of noisy sound, apply the coefficients and take its IRDFT
712
 *    to get the denoised speech signal;
713
 * - the leftovers (i.e. output of the IRDFT on denoised speech data beyond
714
 *    the frame boundary) are saved and applied to subsequent frames by an
715
 *    overlap-add method (otherwise you get clicking artifacts).
716
 *
717
 * @param s WMA Voice decoding context
718
 * @param fcb_type Frame (codebook) type
719
 * @param synth_pf input: the noisy speech signal, output: denoised speech
720
 *                 data; should be 16-byte aligned (for ASM purposes)
721
 * @param size size of the speech data
722
 * @param lpcs LPCs used to synthesize this frame's speech data
723
 */
724
6612
static void wiener_denoise(WMAVoiceContext *s, int fcb_type,
725
                           float *synth_pf, int size,
726
                           const float *lpcs)
727
{
728
    int remainder, lim, n;
729
730
6612
    if (fcb_type != FCB_TYPE_SILENCE) {
731
5614
        float *tilted_lpcs = s->tilted_lpcs_pf,
732
5614
              *coeffs = s->denoise_coeffs_pf, tilt_mem = 0;
733
734
5614
        tilted_lpcs[0]           = 1.0;
735
5614
        memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps);
736
5614
        memset(&tilted_lpcs[s->lsps + 1], 0,
737
5614
               sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1));
738
5614
        ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps),
739
5614
                             tilted_lpcs, s->lsps + 2);
740
741
        /* The IRDFT output (127 samples for 7-bit filter) beyond the frame
742
         * size is applied to the next frame. All input beyond this is zero,
743
         * and thus all output beyond this will go towards zero, hence we can
744
         * limit to min(size-1, 127-size) as a performance consideration. */
745
5614
        remainder = FFMIN(127 - size, size - 1);
746
5614
        calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder);
747
748
        /* apply coefficients (in frequency spectrum domain), i.e. complex
749
         * number multiplication */
750
5614
        memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size));
751
5614
        s->rdft.rdft_calc(&s->rdft, synth_pf);
752
5614
        s->rdft.rdft_calc(&s->rdft, coeffs);
753
5614
        synth_pf[0] *= coeffs[0];
754
5614
        synth_pf[1] *= coeffs[1];
755
359296
        for (n = 1; n < 64; n++) {
756
353682
            float v1 = synth_pf[n * 2], v2 = synth_pf[n * 2 + 1];
757
353682
            synth_pf[n * 2]     = v1 * coeffs[n * 2] - v2 * coeffs[n * 2 + 1];
758
353682
            synth_pf[n * 2 + 1] = v2 * coeffs[n * 2] + v1 * coeffs[n * 2 + 1];
759
        }
760
5614
        s->irdft.rdft_calc(&s->irdft, synth_pf);
761
    }
762
763
    /* merge filter output with the history of previous runs */
764
6612
    if (s->denoise_filter_cache_size) {
765
5612
        lim = FFMIN(s->denoise_filter_cache_size, size);
766
269376
        for (n = 0; n < lim; n++)
767
263764
            synth_pf[n] += s->denoise_filter_cache[n];
768
5612
        s->denoise_filter_cache_size -= lim;
769
5612
        memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size],
770
5612
                sizeof(s->denoise_filter_cache[0]) * s->denoise_filter_cache_size);
771
    }
772
773
    /* move remainder of filter output into a cache for future runs */
774
6612
    if (fcb_type != FCB_TYPE_SILENCE) {
775
5614
        lim = FFMIN(remainder, s->denoise_filter_cache_size);
776
5614
        for (n = 0; n < lim; n++)
777
            s->denoise_filter_cache[n] += synth_pf[size + n];
778
5614
        if (lim < remainder) {
779
5614
            memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim],
780
5614
                   sizeof(s->denoise_filter_cache[0]) * (remainder - lim));
781
5614
            s->denoise_filter_cache_size = remainder;
782
        }
783
    }
784
6612
}
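The bin loop above is a plain complex multiplication per frequency bin: with the noisy spectrum value a + bi and the filter gain c + di, the product is (ac - bd) + (ad + bc)i, which is exactly the two assignments inside the loop. Bins 0 and 1 are handled with separate scalar multiplies because, in the half-complex packing produced by the RDFT, they hold the purely real DC and Nyquist terms (an observation about the code, not a comment from the original source).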
785
786
/**
787
 * Averaging projection filter, the postfilter used in WMAVoice.
788
 *
789
 * This uses the following steps:
790
 * - A zero-synthesis filter (generate excitation from synth signal)
791
 * - Kalman smoothing on excitation, based on pitch
792
 * - Re-synthesis of the smoothed output
793
 * - Iterative Wiener denoise filter
794
 * - Adaptive gain filter
795
 * - DC filter
796
 *
797
 * @param s WMAVoice decoding context
798
 * @param synth Speech synthesis output (before postfilter)
799
 * @param samples Output buffer for filtered samples
800
 * @param size Buffer size of synth & samples
801
 * @param lpcs Generated LPCs used for speech synthesis
802
 * @param zero_exc_pf destination for zero synthesis filter (16-byte aligned)
803
 * @param fcb_type Frame type (silence, hardcoded, AW-pulses or FCB-pulses)
804
 * @param pitch Pitch of the input signal
805
 */
806
6612
static void postfilter(WMAVoiceContext *s, const float *synth,
807
                       float *samples,    int size,
808
                       const float *lpcs, float *zero_exc_pf,
809
                       int fcb_type,      int pitch)
810
{
811
    float synth_filter_in_buf[MAX_FRAMESIZE / 2],
812
6612
          *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
813
6612
          *synth_filter_in = zero_exc_pf;
814
815
6612
    av_assert0(size <= MAX_FRAMESIZE / 2);
816
817
    /* generate excitation from input signal */
818
6612
    ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);
819
820

11682
    if (fcb_type >= FCB_TYPE_AW_PULSES &&
821
5070
        !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size))
822
5044
        synth_filter_in = synth_filter_in_buf;
823
824
    /* re-synthesize speech after smoothening, and keep history */
825
6612
    ff_celp_lp_synthesis_filterf(synth_pf, lpcs,
826
                                 synth_filter_in, size, s->lsps);
827
6612
    memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps],
828
6612
           sizeof(synth_pf[0]) * s->lsps);
829
830
6612
    wiener_denoise(s, fcb_type, synth_pf, size, lpcs);
831
832
6612
    adaptive_gain_control(samples, synth_pf, synth, size, 0.99,
833
                          &s->postfilter_agc);
834
835
6612
    if (s->dc_level > 8) {
836
        /* remove ultra-low frequency DC noise / highpass filter;
837
         * coefficients are identical to those used in SIPR decoding,
838
         * and very closely resemble those used in AMR-NB decoding. */
839
        ff_acelp_apply_order_2_transfer_function(samples, samples,
840
            (const float[2]) { -1.99997,      1.0 },
841
            (const float[2]) { -1.9330735188, 0.93589198496 },
842
            0.93980580475, s->dcf_mem, size);
843
    }
844
6612
}
845
/**
846
 * @}
847
 */
848
849
/**
850
 * Dequantize LSPs
851
 * @param lsps output pointer to the array that will hold the LSPs
852
 * @param num number of LSPs to be dequantized
853
 * @param values quantized values, contains n_stages values
854
 * @param sizes range (i.e. max value) of each quantized value
855
 * @param n_stages number of dequantization runs
856
 * @param table dequantization table to be used
857
 * @param mul_q LSF multiplier
858
 * @param base_q base (lowest) LSF values
859
 */
860
4404
static void dequant_lsps(double *lsps, int num,
861
                         const uint16_t *values,
862
                         const uint16_t *sizes,
863
                         int n_stages, const uint8_t *table,
864
                         const double *mul_q,
865
                         const double *base_q)
866
{
867
    int n, m;
868
869
4404
    memset(lsps, 0, num * sizeof(*lsps));
870
12668
    for (n = 0; n < n_stages; n++) {
871
8264
        const uint8_t *t_off = &table[values[n] * num];
872
8264
        double base = base_q[n], mul = mul_q[n];
873
874
95364
        for (m = 0; m < num; m++)
875
87100
            lsps[m] += base + mul * t_off[m];
876
877
8264
        table += sizes[n] * num;
878
    }
879
4404
}
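Written out, the double loop above computes, for each LSP index m, lsps[m] = sum over n of (base_q[n] + mul_q[n] * table_n[values[n] * num + m]), where table_n is the n-th stage's sub-table (the table pointer having been advanced past the earlier stages' sizes[k] * num entries). Each stage therefore adds one scaled, offset codebook vector selected by values[n].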
880
881
/**
882
 * @name LSP dequantization routines
883
 * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
884
 * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
885
 * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
886
 * @{
887
 */
888
/**
889
 * Parse 10 independently-coded LSPs.
890
 */
891
552
static void dequant_lsp10i(GetBitContext *gb, double *lsps)
892
{
893
    static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
894
    static const double mul_lsf[4] = {
895
        5.2187144800e-3,    1.4626986422e-3,
896
        9.6179549166e-4,    1.1325736225e-3
897
    };
898
    static const double base_lsf[4] = {
899
        M_PI * -2.15522e-1, M_PI * -6.1646e-2,
900
        M_PI * -3.3486e-2,  M_PI * -5.7408e-2
901
    };
902
    uint16_t v[4];
903
904
552
    v[0] = get_bits(gb, 8);
905
552
    v[1] = get_bits(gb, 6);
906
552
    v[2] = get_bits(gb, 5);
907
552
    v[3] = get_bits(gb, 5);
908
909
552
    dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
910
                 mul_lsf, base_lsf);
911
552
}
912
913
/**
914
 * Parse 10 independently-coded LSPs, and then derive the tables to
915
 * generate LSPs for the other frames from them (residual coding).
916
 */
917
552
static void dequant_lsp10r(GetBitContext *gb,
918
                           double *i_lsps, const double *old,
919
                           double *a1, double *a2, int q_mode)
920
{
921
    static const uint16_t vec_sizes[3] = { 128, 64, 64 };
922
    static const double mul_lsf[3] = {
923
        2.5807601174e-3,    1.2354460219e-3,   1.1763821673e-3
924
    };
925
    static const double base_lsf[3] = {
926
        M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
927
    };
928
552
    const float (*ipol_tab)[2][10] = q_mode ?
929
552
        wmavoice_lsp10_intercoeff_b : wmavoice_lsp10_intercoeff_a;
930
    uint16_t interpol, v[3];
931
    int n;
932
933
552
    dequant_lsp10i(gb, i_lsps);
934
935
552
    interpol = get_bits(gb, 5);
936
552
    v[0]     = get_bits(gb, 7);
937
552
    v[1]     = get_bits(gb, 6);
938
552
    v[2]     = get_bits(gb, 6);
939
940
6072
    for (n = 0; n < 10; n++) {
941
5520
        double delta = old[n] - i_lsps[n];
942
5520
        a1[n]        = ipol_tab[interpol][0][n] * delta + i_lsps[n];
943
5520
        a1[10 + n]   = ipol_tab[interpol][1][n] * delta + i_lsps[n];
944
    }
945
946
552
    dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
947
                 mul_lsf, base_lsf);
948
552
}
949
950
/**
951
 * Parse 16 independently-coded LSPs.
952
 */
953
550
static void dequant_lsp16i(GetBitContext *gb, double *lsps)
954
{
955
    static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
956
    static const double mul_lsf[5] = {
957
        3.3439586280e-3,    6.9908173703e-4,
958
        3.3216608306e-3,    1.0334960326e-3,
959
        3.1899104283e-3
960
    };
961
    static const double base_lsf[5] = {
962
        M_PI * -1.27576e-1, M_PI * -2.4292e-2,
963
        M_PI * -1.28094e-1, M_PI * -3.2128e-2,
964
        M_PI * -1.29816e-1
965
    };
966
    uint16_t v[5];
967
968
550
    v[0] = get_bits(gb, 8);
969
550
    v[1] = get_bits(gb, 6);
970
550
    v[2] = get_bits(gb, 7);
971
550
    v[3] = get_bits(gb, 6);
972
550
    v[4] = get_bits(gb, 7);
973
974
550
    dequant_lsps( lsps,     5,  v,     vec_sizes,    2,
975
                 wmavoice_dq_lsp16i1,  mul_lsf,     base_lsf);
976
550
    dequant_lsps(&lsps[5],  5, &v[2], &vec_sizes[2], 2,
977
                 wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
978
550
    dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
979
                 wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
980
550
}
981
982
/**
983
 * Parse 16 independently-coded LSPs, and then derive the tables to
984
 * generate LSPs for the other frames from them (residual coding).
985
 */
986
550
static void dequant_lsp16r(GetBitContext *gb,
987
                           double *i_lsps, const double *old,
988
                           double *a1, double *a2, int q_mode)
989
{
990
    static const uint16_t vec_sizes[3] = { 128, 128, 128 };
991
    static const double mul_lsf[3] = {
992
        1.2232979501e-3,   1.4062241527e-3,   1.6114744851e-3
993
    };
994
    static const double base_lsf[3] = {
995
        M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
996
    };
997
550
    const float (*ipol_tab)[2][16] = q_mode ?
998
550
        wmavoice_lsp16_intercoeff_b : wmavoice_lsp16_intercoeff_a;
999
    uint16_t interpol, v[3];
1000
    int n;
1001
1002
550
    dequant_lsp16i(gb, i_lsps);
1003
1004
550
    interpol = get_bits(gb, 5);
1005
550
    v[0]     = get_bits(gb, 7);
1006
550
    v[1]     = get_bits(gb, 7);
1007
550
    v[2]     = get_bits(gb, 7);
1008
1009
9350
    for (n = 0; n < 16; n++) {
1010
8800
        double delta = old[n] - i_lsps[n];
1011
8800
        a1[n]        = ipol_tab[interpol][0][n] * delta + i_lsps[n];
1012
8800
        a1[16 + n]   = ipol_tab[interpol][1][n] * delta + i_lsps[n];
1013
    }
1014
1015
550
    dequant_lsps( a2,     10,  v,     vec_sizes,    1,
1016
                 wmavoice_dq_lsp16r1,  mul_lsf,     base_lsf);
1017
550
    dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
1018
                 wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
1019
550
    dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
1020
                 wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
1021
550
}
1022
1023
/**
1024
 * @}
1025
 * @name Pitch-adaptive window coding functions
1026
 * The next few functions are for pitch-adaptive window coding.
1027
 * @{
1028
 */
1029
/**
1030
 * Parse the offset of the first pitch-adaptive window pulses, and
1031
 * the distribution of pulses between the two blocks in this frame.
1032
 * @param s WMA Voice decoding context private data
1033
 * @param gb bit I/O context
1034
 * @param pitch pitch for each block in this frame
1035
 */
1036
341
static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb,
1037
                            const int *pitch)
1038
{
1039
    static const int16_t start_offset[94] = {
1040
        -11,  -9,  -7,  -5,  -3,  -1,   1,   3,   5,   7,   9,  11,
1041
         13,  15,  18,  17,  19,  20,  21,  22,  23,  24,  25,  26,
1042
         27,  28,  29,  30,  31,  32,  33,  35,  37,  39,  41,  43,
1043
         45,  47,  49,  51,  53,  55,  57,  59,  61,  63,  65,  67,
1044
         69,  71,  73,  75,  77,  79,  81,  83,  85,  87,  89,  91,
1045
         93,  95,  97,  99, 101, 103, 105, 107, 109, 111, 113, 115,
1046
        117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
1047
        141, 143, 145, 147, 149, 151, 153, 155, 157, 159
1048
    };
1049
    int bits, offset;
1050
1051
    /* position of pulse */
1052
341
    s->aw_idx_is_ext = 0;
1053
341
    if ((bits = get_bits(gb, 6)) >= 54) {
1054
10
        s->aw_idx_is_ext = 1;
1055
10
        bits += (bits - 54) * 3 + get_bits(gb, 2);
1056
    }
1057
1058
    /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
1059
     * the distribution of the pulses in each block contained in this frame. */
1060
341
    s->aw_pulse_range        = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
1061
391
    for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
1062
341
    s->aw_n_pulses[0]        = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
1063
341
    s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
1064
341
    offset                  += s->aw_n_pulses[0] * pitch[0];
1065
341
    s->aw_n_pulses[1]        = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
1066
341
    s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;
1067
1068
    /* if continuing from a position before the block, reset position to
1069
     * start of block (when corrected for the range over which it can be
1070
     * spread in aw_pulse_set1()). */
1071
341
    if (start_offset[bits] < MAX_FRAMESIZE / 2) {
1072
387
        while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
1073
56
            s->aw_first_pulse_off[1] -= pitch[1];
1074
331
        if (start_offset[bits] < 0)
1075
100
            while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
1076
50
                s->aw_first_pulse_off[0] -= pitch[0];
1077
    }
1078
341
}
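As a worked reading of the counting above: for a pulse train starting at offset and repeating every pitch[0] samples, (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0] is ceil((MAX_FRAMESIZE / 2 - offset) / pitch[0]), i.e. the number of pulse positions offset, offset + pitch[0], ... that land inside the first 80-sample block (the count can come out negative when offset already lies beyond the block, as noted in the context struct); the matching expression with MAX_FRAMESIZE and pitch[1] then counts the pulses that fall in the second block.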
1079
1080
/**
1081
 * Apply second set of pitch-adaptive window pulses.
1082
 * @param s WMA Voice decoding context private data
1083
 * @param gb bit I/O context
1084
 * @param block_idx block index in frame [0, 1]
1085
 * @param fcb structure containing fixed codebook vector info
1086
 * @return -1 on error, 0 otherwise
1087
 */
1088
682
static int aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb,
1089
                         int block_idx, AMRFixed *fcb)
1090
{
1091
    uint16_t use_mask_mem[9]; // only 5 are used, rest is padding
1092
682
    uint16_t *use_mask = use_mask_mem + 2;
1093
    /* in this function, idx is the index in the 80-bit (+ padding) use_mask
1094
     * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
1095
     * of idx are the position of the bit within a particular item in the
1096
     * array (0 being the most significant bit, and 15 being the least
1097
     * significant bit), and the remainder (>> 4) is the index in the
1098
     * use_mask[]-array. This is faster and uses less memory than using a
1099
     * 80-byte/80-int array. */
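    /* For example: idx = 37 selects use_mask[37 >> 4] == use_mask[2], and
     * bit (37 & 15) == 5 counted from the MSB, i.e. the mask 0x8000 >> 5
     * used further below. */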
1100
682
    int pulse_off = s->aw_first_pulse_off[block_idx],
1101
682
        pulse_start, n, idx, range, aidx, start_off = 0;
1102
1103
    /* set offset of first pulse to within this block */
1104
682
    if (s->aw_n_pulses[block_idx] > 0)
1105
657
        while (pulse_off + s->aw_pulse_range < 1)
1106
            pulse_off += fcb->pitch_lag;
1107
1108
    /* find range per pulse */
1109
682
    if (s->aw_n_pulses[0] > 0) {
1110
646
        if (block_idx == 0) {
1111
323
            range = 32;
1112
        } else /* block_idx = 1 */ {
1113
323
            range = 8;
1114
323
            if (s->aw_n_pulses[block_idx] > 0)
1115
316
                pulse_off = s->aw_next_pulse_off_cache;
1116
        }
1117
    } else
1118
36
        range = 16;
1119
682
    pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;
1120
1121
    /* aw_pulse_set1() already applies pulses around pulse_off (to be exact,
1122
     * in the range of [pulse_off, pulse_off + s->aw_pulse_range], and thus
1123
     * we exclude that range from being pulsed again in this function. */
1124
682
    memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0]));
1125
682
    memset( use_mask,   -1, 5 * sizeof(use_mask[0]));
1126
682
    memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
1127
682
    if (s->aw_n_pulses[block_idx] > 0)
1128
1568
        for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
1129
911
            int excl_range         = s->aw_pulse_range; // always 16 or 24
1130
911
            uint16_t *use_mask_ptr = &use_mask[idx >> 4];
1131
911
            int first_sh           = 16 - (idx & 15);
1132
911
            *use_mask_ptr++       &= 0xFFFFu << first_sh;
1133
911
            excl_range            -= first_sh;
1134
911
            if (excl_range >= 16) {
1135
468
                *use_mask_ptr++    = 0;
1136
468
                *use_mask_ptr     &= 0xFFFF >> (excl_range - 16);
1137
            } else
1138
443
                *use_mask_ptr     &= 0xFFFF >> excl_range;
1139
        }
1140
1141
    /* find the 'aidx'th offset that is not excluded */
1142
682
    aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
1143
16825
    for (n = 0; n <= aidx; pulse_start++) {
1144
18458
        for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
1145
16143
        if (idx >= MAX_FRAMESIZE / 2) { // find from zero
1146
538
            if (use_mask[0])      idx = 0x0F;
1147
123
            else if (use_mask[1]) idx = 0x1F;
1148
18
            else if (use_mask[2]) idx = 0x2F;
1149
            else if (use_mask[3]) idx = 0x3F;
1150
            else if (use_mask[4]) idx = 0x4F;
1151
            else return -1;
1152
538
            idx -= av_log2_16bit(use_mask[idx >> 4]);
1153
        }
1154
16143
        if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
1155
7465
            use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
1156
7465
            n++;
1157
7465
            start_off = idx;
1158
        }
1159
    }
1160
1161
682
    fcb->x[fcb->n] = start_off;
1162
682
    fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
1163
682
    fcb->n++;
1164
1165
    /* set offset for next block, relative to start of that block */
1166
682
    n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
1167
682
    s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
1168
682
    return 0;
1169
}
1170
1171
/**
1172
 * Apply first set of pitch-adaptive window pulses.
1173
 * @param s WMA Voice decoding context private data
1174
 * @param gb bit I/O context
1175
 * @param block_idx block index in frame [0, 1]
1176
 * @param fcb storage location for fixed codebook pulse info
1177
 */
1178
682
static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb,
1179
                          int block_idx, AMRFixed *fcb)
1180
{
1181

682
    int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
1182
    float v;
1183
1184
682
    if (s->aw_n_pulses[block_idx] > 0) {
1185
        int n, v_mask, i_mask, sh, n_pulses;
1186
1187
657
        if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
1188
652
            n_pulses = 3;
1189
652
            v_mask   = 8;
1190
652
            i_mask   = 7;
1191
652
            sh       = 4;
1192
        } else { // 4 pulses, 1:sign + 2:index each
1193
5
            n_pulses = 4;
1194
5
            v_mask   = 4;
1195
5
            i_mask   = 3;
1196
5
            sh       = 3;
1197
        }
1198
1199
2633
        for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
1200
1976
            fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
1201
1976
            fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
1202
1976
                                 s->aw_first_pulse_off[block_idx];
1203
2217
            while (fcb->x[fcb->n] < 0)
1204
241
                fcb->x[fcb->n] += fcb->pitch_lag;
1205
1976
            if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
1206
1959
                fcb->n++;
1207
        }
1208
    } else {
1209
25
        int num2 = (val & 0x1FF) >> 1, delta, idx;
1210
1211
25
        if (num2 < 1 * 79)      { delta = 1; idx = num2 + 1; }
1212
21
        else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
1213
15
        else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
1214
5
        else                    { delta = 7; idx = num2 + 1 - 3 * 75; }
1215
25
        v = (val & 0x200) ? -1.0 : 1.0;
1216
1217
25
        fcb->no_repeat_mask |= 3 << fcb->n;
1218
25
        fcb->x[fcb->n]       = idx - delta;
1219
25
        fcb->y[fcb->n]       = v;
1220
25
        fcb->x[fcb->n + 1]   = idx;
1221
25
        fcb->y[fcb->n + 1]   = (val & 1) ? -v : v;
1222
25
        fcb->n              += 2;
1223
    }
1224
682
}
1225
1226
/**
1227
 * @}
1228
 *
1229
 * Generate a random number from frame_cntr and block_idx, which will live
1230
 * in the range [0, 1000 - block_size] (so it can be used as an index in a
1231
 * table of size 1000 of which you want to read block_size entries).
1232
 *
1233
 * @param frame_cntr current frame number
1234
 * @param block_num current block index
1235
 * @param block_size amount of entries we want to read from a table
1236
 *                   that has 1000 entries
1237
 * @return a (non-)random number in the [0, 1000 - block_size] range.
1238
 */
1239
499
static int pRNG(int frame_cntr, int block_num, int block_size)
1240
{
1241
    /* array to simplify the calculation of z:
1242
     * y = (x % 9) * 5 + 6;
1243
     * z = (49995 * x) / y;
1244
     * Since y only has 9 values, we can remove the division by using a
1245
     * LUT and using FASTDIV-style divisions. For each of the 9 values
1246
     * of y, we can rewrite z as:
1247
     * z = x * (49995 / y) + x * ((49995 % y) / y)
1248
     * In this table, each row represents one possible value of y, the
1249
     * first number is 49995 / y, and the second is the FASTDIV variant
1250
     * of 49995 % y / y. */
1251
    static const unsigned int div_tbl[9][2] = {
1252
        { 8332,  3 * 715827883U }, // y =  6
1253
        { 4545,  0 * 390451573U }, // y = 11
1254
        { 3124, 11 * 268435456U }, // y = 16
1255
        { 2380, 15 * 204522253U }, // y = 21
1256
        { 1922, 23 * 165191050U }, // y = 26
1257
        { 1612, 23 * 138547333U }, // y = 31
1258
        { 1388, 27 * 119304648U }, // y = 36
1259
        { 1219, 16 * 104755300U }, // y = 41
1260
        { 1086, 39 *  93368855U }  // y = 46
1261
    };
1262
499
    unsigned int z, y, x = MUL16(block_num, 1877) + frame_cntr;
1263
499
    if (x >= 0xFFFF) x -= 0xFFFF;   // max value of x is 8*1877+0xFFFE=0x13AA6,
1264
                                    // so this is effectively a modulo (%)
1265
499
    y = x - 9 * MULH(477218589, x); // x % 9
1266
499
    z = (uint16_t) (x * div_tbl[y][0] + UMULH(x, div_tbl[y][1]));
1267
                                    // z = x * 49995 / (y * 5 + 6)
1268
499
    return z % (1000 - block_size);
1269
}
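/* Editor's sketch (not part of wmavoice.c): a standalone check of the LUT /
 * multiply-high trick documented in pRNG() above.  It replays the table and
 * compares x * div_tbl[i][0] + UMULH(x, div_tbl[i][1]) against the plain
 * division 49995 * x / y for every x the function can see (x < 0xFFFF). */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    static const uint32_t div_tbl[9][2] = {
        { 8332,  3 * 715827883U }, { 4545,  0 * 390451573U },
        { 3124, 11 * 268435456U }, { 2380, 15 * 204522253U },
        { 1922, 23 * 165191050U }, { 1612, 23 * 138547333U },
        { 1388, 27 * 119304648U }, { 1219, 16 * 104755300U },
        { 1086, 39 *  93368855U }
    };
    for (uint32_t x = 0; x < 0xFFFF; x++) {
        uint32_t i   = x % 9, y = i * 5 + 6;
        uint32_t lut = x * div_tbl[i][0] +
                       (uint32_t)(((uint64_t)x * div_tbl[i][1]) >> 32);
        uint32_t ref = (uint32_t)((49995ULL * x) / y);
        if (lut != ref) {
            printf("mismatch at x=%u: %u != %u\n", x, lut, ref);
            return 1;
        }
    }
    printf("LUT matches plain division for all x < 0xFFFF\n");
    return 0;
}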
1270
1271
/**
1272
 * Parse hardcoded signal for a single block.
1273
 * @note see #synth_block().
1274
 */
1275
1043
static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb,
1276
                                 int block_idx, int size,
1277
                                 const struct frame_type_desc *frame_desc,
1278
                                 float *excitation)
1279
{
1280
    float gain;
1281
    int n, r_idx;
1282
1283
1043
    av_assert0(size <= MAX_FRAMESIZE);
1284
1285
    /* Set the offset from which we start reading wmavoice_std_codebook */
1286
1043
    if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1287
499
        r_idx = pRNG(s->frame_cntr, block_idx, size);
1288
499
        gain  = s->silence_gain;
1289
    } else /* FCB_TYPE_HARDCODED */ {
1290
544
        r_idx = get_bits(gb, 8);
1291
544
        gain  = wmavoice_gain_universal[get_bits(gb, 6)];
1292
    }
1293
1294
    /* Clear gain prediction parameters */
1295
1043
    memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
1296
1297
    /* Apply gain to hardcoded codebook and use that as excitation signal */
1298
124403
    for (n = 0; n < size; n++)
1299
123360
        excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
1300
1043
}
1301
1302
/**
1303
 * Parse FCB/ACB signal for a single block.
1304
 * @note see #synth_block().
1305
 */
1306
9740
static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb,
1307
                                int block_idx, int size,
1308
                                int block_pitch_sh2,
1309
                                const struct frame_type_desc *frame_desc,
1310
                                float *excitation)
1311
{
1312
    static const float gain_coeff[6] = {
1313
        0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
1314
    };
1315
    float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
1316
    int n, idx, gain_weight;
1317
    AMRFixed fcb;
1318
1319
9740
    av_assert0(size <= MAX_FRAMESIZE / 2);
1320
9740
    memset(pulses, 0, sizeof(*pulses) * size);
1321
1322
9740
    fcb.pitch_lag      = block_pitch_sh2 >> 2;
1323
9740
    fcb.pitch_fac      = 1.0;
1324
9740
    fcb.no_repeat_mask = 0;
1325
9740
    fcb.n              = 0;
1326
1327
    /* For the other frame types, this is where we apply the innovation
1328
     * (fixed) codebook pulses of the speech signal. */
1329
9740
    if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1330
682
        aw_pulse_set1(s, gb, block_idx, &fcb);
1331
682
        if (aw_pulse_set2(s, gb, block_idx, &fcb)) {
1332
            /* Conceal the block with silence and return.
1333
             * Skip the correct amount of bits to read the next
1334
             * block from the correct offset. */
1335
            int r_idx = pRNG(s->frame_cntr, block_idx, size);
1336
1337
            for (n = 0; n < size; n++)
1338
                excitation[n] =
1339
                    wmavoice_std_codebook[r_idx + n] * s->silence_gain;
1340
            skip_bits(gb, 7 + 1);
1341
            return;
1342
        }
1343
    } else /* FCB_TYPE_EXC_PULSES */ {
1344
9058
        int offset_nbits = 5 - frame_desc->log_n_blocks;
1345
1346
9058
        fcb.no_repeat_mask = -1;
1347
        /* similar to ff_decode_10_pulses_35bits(), but with single pulses
1348
         * (instead of double) for a subset of pulses */
1349
54348
        for (n = 0; n < 5; n++) {
1350
            float sign;
1351
            int pos1, pos2;
1352
1353
45290
            sign           = get_bits1(gb) ? 1.0 : -1.0;
1354
45290
            pos1           = get_bits(gb, offset_nbits);
1355
45290
            fcb.x[fcb.n]   = n + 5 * pos1;
1356
45290
            fcb.y[fcb.n++] = sign;
1357
45290
            if (n < frame_desc->dbl_pulses) {
1358
36270
                pos2           = get_bits(gb, offset_nbits);
1359
36270
                fcb.x[fcb.n]   = n + 5 * pos2;
1360
36270
                fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
1361
            }
1362
        }
1363
    }
1364
9740
    ff_set_fixed_vector(pulses, &fcb, 1.0, size);
1365
1366
    /* Calculate gain for adaptive & fixed codebook signal.
1367
     * see ff_amr_set_fixed_gain(). */
1368
9740
    idx = get_bits(gb, 7);
1369
9740
    fcb_gain = expf(avpriv_scalarproduct_float_c(s->gain_pred_err,
1370
9740
                                                 gain_coeff, 6) -
1371
9740
                    5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
1372
9740
    acb_gain = wmavoice_gain_codebook_acb[idx];
1373
9740
    pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
1374
                        -2.9957322736 /* log(0.05) */,
1375
                         1.6094379124 /* log(5.0)  */);
1376
1377
9740
    gain_weight = 8 >> frame_desc->log_n_blocks;
1378
9740
    memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
1379
9740
            sizeof(*s->gain_pred_err) * (6 - gain_weight));
1380
30020
    for (n = 0; n < gain_weight; n++)
1381
20280
        s->gain_pred_err[n] = pred_err;
1382
1383
    /* Calculation of adaptive codebook */
1384
9740
    if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1385
        int len;
1386
19152
        for (n = 0; n < size; n += len) {
1387
            int next_idx_sh16;
1388
17876
            int abs_idx    = block_idx * size + n;
1389
17876
            int pitch_sh16 = (s->last_pitch_val << 16) +
1390
17876
                             s->pitch_diff_sh16 * abs_idx;
1391
17876
            int pitch      = (pitch_sh16 + 0x6FFF) >> 16;
1392
17876
            int idx_sh16   = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
1393
17876
            idx            = idx_sh16 >> 16;
1394
17876
            if (s->pitch_diff_sh16) {
1395
17442
                if (s->pitch_diff_sh16 > 0) {
1396
10526
                    next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
1397
                } else
1398
6916
                    next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
1399
17442
                len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
1400
                              1, size - n);
1401
            } else
1402
434
                len = size;
1403
1404
17876
            ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
1405
                                  wmavoice_ipol1_coeffs, 17,
1406
                                  idx, 9, len);
1407
        }
1408
    } else /* ACB_TYPE_HAMMING */ {
1409
8464
        int block_pitch = block_pitch_sh2 >> 2;
1410
8464
        idx             = block_pitch_sh2 & 3;
1411
8464
        if (idx) {
1412
3652
            ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
1413
                                  wmavoice_ipol2_coeffs, 4,
1414
                                  idx, 8, size);
1415
        } else
1416
4812
            av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch,
1417
                              sizeof(float) * size);
1418
    }
1419
1420
    /* Interpolate ACB/FCB and use as excitation signal */
1421
9740
    ff_weighted_vector_sumf(excitation, excitation, pulses,
1422
                            acb_gain, fcb_gain, size);
1423
}
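/* Editor's sketch (not part of wmavoice.c): the moving-average log-gain
 * prediction used above for the fixed-codebook gain.  The gain is exp() of
 * a dot product of the last six prediction errors with gain_coeff, minus a
 * bias, plus the quantized codebook value; the new (clipped) error is then
 * pushed gain_weight times into the history.  The codebook value used in
 * main() is a placeholder, not an entry of wmavoice_gain_codebook_fcb. */
#include <math.h>
#include <stdio.h>
#include <string.h>

static float predict_fcb_gain(float pred_err[6], float log_gain_cb,
                              int gain_weight)
{
    static const float gain_coeff[6] = {
        0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
    };
    float acc = 0.0f, err;
    int n;

    for (n = 0; n < 6; n++)                   /* MA predictor, log domain */
        acc += pred_err[n] * gain_coeff[n];

    /* clip the new prediction error to [log(0.05), log(5.0)] and push it
     * gain_weight times into the history, as the decoder does above */
    err = fminf(fmaxf(log_gain_cb, -2.9957322736f), 1.6094379124f);
    memmove(&pred_err[gain_weight], pred_err,
            sizeof(*pred_err) * (6 - gain_weight));
    for (n = 0; n < gain_weight; n++)
        pred_err[n] = err;

    return expf(acc - 5.2409161640f + log_gain_cb);
}

int main(void)
{
    float hist[6] = { 0 };
    printf("fcb_gain = %f\n", predict_fcb_gain(hist, 1.5f, 2));
    return 0;
}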
1424
1425
/**
1426
 * Parse data in a single block.
1427
 *
1428
 * @param s WMA Voice decoding context private data
1429
 * @param gb bit I/O context
1430
 * @param block_idx index of the to-be-read block
1431
 * @param size amount of samples to be read in this block
1432
 * @param block_pitch_sh2 pitch for this block << 2
1433
 * @param lsps LSPs for (the end of) this frame
1434
 * @param prev_lsps LSPs for the last frame
1435
 * @param frame_desc frame type descriptor
1436
 * @param excitation target memory for the ACB+FCB interpolated signal
1437
 * @param synth target memory for the speech synthesis filter output
1438
1439
 */
1440
10783
static void synth_block(WMAVoiceContext *s, GetBitContext *gb,
1441
                        int block_idx, int size,
1442
                        int block_pitch_sh2,
1443
                        const double *lsps, const double *prev_lsps,
1444
                        const struct frame_type_desc *frame_desc,
1445
                        float *excitation, float *synth)
1446
{
1447
    double i_lsps[MAX_LSPS];
1448
    float lpcs[MAX_LSPS];
1449
    float fac;
1450
    int n;
1451
1452
10783
    if (frame_desc->acb_type == ACB_TYPE_NONE)
1453
1043
        synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
1454
    else
1455
9740
        synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
1456
                            frame_desc, excitation);
1457
1458
    /* convert interpolated LSPs to LPCs */
1459
10783
    fac = (block_idx + 0.5) / frame_desc->n_blocks;
1460
151559
    for (n = 0; n < s->lsps; n++) // LSF -> LSP
1461
140776
        i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
1462
10783
    ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1463
1464
    /* Speech synthesis */
1465
10783
    ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
1466
10783
}
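/* Editor's sketch (not part of wmavoice.c): the per-block LSP interpolation
 * done just above.  Each block evaluates the line spectral frequencies at
 * its centre, (block_idx + 0.5) / n_blocks of the way from the previous
 * frame's values to the current frame's, and maps them to the cosine
 * domain; the LSP->LPC step (ff_acelp_lspd2lpc() in the decoder) is
 * omitted here.  The frequencies in main() are made up. */
#include <math.h>
#include <stdio.h>

static void interpolate_lsps(double *lsp, const double *prev_lsf,
                             const double *cur_lsf, int order,
                             int block_idx, int n_blocks)
{
    double fac = (block_idx + 0.5) / n_blocks;
    for (int n = 0; n < order; n++)
        lsp[n] = cos(prev_lsf[n] + fac * (cur_lsf[n] - prev_lsf[n]));
}

int main(void)
{
    double prev[2] = { 0.30, 0.80 }, cur[2] = { 0.40, 0.90 }, lsp[2];
    interpolate_lsps(lsp, prev, cur, 2, 0, 2);     /* first of two blocks */
    printf("%f %f\n", lsp[0], lsp[1]);
    return 0;
}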
1467
1468
/**
1469
 * Synthesize output samples for a single frame.
1470
 *
1471
 * @param ctx WMA Voice decoder context
1472
 * @param gb bit I/O context (s->gb or one for cross-packet superframes)
1473
 * @param frame_idx Frame number within superframe [0-2]
1474
 * @param samples pointer to output sample buffer, has space for at least 160
1475
 *                samples
1476
 * @param lsps LSP array
1477
 * @param prev_lsps array of previous frame's LSPs
1478
 * @param excitation target buffer for excitation signal
1479
 * @param synth target buffer for synthesized speech data
1480
 * @return 0 on success, <0 on error.
1481
 */
1482
3306
static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
1483
                       float *samples,
1484
                       const double *lsps, const double *prev_lsps,
1485
                       float *excitation, float *synth)
1486
{
1487
3306
    WMAVoiceContext *s = ctx->priv_data;
1488
3306
    int n, n_blocks_x2, log_n_blocks_x2, av_uninit(cur_pitch_val);
1489
3306
    int pitch[MAX_BLOCKS], av_uninit(last_block_pitch);
1490
1491
    /* Parse frame type ("frame header"), see frame_descs */
1492
3306
    int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)], block_nsamples;
1493
1494
3306
    if (bd_idx < 0) {
1495
        av_log(ctx, AV_LOG_ERROR,
1496
               "Invalid frame type VLC code, skipping\n");
1497
        return AVERROR_INVALIDDATA;
1498
    }
1499
1500
3306
    block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
1501
1502
    /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
1503
3306
    if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
1504
        /* Pitch is provided per frame, which is interpreted as the pitch of
1505
         * the last sample of the last block of this frame. We can interpolate
1506
         * the pitch of other blocks (and even pitch-per-sample) by gradually
1507
         * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
1508
560
        n_blocks_x2      = frame_descs[bd_idx].n_blocks << 1;
1509
560
        log_n_blocks_x2  = frame_descs[bd_idx].log_n_blocks + 1;
1510
560
        cur_pitch_val    = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
1511
560
        cur_pitch_val    = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
1512
560
        if (s->last_acb_type == ACB_TYPE_NONE ||
1513
524
            20 * abs(cur_pitch_val - s->last_pitch_val) >
1514
524
                (cur_pitch_val + s->last_pitch_val))
1515
138
            s->last_pitch_val = cur_pitch_val;
1516
1517
        /* pitch per block */
1518
1836
        for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1519
1276
            int fac = n * 2 + 1;
1520
1521
1276
            pitch[n] = (MUL16(fac,                 cur_pitch_val) +
1522
1276
                        MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
1523
1276
                        frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
1524
        }
1525
1526
        /* "pitch-diff-per-sample" for calculation of pitch per sample */
1527
560
        s->pitch_diff_sh16 =
1528
560
            (cur_pitch_val - s->last_pitch_val) * (1 << 16) / MAX_FRAMESIZE;
1529
    }
1530
1531
    /* Global gain (if silence) and pitch-adaptive window coordinates */
1532
3306
    switch (frame_descs[bd_idx].fcb_type) {
1533
499
    case FCB_TYPE_SILENCE:
1534
499
        s->silence_gain = wmavoice_gain_silence[get_bits(gb, 8)];
1535
499
        break;
1536
341
    case FCB_TYPE_AW_PULSES:
1537
341
        aw_parse_coords(s, gb, pitch);
1538
341
        break;
1539
    }
1540
1541
14089
    for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1542
        int bl_pitch_sh2;
1543
1544
        /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
1545
10783
        switch (frame_descs[bd_idx].acb_type) {
1546
8464
        case ACB_TYPE_HAMMING: {
1547
            /* Pitch is given per block. Per-block pitches are encoded as an
1548
             * absolute value for the first block, and then delta values
1549
 * relative to this value for all subsequent blocks. The scale of
1550
             * this pitch value is semi-logarithmic compared to its use in the
1551
             * decoder, so we convert it to normal scale also. */
1552
            int block_pitch,
1553
8464
                t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
1554
8464
                t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
1555
8464
                t3 =  s->block_conv_table[3] - s->block_conv_table[2] + 1;
1556
1557
8464
            if (n == 0) {
1558
1975
                block_pitch = get_bits(gb, s->block_pitch_nbits);
1559
            } else
1560
6489
                block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
1561
6489
                                 get_bits(gb, s->block_delta_pitch_nbits);
1562
            /* Convert last_ so that any next delta is within _range */
1563
8464
            last_block_pitch = av_clip(block_pitch,
1564
                                       s->block_delta_pitch_hrange,
1565
8464
                                       s->block_pitch_range -
1566
8464
                                           s->block_delta_pitch_hrange);
1567
1568
            /* Convert semi-log-style scale back to normal scale */
1569
8464
            if (block_pitch < t1) {
1570
1491
                bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
1571
            } else {
1572
6973
                block_pitch -= t1;
1573
6973
                if (block_pitch < t2) {
1574
5712
                    bl_pitch_sh2 =
1575
5712
                        (s->block_conv_table[1] << 2) + (block_pitch << 1);
1576
                } else {
1577
1261
                    block_pitch -= t2;
1578
1261
                    if (block_pitch < t3) {
1579
1261
                        bl_pitch_sh2 =
1580
1261
                            (s->block_conv_table[2] + block_pitch) << 2;
1581
                    } else
1582
                        bl_pitch_sh2 = s->block_conv_table[3] << 2;
1583
                }
1584
            }
1585
8464
            pitch[n] = bl_pitch_sh2 >> 2;
1586
8464
            break;
1587
        }
1588
1589
1276
        case ACB_TYPE_ASYMMETRIC: {
1590
1276
            bl_pitch_sh2 = pitch[n] << 2;
1591
1276
            break;
1592
        }
1593
1594
1043
        default: // ACB_TYPE_NONE has no pitch
1595
1043
            bl_pitch_sh2 = 0;
1596
1043
            break;
1597
        }
1598
1599
10783
        synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
1600
                    lsps, prev_lsps, &frame_descs[bd_idx],
1601
10783
                    &excitation[n * block_nsamples],
1602
10783
                    &synth[n * block_nsamples]);
1603
    }
1604
1605
    /* Averaging projection filter, if applicable. Else, just copy samples
1606
     * from synthesis buffer */
1607
3306
    if (s->do_apf) {
1608
        double i_lsps[MAX_LSPS];
1609
        float lpcs[MAX_LSPS];
1610
1611
46266
        for (n = 0; n < s->lsps; n++) // LSF -> LSP
1612
42960
            i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
1613
3306
        ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1614
3306
        postfilter(s, synth, samples, 80, lpcs,
1615
3306
                   &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx],
1616
3306
                   frame_descs[bd_idx].fcb_type, pitch[0]);
1617
1618
46266
        for (n = 0; n < s->lsps; n++) // LSF -> LSP
1619
42960
            i_lsps[n] = cos(lsps[n]);
1620
3306
        ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1621
3306
        postfilter(s, &synth[80], &samples[80], 80, lpcs,
1622
3306
                   &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80],
1623
3306
                   frame_descs[bd_idx].fcb_type, pitch[0]);
1624
    } else
1625
        memcpy(samples, synth, 160 * sizeof(synth[0]));
1626
1627
    /* Cache values for next frame */
1628
3306
    s->frame_cntr++;
1629
3306
    if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
1630
3306
    s->last_acb_type = frame_descs[bd_idx].acb_type;
1631

3306
    switch (frame_descs[bd_idx].acb_type) {
1632
771
    case ACB_TYPE_NONE:
1633
771
        s->last_pitch_val = 0;
1634
771
        break;
1635
560
    case ACB_TYPE_ASYMMETRIC:
1636
560
        s->last_pitch_val = cur_pitch_val;
1637
560
        break;
1638
1975
    case ACB_TYPE_HAMMING:
1639
1975
        s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
1640
1975
        break;
1641
    }
1642
1643
3306
    return 0;
1644
}
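/* Editor's sketch (not part of wmavoice.c): the semi-logarithmic pitch
 * decoding in the ACB_TYPE_HAMMING branch above.  The coded block pitch
 * runs through three ranges derived from block_conv_table: quarter-sample
 * steps up to t1, half-sample steps up to t2, whole-sample steps up to t3,
 * and a saturated value beyond that; the result is the pitch scaled by 4
 * (hence the _sh2 suffix).  The conv[] values below are placeholders, not
 * a real block_conv_table. */
#include <stdint.h>
#include <stdio.h>

static int block_pitch_to_sh2(int bp, const uint16_t conv[4])
{
    int t1 = (conv[1] - conv[0]) << 2,
        t2 = (conv[2] - conv[1]) << 1,
        t3 =  conv[3] - conv[2] + 1;

    if (bp < t1) return (conv[0] << 2) +  bp;        /* 0.25-sample steps */
    bp -= t1;
    if (bp < t2) return (conv[1] << 2) + (bp << 1);  /* 0.5-sample steps */
    bp -= t2;
    if (bp < t3) return (conv[2] + bp) << 2;         /* 1-sample steps */
    return conv[3] << 2;                             /* saturate */
}

int main(void)
{
    static const uint16_t conv[4] = { 20, 40, 80, 160 };  /* hypothetical */
    for (int bp = 0; bp < 300; bp += 60)
        printf("coded %3d -> pitch %g\n", bp, block_pitch_to_sh2(bp, conv) / 4.0);
    return 0;
}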
1645
1646
/**
1647
 * Ensure a minimum value for the first item, a maximum value for the last,
1648
 * proper spacing between each value and proper ordering.
1649
 *
1650
 * @param lsps array of LSPs
1651
 * @param num size of LSP array
1652
 *
1653
 * @note basically a double version of #ff_acelp_reorder_lsf(), might be
1654
 *       useful to put in a generic location later on. Parts are also
1655
 *       present in #ff_set_min_dist_lsf() + #ff_sort_nearly_sorted_floats(),
1656
 *       which operate on floats.
1657
 */
1658
3306
static void stabilize_lsps(double *lsps, int num)
1659
{
1660
    int n, m, l;
1661
1662
    /* set minimum value for first, maximum value for last and minimum
1663
     * spacing between LSF values.
1664
     * Very similar to ff_set_min_dist_lsf(), but in double. */
1665
3306
    lsps[0]       = FFMAX(lsps[0],       0.0015 * M_PI);
1666
42960
    for (n = 1; n < num; n++)
1667
39654
        lsps[n]   = FFMAX(lsps[n],       lsps[n - 1] + 0.0125 * M_PI);
1668
3306
    lsps[num - 1] = FFMIN(lsps[num - 1], 0.9985 * M_PI);
1669
1670
    /* reorder (looks like one-time / non-recursed bubblesort).
1671
     * Very similar to ff_sort_nearly_sorted_floats(), but in double. */
1672
42960
    for (n = 1; n < num; n++) {
1673
39654
        if (lsps[n] < lsps[n - 1]) {
1674
            for (m = 1; m < num; m++) {
1675
                double tmp = lsps[m];
1676
                for (l = m - 1; l >= 0; l--) {
1677
                    if (lsps[l] <= tmp) break;
1678
                    lsps[l + 1] = lsps[l];
1679
                }
1680
                lsps[l + 1] = tmp;
1681
            }
1682
            break;
1683
        }
1684
    }
1685
3306
}
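/* Editor's sketch (not part of wmavoice.c): the minimum-spacing part of
 * stabilize_lsps() above, applied to a hypothetical 4-entry LSP set.  The
 * first value is raised to at least 0.0015*pi, every value must exceed its
 * predecessor by 0.0125*pi, and the last value is capped at 0.9985*pi; the
 * reorder (bubble) pass of the real function is omitted because this
 * example is already sorted after the spacing pass. */
#include <math.h>
#include <stdio.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

int main(void)
{
    double lsps[4] = { 0.0010 * M_PI, 0.0200 * M_PI, 0.0210 * M_PI, 0.9990 * M_PI };
    int num = 4, n;

    lsps[0]       = fmax(lsps[0], 0.0015 * M_PI);
    for (n = 1; n < num; n++)
        lsps[n]   = fmax(lsps[n], lsps[n - 1] + 0.0125 * M_PI);
    lsps[num - 1] = fmin(lsps[num - 1], 0.9985 * M_PI);

    for (n = 0; n < num; n++)
        printf("lsps[%d] = %.4f * pi\n", n, lsps[n] / M_PI);
    return 0;
}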
1686
1687
/**
1688
 * Synthesize output samples for a single superframe. If we have any data
1689
 * cached in s->sframe_cache, that will be used instead of whatever is loaded
1690
 * in s->gb.
1691
 *
1692
 * WMA Voice superframes contain 3 frames, each containing 160 audio samples,
1693
 * to give a total of 480 samples per superframe. See #synth_frame() for frame
1694
 * parsing. In addition to 3 frames, superframes can also contain the LSPs
1695
 * (when these are specified globally, as residuals, for all frames rather
1696
 * than individually per frame; see the s->has_residual_lsps option), and
1697
 * can specify the number of samples encoded in this superframe
1698
 * (if less than 480), usually used to prevent blanks at track boundaries.
1699
 *
1700
 * @param ctx WMA Voice decoder context
1701
 * @return 0 on success, <0 on error or 1 if there was not enough data to
1702
 *         fully parse the superframe
1703
 */
1704
1102
static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
1705
                            int *got_frame_ptr)
1706
{
1707
1102
    WMAVoiceContext *s = ctx->priv_data;
1708
1102
    GetBitContext *gb = &s->gb, s_gb;
1709
1102
    int n, res, n_samples = MAX_SFRAMESIZE;
1710
    double lsps[MAX_FRAMES][MAX_LSPS];
1711
2204
    const double *mean_lsf = s->lsps == 16 ?
1712
1102
        wmavoice_mean_lsf16[s->lsp_def_mode] : wmavoice_mean_lsf10[s->lsp_def_mode];
1713
    float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
1714
    float synth[MAX_LSPS + MAX_SFRAMESIZE];
1715
    float *samples;
1716
1717
1102
    memcpy(synth,      s->synth_history,
1718
1102
           s->lsps             * sizeof(*synth));
1719
1102
    memcpy(excitation, s->excitation_history,
1720
1102
           s->history_nsamples * sizeof(*excitation));
1721
1722
1102
    if (s->sframe_cache_size > 0) {
1723
185
        gb = &s_gb;
1724
185
        init_get_bits(gb, s->sframe_cache, s->sframe_cache_size);
1725
185
        s->sframe_cache_size = 0;
1726
    }
1727
1728
    /* The first bit is the speech/music bit; it differentiates between WMAVoice
1729
     * speech samples (the actual codec) and WMAVoice music samples, which
1730
     * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
1731
     * the wild yet. */
1732
1102
    if (!get_bits1(gb)) {
1733
        avpriv_request_sample(ctx, "WMAPro-in-WMAVoice");
1734
        return AVERROR_PATCHWELCOME;
1735
    }
1736
1737
    /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
1738
1102
    if (get_bits1(gb)) {
1739
3
        if ((n_samples = get_bits(gb, 12)) > MAX_SFRAMESIZE) {
1740
            av_log(ctx, AV_LOG_ERROR,
1741
                   "Superframe encodes > %d samples (%d), not allowed\n",
1742
                   MAX_SFRAMESIZE, n_samples);
1743
            return AVERROR_INVALIDDATA;
1744
        }
1745
    }
1746
1747
    /* Parse LSPs, if global for the superframe (can also be per-frame). */
1748
1102
    if (s->has_residual_lsps) {
1749
        double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
1750
1751
15422
        for (n = 0; n < s->lsps; n++)
1752
14320
            prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
1753
1754
1102
        if (s->lsps == 10) {
1755
552
            dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1756
        } else /* s->lsps == 16 */
1757
550
            dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1758
1759
15422
        for (n = 0; n < s->lsps; n++) {
1760
14320
            lsps[0][n]  = mean_lsf[n] + (a1[n]           - a2[n * 2]);
1761
14320
            lsps[1][n]  = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
1762
14320
            lsps[2][n] += mean_lsf[n];
1763
        }
1764
4408
        for (n = 0; n < 3; n++)
1765
3306
            stabilize_lsps(lsps[n], s->lsps);
1766
    }
1767
1768
    /* synth_superframe can run multiple times per packet, so
1769
     * free any potential previous frame first */
1770
1102
    av_frame_unref(frame);
1771
1772
    /* get output buffer */
1773
1102
    frame->nb_samples = MAX_SFRAMESIZE;
1774
1102
    if ((res = ff_get_buffer(ctx, frame, 0)) < 0)
1775
        return res;
1776
1102
    frame->nb_samples = n_samples;
1777
1102
    samples = (float *)frame->data[0];
1778
1779
    /* Parse frames, optionally preceded by per-frame (independent) LSPs. */
1780
4408
    for (n = 0; n < 3; n++) {
1781
3306
        if (!s->has_residual_lsps) {
1782
            int m;
1783
1784
            if (s->lsps == 10) {
1785
                dequant_lsp10i(gb, lsps[n]);
1786
            } else /* s->lsps == 16 */
1787
                dequant_lsp16i(gb, lsps[n]);
1788
1789
            for (m = 0; m < s->lsps; m++)
1790
                lsps[n][m] += mean_lsf[m];
1791
            stabilize_lsps(lsps[n], s->lsps);
1792
        }
1793
1794
4408
        if ((res = synth_frame(ctx, gb, n,
1795
3306
                               &samples[n * MAX_FRAMESIZE],
1796
3306
                               lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
1797
3306
                               &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
1798
3306
                               &synth[s->lsps + n * MAX_FRAMESIZE]))) {
1799
            *got_frame_ptr = 0;
1800
            return res;
1801
        }
1802
    }
1803
1804
    /* Statistics? FIXME - we don't check for length, a slight overrun
1805
     * will be caught by internal buffer padding, and anything else
1806
     * will be skipped, not read. */
1807
1102
    if (get_bits1(gb)) {
1808
        res = get_bits(gb, 4);
1809
        skip_bits(gb, 10 * (res + 1));
1810
    }
1811
1812
1102
    if (get_bits_left(gb) < 0) {
1813
        wmavoice_flush(ctx);
1814
        return AVERROR_INVALIDDATA;
1815
    }
1816
1817
1102
    *got_frame_ptr = 1;
1818
1819
    /* Update history */
1820
1102
    memcpy(s->prev_lsps,           lsps[2],
1821
1102
           s->lsps             * sizeof(*s->prev_lsps));
1822
1102
    memcpy(s->synth_history,      &synth[MAX_SFRAMESIZE],
1823
1102
           s->lsps             * sizeof(*synth));
1824
1102
    memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
1825
1102
           s->history_nsamples * sizeof(*excitation));
1826
1102
    if (s->do_apf)
1827
1102
        memmove(s->zero_exc_pf,       &s->zero_exc_pf[MAX_SFRAMESIZE],
1828
1102
                s->history_nsamples * sizeof(*s->zero_exc_pf));
1829
1830
1102
    return 0;
1831
}
1832
1833
/**
1834
 * Parse the packet header at the start of each packet (input data to this
1835
 * decoder).
1836
 *
1837
 * @param s WMA Voice decoding context private data
1838
 * @return <0 on error, nb_superframes on success.
1839
 */
1840
186
static int parse_packet_header(WMAVoiceContext *s)
1841
{
1842
186
    GetBitContext *gb = &s->gb;
1843
186
    unsigned int res, n_superframes = 0;
1844
1845
186
    skip_bits(gb, 4);          // packet sequence number
1846
186
    s->has_residual_lsps = get_bits1(gb);
1847
    do {
1848
186
        if (get_bits_left(gb) < 6 + s->spillover_bitsize)
1849
            return AVERROR_INVALIDDATA;
1850
1851
186
        res = get_bits(gb, 6); // number of superframes per packet
1852
                               // (minus first one if there is spillover)
1853
186
        n_superframes += res;
1854
186
    } while (res == 0x3F);
1855
186
    s->spillover_nbits   = get_bits(gb, s->spillover_bitsize);
1856
1857
186
    return get_bits_left(gb) >= 0 ? n_superframes : AVERROR_INVALIDDATA;
1858
}
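/* Editor's sketch (not part of wmavoice.c): the superframe-count coding
 * handled in parse_packet_header() above.  The count is read as 6-bit
 * groups, and a group value of 0x3F means "add 63 and keep reading".  The
 * array below stands in for successive get_bits(gb, 6) calls; the coded
 * values are made up. */
#include <stdio.h>

int main(void)
{
    static const unsigned coded[] = { 0x3F, 0x3F, 0x05 };
    unsigned n_superframes = 0, res, i = 0;

    do {
        res = coded[i++];        /* stands in for get_bits(gb, 6) */
        n_superframes += res;
    } while (res == 0x3F);

    printf("%u superframes in this packet\n", n_superframes); /* 63+63+5 = 131 */
    return 0;
}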
1859
1860
/**
1861
 * Copy (unaligned) bits from gb/data/size to pb.
1862
 *
1863
 * @param pb target buffer to copy bits into
1864
 * @param data source buffer to copy bits from
1865
 * @param size size of the source data, in bytes
1866
 * @param gb bit I/O context specifying the current position in the source
1867
 *           data. This function might use this to align the bit position to
1868
 *           a whole-byte boundary before calling #avpriv_copy_bits() on aligned
1869
 *           source data
1870
 * @param nbits the amount of bits to copy from source to target
1871
 *
1872
 * @note after calling this function, the current position in the input bit
1873
 *       I/O context is undefined.
1874
 */
1875
370
static void copy_bits(PutBitContext *pb,
1876
                      const uint8_t *data, int size,
1877
                      GetBitContext *gb, int nbits)
1878
{
1879
    int rmn_bytes, rmn_bits;
1880
1881
370
    rmn_bits = rmn_bytes = get_bits_left(gb);
1882
370
    if (rmn_bits < nbits)
1883
        return;
1884
370
    if (nbits > pb->size_in_bits - put_bits_count(pb))
1885
        return;
1886
370
    rmn_bits &= 7; rmn_bytes >>= 3;
1887
370
    if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
1888
290
        put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
1889
370
    avpriv_copy_bits(pb, data + size - rmn_bytes,
1890
370
                 FFMIN(nbits - rmn_bits, rmn_bytes << 3));
1891
}
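/* Editor's sketch (not part of wmavoice.c): what copy_bits() achieves at
 * the bit level - moving nbits from an arbitrary bit position in a source
 * buffer to an arbitrary bit position in a destination buffer, MSB first.
 * The decoder does this far more efficiently with get_bits()/put_bits()
 * for the unaligned head and avpriv_copy_bits() for the byte-aligned
 * remainder; this naive loop only shows the semantics. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static void copy_bits_naive(uint8_t *dst, size_t dst_bit,
                            const uint8_t *src, size_t src_bit, size_t nbits)
{
    for (size_t i = 0; i < nbits; i++, src_bit++, dst_bit++) {
        unsigned bit  = (src[src_bit >> 3] >> (7 - (src_bit & 7))) & 1;
        unsigned mask = 1u << (7 - (dst_bit & 7));
        dst[dst_bit >> 3] = (uint8_t)((dst[dst_bit >> 3] & ~mask) | (bit ? mask : 0));
    }
}

int main(void)
{
    uint8_t src[2] = { 0xAB, 0xCD }, dst[2] = { 0, 0 };
    copy_bits_naive(dst, 3, src, 5, 9);    /* copy 9 bits from bit 5 to bit 3 */
    printf("%02X %02X\n", dst[0], dst[1]); /* prints 0F 30 */
    return 0;
}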
1892
1893
/**
1894
 * Packet decoding: a packet is anything that the (ASF) demuxer contains,
1895
 * and we expect that the demuxer / application provides it to us as such
1896
 * (else you'll probably get garbage as output). Every packet has a size of
1897
 * ctx->block_align bytes, starts with a packet header (see
1898
 * #parse_packet_header()), and then a series of superframes. Superframe
1899
 * boundaries may exceed packets, i.e. superframes can split data over
1900
 * multiple (two) packets.
1901
 *
1902
 * For more information about frames, see #synth_superframe().
1903
 */
1904
1291
static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
1905
                                  int *got_frame_ptr, AVPacket *avpkt)
1906
{
1907
1291
    WMAVoiceContext *s = ctx->priv_data;
1908
1291
    GetBitContext *gb = &s->gb;
1909
    int size, res, pos;
1910
1911
    /* Packets are sometimes a multiple of ctx->block_align, with a packet
1912
     * header at each ctx->block_align bytes. However, FFmpeg's ASF demuxer
1913
     * feeds us ASF packets, which may concatenate multiple "codec" packets
1914
     * in a single "muxer" packet, so we artificially emulate that by
1915
     * capping the packet size at ctx->block_align. */
1916
1471
    for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
1917
1291
    init_get_bits8(&s->gb, avpkt->data, size);
1918
1919
    /* size == ctx->block_align is used to indicate whether we are dealing with
1920
     * a new packet or a packet of which we already read the packet header
1921
     * previously. */
1922
1291
    if (!(size % ctx->block_align)) { // new packet header
1923
191
        if (!size) {
1924
5
            s->spillover_nbits = 0;
1925
5
            s->nb_superframes = 0;
1926
        } else {
1927
186
            if ((res = parse_packet_header(s)) < 0)
1928
                return res;
1929
186
            s->nb_superframes = res;
1930
        }
1931
1932
        /* If the packet header specifies spillover (s->spillover_nbits), then we want
1933
         * to push out all data of the previous packet (+ spillover) before
1934
         * continuing to parse new superframes in the current packet. */
1935
191
        if (s->sframe_cache_size > 0) {
1936
185
            int cnt = get_bits_count(gb);
1937
185
            if (cnt + s->spillover_nbits > avpkt->size * 8) {
1938
                s->spillover_nbits = avpkt->size * 8 - cnt;
1939
            }
1940
185
            copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
1941
185
            flush_put_bits(&s->pb);
1942
185
            s->sframe_cache_size += s->spillover_nbits;
1943
185
            if ((res = synth_superframe(ctx, data, got_frame_ptr)) == 0 &&
1944
185
                *got_frame_ptr) {
1945
185
                cnt += s->spillover_nbits;
1946
185
                s->skip_bits_next = cnt & 7;
1947
185
                res = cnt >> 3;
1948
185
                return res;
1949
            } else
1950
                skip_bits_long (gb, s->spillover_nbits - cnt +
1951
                                get_bits_count(gb)); // resync
1952
6
        } else if (s->spillover_nbits) {
1953
            skip_bits_long(gb, s->spillover_nbits);  // resync
1954
        }
1955
1100
    } else if (s->skip_bits_next)
1956
971
        skip_bits(gb, s->skip_bits_next);
1957
1958
    /* Try parsing superframes in current packet */
1959
1106
    s->sframe_cache_size = 0;
1960
1106
    s->skip_bits_next = 0;
1961
1106
    pos = get_bits_left(gb);
1962
1106
    if (s->nb_superframes-- == 0) {
1963
4
        *got_frame_ptr = 0;
1964
4
        return size;
1965
1102
    } else if (s->nb_superframes > 0) {
1966
917
        if ((res = synth_superframe(ctx, data, got_frame_ptr)) < 0) {
1967
            return res;
1968
917
        } else if (*got_frame_ptr) {
1969
917
            int cnt = get_bits_count(gb);
1970
917
            s->skip_bits_next = cnt & 7;
1971
917
            res = cnt >> 3;
1972
917
            return res;
1973
        }
1974
185
    } else if ((s->sframe_cache_size = pos) > 0) {
1975
        /* ... cache it for spillover in next packet */
1976
185
        init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE);
1977
185
        copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
1978
        // FIXME bad - just copy whole bytes and use the
1979
        // skip_bits_next field
1980
    }
1981
1982
185
    return size;
1983
}
1984
1985
8
static av_cold int wmavoice_decode_end(AVCodecContext *ctx)
1986
{
1987
8
    WMAVoiceContext *s = ctx->priv_data;
1988
1989
8
    if (s->do_apf) {
1990
8
        ff_rdft_end(&s->rdft);
1991
8
        ff_rdft_end(&s->irdft);
1992
8
        ff_dct_end(&s->dct);
1993
8
        ff_dct_end(&s->dst);
1994
    }
1995
1996
8
    return 0;
1997
}
1998
1999
AVCodec ff_wmavoice_decoder = {
2000
    .name             = "wmavoice",
2001
    .long_name        = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),
2002
    .type             = AVMEDIA_TYPE_AUDIO,
2003
    .id               = AV_CODEC_ID_WMAVOICE,
2004
    .priv_data_size   = sizeof(WMAVoiceContext),
2005
    .init             = wmavoice_decode_init,
2006
    .close            = wmavoice_decode_end,
2007
    .decode           = wmavoice_decode_packet,
2008
    .capabilities     = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
2009
    .caps_internal    = FF_CODEC_CAP_INIT_CLEANUP,
2010
    .flush            = wmavoice_flush,
2011
};