GCC Code Coverage Report
Directory: ../../../ffmpeg/                        Exec   Total   Coverage
File:      src/libavcodec/wmavoice.c    Lines:      664     753     88.2 %
Date:      2021-04-14 23:45:22          Branches:   305     390     78.2 %

Line Branch Exec Source
1
/*
2
 * Windows Media Audio Voice decoder.
3
 * Copyright (c) 2009 Ronald S. Bultje
4
 *
5
 * This file is part of FFmpeg.
6
 *
7
 * FFmpeg is free software; you can redistribute it and/or
8
 * modify it under the terms of the GNU Lesser General Public
9
 * License as published by the Free Software Foundation; either
10
 * version 2.1 of the License, or (at your option) any later version.
11
 *
12
 * FFmpeg is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
 * Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public
18
 * License along with FFmpeg; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
 */
21
22
/**
23
 * @file
24
 * @brief Windows Media Audio Voice compatible decoder
25
 * @author Ronald S. Bultje <rsbultje@gmail.com>
26
 */
27
28
#include <math.h>
29
30
#include "libavutil/channel_layout.h"
31
#include "libavutil/float_dsp.h"
32
#include "libavutil/mem_internal.h"
33
#include "libavutil/thread.h"
34
#include "avcodec.h"
35
#include "internal.h"
36
#include "get_bits.h"
37
#include "put_bits.h"
38
#include "wmavoice_data.h"
39
#include "celp_filters.h"
40
#include "acelp_vectors.h"
41
#include "acelp_filters.h"
42
#include "lsp.h"
43
#include "dct.h"
44
#include "rdft.h"
45
#include "sinewin.h"
46
47
#define MAX_BLOCKS           8   ///< maximum number of blocks per frame
48
#define MAX_LSPS             16  ///< maximum filter order
49
#define MAX_LSPS_ALIGN16     16  ///< same as #MAX_LSPS; needs to be multiple
50
                                 ///< of 16 for ASM input buffer alignment
51
#define MAX_FRAMES           3   ///< maximum number of frames per superframe
52
#define MAX_FRAMESIZE        160 ///< maximum number of samples per frame
53
#define MAX_SIGNAL_HISTORY   416 ///< maximum excitation signal history
54
#define MAX_SFRAMESIZE       (MAX_FRAMESIZE * MAX_FRAMES)
55
                                 ///< maximum number of samples per superframe
56
#define SFRAME_CACHE_MAXSIZE 256 ///< maximum cache size for frame data that
57
                                 ///< was split over two packets
58
#define VLC_NBITS            6   ///< number of bits to read per VLC iteration
59
60
/**
61
 * Frame type VLC coding.
62
 */
63
static VLC frame_type_vlc;
64
65
/**
66
 * Adaptive codebook types.
67
 */
68
enum {
69
    ACB_TYPE_NONE       = 0, ///< no adaptive codebook (only hardcoded fixed)
70
    ACB_TYPE_ASYMMETRIC = 1, ///< adaptive codebook with per-frame pitch, which
71
                             ///< we interpolate to get a per-sample pitch.
72
                             ///< Signal is generated using an asymmetric sinc
73
                             ///< window function
74
                             ///< @note see #wmavoice_ipol1_coeffs
75
    ACB_TYPE_HAMMING    = 2  ///< Per-block pitch with signal generation using
76
                             ///< a Hamming sinc window function
77
                             ///< @note see #wmavoice_ipol2_coeffs
78
};
79
80
/**
81
 * Fixed codebook types.
82
 */
83
enum {
84
    FCB_TYPE_SILENCE    = 0, ///< comfort noise during silence
85
                             ///< generated from a hardcoded (fixed) codebook
86
                             ///< with per-frame (low) gain values
87
    FCB_TYPE_HARDCODED  = 1, ///< hardcoded (fixed) codebook with per-block
88
                             ///< gain values
89
    FCB_TYPE_AW_PULSES  = 2, ///< Pitch-adaptive window (AW) pulse signals,
90
                             ///< used in particular for low-bitrate streams
91
    FCB_TYPE_EXC_PULSES = 3, ///< Innovation (fixed) codebook pulse sets in
92
                             ///< combinations of either single pulses or
93
                             ///< pulse pairs
94
};
95
96
/**
97
 * Description of frame types.
98
 */
99
static const struct frame_type_desc {
100
    uint8_t n_blocks;     ///< amount of blocks per frame (each block
101
                          ///< contains 160/#n_blocks samples)
102
    uint8_t log_n_blocks; ///< log2(#n_blocks)
103
    uint8_t acb_type;     ///< Adaptive codebook type (ACB_TYPE_*)
104
    uint8_t fcb_type;     ///< Fixed codebook type (FCB_TYPE_*)
105
    uint8_t dbl_pulses;   ///< how many pulse vectors have pulse pairs
106
                          ///< (rather than just one single pulse)
107
                          ///< only if #fcb_type == #FCB_TYPE_EXC_PULSES
108
} frame_descs[17] = {
109
    { 1, 0, ACB_TYPE_NONE,       FCB_TYPE_SILENCE,    0 },
110
    { 2, 1, ACB_TYPE_NONE,       FCB_TYPE_HARDCODED,  0 },
111
    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES,  0 },
112
    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2 },
113
    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5 },
114
    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0 },
115
    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2 },
116
    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5 },
117
    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0 },
118
    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2 },
119
    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5 },
120
    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0 },
121
    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2 },
122
    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5 },
123
    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0 },
124
    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2 },
125
    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5 }
126
};
127
128
/**
129
 * WMA Voice decoding context.
130
 */
131
typedef struct WMAVoiceContext {
132
    /**
133
     * @name Global values specified in the stream header / extradata or used all over.
134
     * @{
135
     */
136
    GetBitContext gb;             ///< packet bitreader. During decoder init,
137
                                  ///< it contains the extradata from the
138
                                  ///< demuxer. During decoding, it contains
139
                                  ///< packet data.
140
    int8_t vbm_tree[25];          ///< converts VLC codes to frame type
141
142
    int spillover_bitsize;        ///< number of bits used to specify
143
                                  ///< #spillover_nbits in the packet header
144
                                  ///< = ceil(log2(ctx->block_align << 3))
145
    int history_nsamples;         ///< number of samples in history for signal
146
                                  ///< prediction (through ACB)
147
148
    /* postfilter specific values */
149
    int do_apf;                   ///< whether to apply the averaged
150
                                  ///< projection filter (APF)
151
    int denoise_strength;         ///< strength of denoising in Wiener filter
152
                                  ///< [0-11]
153
    int denoise_tilt_corr;        ///< Whether to apply tilt correction to the
154
                                  ///< Wiener filter coefficients (postfilter)
155
    int dc_level;                 ///< Predicted amount of DC noise, based
156
                                  ///< on which a DC removal filter is used
157
158
    int lsps;                     ///< number of LSPs per frame [10 or 16]
159
    int lsp_q_mode;               ///< defines quantizer defaults [0, 1]
160
    int lsp_def_mode;             ///< defines different sets of LSP defaults
161
                                  ///< [0, 1]
162
163
    int min_pitch_val;            ///< base value for pitch parsing code
164
    int max_pitch_val;            ///< max value + 1 for pitch parsing
165
    int pitch_nbits;              ///< number of bits used to specify the
166
                                  ///< pitch value in the frame header
167
    int block_pitch_nbits;        ///< number of bits used to specify the
168
                                  ///< first block's pitch value
169
    int block_pitch_range;        ///< range of the block pitch
170
    int block_delta_pitch_nbits;  ///< number of bits used to specify the
171
                                  ///< delta pitch between this and the last
172
                                  ///< block's pitch value, used in all but
173
                                  ///< first block
174
    int block_delta_pitch_hrange; ///< 1/2 range of the delta (full range is
175
                                  ///< from -this to +this-1)
176
    uint16_t block_conv_table[4]; ///< boundaries for block pitch unit/scale
177
                                  ///< conversion
178
179
    /**
180
     * @}
181
     *
182
     * @name Packet values specified in the packet header or related to a packet.
183
     *
184
     * A packet is considered to be a single unit of data provided to this
185
     * decoder by the demuxer.
186
     * @{
187
     */
188
    int spillover_nbits;          ///< number of bits of the previous packet's
189
                                  ///< last superframe preceding this
190
                                  ///< packet's first full superframe (useful
191
                                  ///< for re-synchronization also)
192
    int has_residual_lsps;        ///< if set, superframes contain one set of
193
                                  ///< LSPs that cover all frames, encoded as
194
                                  ///< independent and residual LSPs; if not
195
                                  ///< set, each frame contains its own, fully
196
                                  ///< independent, LSPs
197
    int skip_bits_next;           ///< number of bits to skip at the next call
198
                                  ///< to #wmavoice_decode_packet() (since
199
                                  ///< they're part of the previous superframe)
200
201
    uint8_t sframe_cache[SFRAME_CACHE_MAXSIZE + AV_INPUT_BUFFER_PADDING_SIZE];
202
                                  ///< cache for superframe data split over
203
                                  ///< multiple packets
204
    int sframe_cache_size;        ///< set to >0 if we have data from an
205
                                  ///< (incomplete) superframe from a previous
206
                                  ///< packet that spilled over into the current
207
                                  ///< packet; specifies the number of bits in
208
                                  ///< #sframe_cache
209
    PutBitContext pb;             ///< bitstream writer for #sframe_cache
210
211
    /**
212
     * @}
213
     *
214
     * @name Frame and superframe values
215
     * Superframe and frame data - these can change from frame to frame,
216
     * although some of them also serve as a cache / history for
217
     * the next frame or superframe.
218
     * @{
219
     */
220
    double prev_lsps[MAX_LSPS];   ///< LSPs of the last frame of the previous
221
                                  ///< superframe
222
    int last_pitch_val;           ///< pitch value of the previous frame
223
    int last_acb_type;            ///< frame type [0-2] of the previous frame
224
    int pitch_diff_sh16;          ///< ((cur_pitch_val - #last_pitch_val)
225
                                  ///< << 16) / #MAX_FRAMESIZE
226
    float silence_gain;           ///< set for use in blocks if #ACB_TYPE_NONE
227
228
    int aw_idx_is_ext;            ///< whether the AW index was encoded in
229
                                  ///< 8 bits (instead of 6)
230
    int aw_pulse_range;           ///< the range over which #aw_pulse_set1()
231
                                  ///< can apply the pulse, relative to the
232
                                  ///< value in aw_first_pulse_off. The exact
233
                                  ///< position of the first AW-pulse is within
234
                                  ///< [pulse_off, pulse_off + this], and
235
                                  ///< depends on bitstream values; [16 or 24]
236
    int aw_n_pulses[2];           ///< number of AW-pulses in each block; note
237
                                  ///< that this number can be negative (in
238
                                  ///< which case it basically means "zero")
239
    int aw_first_pulse_off[2];    ///< index of first sample to which to
240
                                  ///< apply AW-pulses, or -0xff if unset
241
    int aw_next_pulse_off_cache;  ///< the position (relative to start of the
242
                                  ///< second block) at which pulses should
243
                                  ///< start to be positioned, serves as a
244
                                  ///< cache for pitch-adaptive window pulses
245
                                  ///< between blocks
246
247
    int frame_cntr;               ///< current frame index [0 - 0xFFFE]; is
248
                                  ///< only used for comfort noise in #pRNG()
249
    int nb_superframes;           ///< number of superframes in current packet
250
    float gain_pred_err[6];       ///< cache for gain prediction
251
    float excitation_history[MAX_SIGNAL_HISTORY];
252
                                  ///< cache of the signal of previous
253
                                  ///< superframes, used as a history for
254
                                  ///< signal generation
255
    float synth_history[MAX_LSPS]; ///< see #excitation_history
256
    /**
257
     * @}
258
     *
259
     * @name Postfilter values
260
     *
261
     * Variables used for postfilter implementation, mostly history for
262
     * smoothing and so on, and context variables for FFT/iFFT.
263
     * @{
264
     */
265
    RDFTContext rdft, irdft;      ///< contexts for FFT-calculation in the
266
                                  ///< postfilter (for denoise filter)
267
    DCTContext dct, dst;          ///< contexts for phase shift (in Hilbert
268
                                  ///< transform, part of postfilter)
269
    float sin[511], cos[511];     ///< 8-bit cosine/sine windows over [-pi,pi]
270
                                  ///< range
271
    float postfilter_agc;         ///< gain control memory, used in
272
                                  ///< #adaptive_gain_control()
273
    float dcf_mem[2];             ///< DC filter history
274
    float zero_exc_pf[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];
275
                                  ///< zero filter output (i.e. excitation)
276
                                  ///< by postfilter
277
    float denoise_filter_cache[MAX_FRAMESIZE];
278
    int   denoise_filter_cache_size; ///< samples in #denoise_filter_cache
279
    DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80];
280
                                  ///< aligned buffer for LPC tilting
281
    DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x80];
282
                                  ///< aligned buffer for denoise coefficients
283
    DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
284
                                  ///< aligned buffer for postfilter speech
285
                                  ///< synthesis
286
    /**
287
     * @}
288
     */
289
} WMAVoiceContext;
290
291
/**
292
 * Set up the variable bit mode (VBM) tree from container extradata.
293
 * @param gb bit I/O context.
294
 *           The bit context (s->gb) should be loaded with byte 23-46 of the
295
 *           container extradata (i.e. the ones containing the VBM tree).
296
 * @param vbm_tree pointer to array to which the decoded VBM tree will be
297
 *                 written.
298
 * @return 0 on success, <0 on error.
299
 */
300
8
static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
301
{
302
8
    int cntr[8] = { 0 }, n, res;
303
304
8
    memset(vbm_tree, 0xff, sizeof(vbm_tree[0]) * 25);
305
144
    for (n = 0; n < 17; n++) {
306
136
        res = get_bits(gb, 3);
307
136
        if (cntr[res] > 3) // should be >= 3 + (res == 7)
308
            return -1;
309
136
        vbm_tree[res * 3 + cntr[res]++] = n;
310
    }
311
8
    return 0;
312
}
313
314
5
static av_cold void wmavoice_init_static_data(void)
315
{
316
    static const uint8_t bits[] = {
317
         2,  2,  2,  4,  4,  4,
318
         6,  6,  6,  8,  8,  8,
319
        10, 10, 10, 12, 12, 12,
320
        14, 14, 14, 14
321
    };
322
    static const uint16_t codes[] = {
323
          0x0000, 0x0001, 0x0002,        //              00/01/10
324
          0x000c, 0x000d, 0x000e,        //           11+00/01/10
325
          0x003c, 0x003d, 0x003e,        //         1111+00/01/10
326
          0x00fc, 0x00fd, 0x00fe,        //       111111+00/01/10
327
          0x03fc, 0x03fd, 0x03fe,        //     11111111+00/01/10
328
          0x0ffc, 0x0ffd, 0x0ffe,        //   1111111111+00/01/10
329
          0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx
330
    };
331
332
5
    INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),
333
                    bits, 1, 1, codes, 2, 2, 132);
334
5
}
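The bits[]/codes[] tables above encode the prefix pattern spelled out in the
inline comments next to codes[]: every group of three symbols adds one more
"11" pair in front of a two-bit suffix, and the final 14-bit group carries a
fourth entry. A minimal self-check of that pattern, as an illustrative sketch
only (it merely restates what those comments already say):

#include <assert.h>

static void check_frame_type_codes(const uint8_t *bits, const uint16_t *codes)
{
    int n;

    for (n = 0; n < 22; n++) {
        int g      = n / 3 > 6 ? 6 : n / 3; /* prefix group; capped for the 4-entry last group */
        int suffix = n - 3 * g;             /* trailing two bits */

        assert(bits[n]  == 2 + 2 * g);
        assert(codes[n] == ((((1 << (2 * g)) - 1) << 2) | suffix));
    }
}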
335
336
static av_cold void wmavoice_flush(AVCodecContext *ctx)
337
{
338
    WMAVoiceContext *s = ctx->priv_data;
339
    int n;
340
341
    s->postfilter_agc    = 0;
342
    s->sframe_cache_size = 0;
343
    s->skip_bits_next    = 0;
344
    for (n = 0; n < s->lsps; n++)
345
        s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
346
    memset(s->excitation_history, 0,
347
           sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
348
    memset(s->synth_history,      0,
349
           sizeof(*s->synth_history)      * MAX_LSPS);
350
    memset(s->gain_pred_err,      0,
351
           sizeof(s->gain_pred_err));
352
353
    if (s->do_apf) {
354
        memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
355
               sizeof(*s->synth_filter_out_buf) * s->lsps);
356
        memset(s->dcf_mem,              0,
357
               sizeof(*s->dcf_mem)              * 2);
358
        memset(s->zero_exc_pf,          0,
359
               sizeof(*s->zero_exc_pf)          * s->history_nsamples);
360
        memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
361
    }
362
}
363
364
/**
365
 * Set up decoder with parameters from demuxer (extradata etc.).
366
 */
367
8
static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
368
{
369
    static AVOnce init_static_once = AV_ONCE_INIT;
370
    int n, flags, pitch_range, lsp16_flag, ret;
371
8
    WMAVoiceContext *s = ctx->priv_data;
372
373
8
    ff_thread_once(&init_static_once, wmavoice_init_static_data);
374
375
    /**
376
     * Extradata layout:
377
     * - byte  0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
378
     * - byte 19-22: flags field (annoyingly in LE; see below for known
379
     *               values),
380
     * - byte 23-46: variable bitmode tree (really just 17 * 3 bits,
381
     *               rest is 0).
382
     */
383
8
    if (ctx->extradata_size != 46) {
384
        av_log(ctx, AV_LOG_ERROR,
385
               "Invalid extradata size %d (should be 46)\n",
386
               ctx->extradata_size);
387
        return AVERROR_INVALIDDATA;
388
    }
389

8
    if (ctx->block_align <= 0 || ctx->block_align > (1<<22)) {
390
        av_log(ctx, AV_LOG_ERROR, "Invalid block alignment %d.\n", ctx->block_align);
391
        return AVERROR_INVALIDDATA;
392
    }
393
394
8
    flags                = AV_RL32(ctx->extradata + 18);
395
8
    s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
396
8
    s->do_apf            =    flags & 0x1;
397
8
    if (s->do_apf) {
398

16
        if ((ret = ff_rdft_init(&s->rdft,  7,  DFT_R2C)) < 0 ||
399
16
            (ret = ff_rdft_init(&s->irdft, 7, IDFT_C2R)) < 0 ||
400
16
            (ret = ff_dct_init (&s->dct,   6,    DCT_I)) < 0 ||
401
8
            (ret = ff_dct_init (&s->dst,   6,    DST_I)) < 0)
402
            return ret;
403
404
8
        ff_sine_window_init(s->cos, 256);
405
8
        memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0]));
406
2048
        for (n = 0; n < 255; n++) {
407
2040
            s->sin[n]       = -s->sin[510 - n];
408
2040
            s->cos[510 - n] =  s->cos[n];
409
        }
410
    }
411
8
    s->denoise_strength  =   (flags >> 2) & 0xF;
412
8
    if (s->denoise_strength >= 12) {
413
        av_log(ctx, AV_LOG_ERROR,
414
               "Invalid denoise filter strength %d (max=11)\n",
415
               s->denoise_strength);
416
        return AVERROR_INVALIDDATA;
417
    }
418
8
    s->denoise_tilt_corr = !!(flags & 0x40);
419
8
    s->dc_level          =   (flags >> 7) & 0xF;
420
8
    s->lsp_q_mode        = !!(flags & 0x2000);
421
8
    s->lsp_def_mode      = !!(flags & 0x4000);
422
8
    lsp16_flag           =    flags & 0x1000;
423
8
    if (lsp16_flag) {
424
4
        s->lsps               = 16;
425
    } else {
426
4
        s->lsps               = 10;
427
    }
428
112
    for (n = 0; n < s->lsps; n++)
429
104
        s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
430
431
8
    init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
432
8
    if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
433
        av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
434
        return AVERROR_INVALIDDATA;
435
    }
436
437
8
    if (ctx->sample_rate >= INT_MAX / (256 * 37))
438
        return AVERROR_INVALIDDATA;
439
440
8
    s->min_pitch_val    = ((ctx->sample_rate << 8)      /  400 + 50) >> 8;
441
8
    s->max_pitch_val    = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
442
8
    pitch_range         = s->max_pitch_val - s->min_pitch_val;
443
8
    if (pitch_range <= 0) {
444
        av_log(ctx, AV_LOG_ERROR, "Invalid pitch range; broken extradata?\n");
445
        return AVERROR_INVALIDDATA;
446
    }
447
8
    s->pitch_nbits      = av_ceil_log2(pitch_range);
448
8
    s->last_pitch_val   = 40;
449
8
    s->last_acb_type    = ACB_TYPE_NONE;
450
8
    s->history_nsamples = s->max_pitch_val + 8;
451
452

8
    if (s->min_pitch_val < 1 || s->history_nsamples > MAX_SIGNAL_HISTORY) {
453
        int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
454
            max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;
455
456
        av_log(ctx, AV_LOG_ERROR,
457
               "Unsupported samplerate %d (min=%d, max=%d)\n",
458
               ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz
459
460
        return AVERROR(ENOSYS);
461
    }
462
463
8
    s->block_conv_table[0]      = s->min_pitch_val;
464
8
    s->block_conv_table[1]      = (pitch_range * 25) >> 6;
465
8
    s->block_conv_table[2]      = (pitch_range * 44) >> 6;
466
8
    s->block_conv_table[3]      = s->max_pitch_val - 1;
467
8
    s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
468
8
    if (s->block_delta_pitch_hrange <= 0) {
469
        av_log(ctx, AV_LOG_ERROR, "Invalid delta pitch hrange; broken extradata?\n");
470
        return AVERROR_INVALIDDATA;
471
    }
472
8
    s->block_delta_pitch_nbits  = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
473
8
    s->block_pitch_range        = s->block_conv_table[2] +
474
8
                                  s->block_conv_table[3] + 1 +
475
8
                                  2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
476
8
    s->block_pitch_nbits        = av_ceil_log2(s->block_pitch_range);
477
478
8
    ctx->channels               = 1;
479
8
    ctx->channel_layout         = AV_CH_LAYOUT_MONO;
480
8
    ctx->sample_fmt             = AV_SAMPLE_FMT_FLT;
481
482
8
    return 0;
483
}
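To make the derived header fields above concrete, here is the arithmetic for
one sample rate; 8000 Hz is only an assumption for the example, the formulas
are exactly the ones used in wmavoice_decode_init():

/*
 *   min_pitch_val    = ((8000 << 8) /  400 + 50) >> 8      =  20
 *   max_pitch_val    = ((8000 << 8) * 37 / 2000 + 50) >> 8 = 148
 *   pitch_range      = 148 - 20 = 128      -> pitch_nbits = 7
 *   history_nsamples = 148 + 8  = 156      (<= MAX_SIGNAL_HISTORY = 416)
 *   block_conv_table = { 20, (128 * 25) >> 6, (128 * 44) >> 6, 147 }
 *                    = { 20, 50, 88, 147 }
 *   block_delta_pitch_hrange = (128 >> 3) & ~0xF = 16 -> block_delta_pitch_nbits = 5
 *   block_pitch_range        = 88 + 147 + 1 + 2 * (50 - 2 * 20) = 256
 *                                                      -> block_pitch_nbits = 8
 */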
484
485
/**
486
 * @name Postfilter functions
487
 * Postfilter functions (gain control, Wiener denoise filter, DC filter,
488
 * Kalman smoothing, plus surrounding code to wrap it)
489
 * @{
490
 */
491
/**
492
 * Adaptive gain control (as used in postfilter).
493
 *
494
 * Identical to #ff_adaptive_gain_control() in acelp_vectors.c, except
495
 * that the energy here is calculated using sum(abs(...)), whereas the
496
 * other codecs (e.g. AMR-NB, SIPRO) use sqrt(dotproduct(...)).
497
 *
498
 * @param out output buffer for filtered samples
499
 * @param in input buffer containing the samples as they are after the
500
 *           postfilter steps so far
501
 * @param speech_synth input buffer containing speech synth before postfilter
502
 * @param size input buffer size
503
 * @param alpha exponential filter factor
504
 * @param gain_mem pointer to filter memory (single float)
505
 */
506
6612
static void adaptive_gain_control(float *out, const float *in,
507
                                  const float *speech_synth,
508
                                  int size, float alpha, float *gain_mem)
509
{
510
    int i;
511
6612
    float speech_energy = 0.0, postfilter_energy = 0.0, gain_scale_factor;
512
6612
    float mem = *gain_mem;
513
514
535572
    for (i = 0; i < size; i++) {
515
528960
        speech_energy     += fabsf(speech_synth[i]);
516
528960
        postfilter_energy += fabsf(in[i]);
517
    }
518
6612
    gain_scale_factor = postfilter_energy == 0.0 ? 0.0 :
519
6612
                        (1.0 - alpha) * speech_energy / postfilter_energy;
520
521
535572
    for (i = 0; i < size; i++) {
522
528960
        mem = alpha * mem + gain_scale_factor;
523
528960
        out[i] = in[i] * mem;
524
    }
525
526
6612
    *gain_mem = mem;
527
6612
}
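The distinction called out in the comment above (energy as sum(abs(...)) here
versus sqrt(dotproduct(...)) in other codecs) written out as two throwaway
helpers, purely for illustration:

static float energy_sum_abs(const float *x, int n)  /* this decoder's measure */
{
    float e = 0.0;
    int i;

    for (i = 0; i < n; i++)
        e += fabsf(x[i]);
    return e;
}

static float energy_sqrt_dot(const float *x, int n) /* AMR-NB / SIPRO style */
{
    float e = 0.0;
    int i;

    for (i = 0; i < n; i++)
        e += x[i] * x[i];
    return sqrtf(e);
}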
528
529
/**
530
 * Kalman smoothing function.
531
 *
532
 * This function looks pitch +/- 3 samples back into history to find
533
 * the best fitting curve (the one giving the optimal gain of the two
534
 * signals, i.e. the highest dot product between the two), and then
535
 * uses that signal history to smoothen the output of the speech synthesis
536
 * filter.
537
 *
538
 * @param s WMA Voice decoding context
539
 * @param pitch pitch of the speech signal
540
 * @param in input speech signal
541
 * @param out output pointer for smoothened signal
542
 * @param size input/output buffer size
543
 *
544
 * @returns -1 if no smoothing took place, e.g. because no optimal
545
 *          fit could be found, or 0 on success.
546
 */
547
5070
static int kalman_smoothen(WMAVoiceContext *s, int pitch,
548
                           const float *in, float *out, int size)
549
{
550
    int n;
551
5070
    float optimal_gain = 0, dot;
552
5070
    const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
553
5070
                *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
554
5070
                *best_hist_ptr = NULL;
555
556
    /* find best fitting point in history */
557
    do {
558
35388
        dot = avpriv_scalarproduct_float_c(in, ptr, size);
559
35388
        if (dot > optimal_gain) {
560
12328
            optimal_gain  = dot;
561
12328
            best_hist_ptr = ptr;
562
        }
563
35388
    } while (--ptr >= end);
564
565
5070
    if (optimal_gain <= 0)
566
26
        return -1;
567
5044
    dot = avpriv_scalarproduct_float_c(best_hist_ptr, best_hist_ptr, size);
568
5044
    if (dot <= 0) // would be 1.0
569
        return -1;
570
571
5044
    if (optimal_gain <= dot) {
572
4872
        dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000
573
    } else
574
172
        dot = 0.625;
575
576
    /* actual smoothing */
577
408564
    for (n = 0; n < size; n++)
578
403520
        out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]);
579
580
5044
    return 0;
581
}
582
583
/**
584
 * Get the tilt factor of a formant filter from its transfer function
585
 * @see #tilt_factor() in amrnbdec.c, which does essentially the same,
586
 *      but somehow (??) it does a speech synthesis filter in the
587
 *      middle, which is missing here
588
 *
589
 * @param lpcs LPC coefficients
590
 * @param n_lpcs Size of LPC buffer
591
 * @returns the tilt factor
592
 */
593
7098
static float tilt_factor(const float *lpcs, int n_lpcs)
594
{
595
    float rh0, rh1;
596
597
7098
    rh0 = 1.0     + avpriv_scalarproduct_float_c(lpcs,  lpcs,    n_lpcs);
598
7098
    rh1 = lpcs[0] + avpriv_scalarproduct_float_c(lpcs, &lpcs[1], n_lpcs - 1);
599
600
7098
    return rh1 / rh0;
601
}
602
603
/**
604
 * Derive denoise filter coefficients (in real domain) from the LPCs.
605
 */
606
5614
static void calc_input_response(WMAVoiceContext *s, float *lpcs,
607
                                int fcb_type, float *coeffs, int remainder)
608
{
609
5614
    float last_coeff, min = 15.0, max = -15.0;
610
    float irange, angle_mul, gain_mul, range, sq;
611
    int n, idx;
612
613
    /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */
614
5614
    s->rdft.rdft_calc(&s->rdft, lpcs);
615
#define log_range(var, assign) do { \
616
        float tmp = log10f(assign);  var = tmp; \
617
        max       = FFMAX(max, tmp); min = FFMIN(min, tmp); \
618
    } while (0)
619

5614
    log_range(last_coeff,  lpcs[1]         * lpcs[1]);
620
359296
    for (n = 1; n < 64; n++)
621

353682
        log_range(lpcs[n], lpcs[n * 2]     * lpcs[n * 2] +
622
                           lpcs[n * 2 + 1] * lpcs[n * 2 + 1]);
623

5614
    log_range(lpcs[0],     lpcs[0]         * lpcs[0]);
624
#undef log_range
625
5614
    range    = max - min;
626
5614
    lpcs[64] = last_coeff;
627
628
    /* Now, use this spectrum to pick out these frequencies with higher
629
     * (relative) power/energy (which we then take to be "not noise"),
630
     * and set up a table (still in lpc[]) of (relative) gains per frequency.
631
     * These frequencies will be maintained, while others ("noise") will be
632
     * decreased in the filter output. */
633
5614
    irange    = 64.0 / range; // so irange*(max-value) is in the range [0, 63]
634
5614
    gain_mul  = range * (fcb_type == FCB_TYPE_HARDCODED ? (5.0 / 13.0) :
635
                                                          (5.0 / 14.7));
636
5614
    angle_mul = gain_mul * (8.0 * M_LN10 / M_PI);
637
370524
    for (n = 0; n <= 64; n++) {
638
        float pwr;
639
640
364910
        idx = lrint((max - lpcs[n]) * irange - 1);
641
364910
        idx = FFMAX(0, idx);
642
364910
        pwr = wmavoice_denoise_power_table[s->denoise_strength][idx];
643
364910
        lpcs[n] = angle_mul * pwr;
644
645
        /* 70.57 =~ 1/log10(1.0331663) */
646
364910
        idx = av_clipf((pwr * gain_mul - 0.0295) * 70.570526123, 0, INT_MAX / 2);
647
648
364910
        if (idx > 127) { // fall back if index falls outside table range
649
8557
            coeffs[n] = wmavoice_energy_table[127] *
650
8557
                        powf(1.0331663, idx - 127);
651
        } else
652
356353
            coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];
653
    }
654
655
    /* calculate the Hilbert transform of the gains, which we do (since this
656
     * is a sine input) by doing a phase shift (in theory, H(sin())=cos()).
657
     * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the
658
     * "moment" of the LPCs in this filter. */
659
5614
    s->dct.dct_calc(&s->dct, lpcs);
660
5614
    s->dst.dct_calc(&s->dst, lpcs);
661
662
    /* Split out the coefficient indexes into phase/magnitude pairs */
663
5614
    idx = 255 + av_clip(lpcs[64],               -255, 255);
664
5614
    coeffs[0]  = coeffs[0]  * s->cos[idx];
665
5614
    idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255);
666
5614
    last_coeff = coeffs[64] * s->cos[idx];
667
5614
    for (n = 63;; n--) {
668
179648
        idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255);
669
179648
        coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
670
179648
        coeffs[n * 2]     = coeffs[n] * s->cos[idx];
671
672
179648
        if (!--n) break;
673
674
174034
        idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255);
675
174034
        coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
676
174034
        coeffs[n * 2]     = coeffs[n] * s->cos[idx];
677
    }
678
5614
    coeffs[1] = last_coeff;
679
680
    /* move into real domain */
681
5614
    s->irdft.rdft_calc(&s->irdft, coeffs);
682
683
    /* tilt correction and normalize scale */
684
5614
    memset(&coeffs[remainder], 0, sizeof(coeffs[0]) * (128 - remainder));
685
5614
    if (s->denoise_tilt_corr) {
686
1484
        float tilt_mem = 0;
687
688
1484
        coeffs[remainder - 1] = 0;
689
1484
        ff_tilt_compensation(&tilt_mem,
690
1484
                             -1.8 * tilt_factor(coeffs, remainder - 1),
691
                             coeffs, remainder);
692
    }
693
5614
    sq = (1.0 / 64.0) * sqrtf(1 / avpriv_scalarproduct_float_c(coeffs, coeffs,
694
                                                               remainder));
695
269472
    for (n = 0; n < remainder; n++)
696
263858
        coeffs[n] *= sq;
697
5614
}
698
699
/**
700
 * This function applies a Wiener filter on the (noisy) speech signal as
701
 * a means to denoise it.
702
 *
703
 * - take RDFT of LPCs to get the power spectrum of the noise + speech;
704
 * - using this power spectrum, calculate (for each frequency) the Wiener
705
 *    filter gain, which depends on the frequency power and desired level
706
 *    of noise subtraction (when set too high, this leads to artifacts)
707
 *    We can do this symmetrically over the X-axis (so 0-4kHz is the inverse
708
 *    of 4-8kHz);
709
 * - by doing a phase shift, calculate the Hilbert transform of this array
710
 *    of per-frequency filter-gains to get the filtering coefficients;
711
 * - smoothen/normalize/de-tilt these filter coefficients as desired;
712
 * - take RDFT of noisy sound, apply the coefficients and take its IRDFT
713
 *    to get the denoised speech signal;
714
 * - the leftover (i.e. output of the IRDFT on denoised speech data beyond
715
 *    the frame boundary) is saved and applied to subsequent frames by an
716
 *    overlap-add method (otherwise you get clicking-artifacts).
717
 *
718
 * @param s WMA Voice decoding context
719
 * @param fcb_type Frame (codebook) type
720
 * @param synth_pf input: the noisy speech signal, output: denoised speech
721
 *                 data; should be 16-byte aligned (for ASM purposes)
722
 * @param size size of the speech data
723
 * @param lpcs LPCs used to synthesize this frame's speech data
724
 */
725
6612
static void wiener_denoise(WMAVoiceContext *s, int fcb_type,
726
                           float *synth_pf, int size,
727
                           const float *lpcs)
728
{
729
    int remainder, lim, n;
730
731
6612
    if (fcb_type != FCB_TYPE_SILENCE) {
732
5614
        float *tilted_lpcs = s->tilted_lpcs_pf,
733
5614
              *coeffs = s->denoise_coeffs_pf, tilt_mem = 0;
734
735
5614
        tilted_lpcs[0]           = 1.0;
736
5614
        memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps);
737
5614
        memset(&tilted_lpcs[s->lsps + 1], 0,
738
5614
               sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1));
739
5614
        ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps),
740
5614
                             tilted_lpcs, s->lsps + 2);
741
742
        /* The IRDFT output (127 samples for 7-bit filter) beyond the frame
743
         * size is applied to the next frame. All input beyond this is zero,
744
         * and thus all output beyond this will go towards zero, hence we can
745
         * limit to min(size-1, 127-size) as a performance consideration. */
746
5614
        remainder = FFMIN(127 - size, size - 1);
747
5614
        calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder);
748
749
        /* apply coefficients (in frequency spectrum domain), i.e. complex
750
         * number multiplication */
751
5614
        memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size));
752
5614
        s->rdft.rdft_calc(&s->rdft, synth_pf);
753
5614
        s->rdft.rdft_calc(&s->rdft, coeffs);
754
5614
        synth_pf[0] *= coeffs[0];
755
5614
        synth_pf[1] *= coeffs[1];
756
359296
        for (n = 1; n < 64; n++) {
757
353682
            float v1 = synth_pf[n * 2], v2 = synth_pf[n * 2 + 1];
758
353682
            synth_pf[n * 2]     = v1 * coeffs[n * 2] - v2 * coeffs[n * 2 + 1];
759
353682
            synth_pf[n * 2 + 1] = v2 * coeffs[n * 2] + v1 * coeffs[n * 2 + 1];
760
        }
761
5614
        s->irdft.rdft_calc(&s->irdft, synth_pf);
762
    }
763
764
    /* merge filter output with the history of previous runs */
765
6612
    if (s->denoise_filter_cache_size) {
766
5612
        lim = FFMIN(s->denoise_filter_cache_size, size);
767
269376
        for (n = 0; n < lim; n++)
768
263764
            synth_pf[n] += s->denoise_filter_cache[n];
769
5612
        s->denoise_filter_cache_size -= lim;
770
5612
        memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size],
771
5612
                sizeof(s->denoise_filter_cache[0]) * s->denoise_filter_cache_size);
772
    }
773
774
    /* move remainder of filter output into a cache for future runs */
775
6612
    if (fcb_type != FCB_TYPE_SILENCE) {
776
5614
        lim = FFMIN(remainder, s->denoise_filter_cache_size);
777
5614
        for (n = 0; n < lim; n++)
778
            s->denoise_filter_cache[n] += synth_pf[size + n];
779
5614
        if (lim < remainder) {
780
5614
            memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim],
781
5614
                   sizeof(s->denoise_filter_cache[0]) * (remainder - lim));
782
5614
            s->denoise_filter_cache_size = remainder;
783
        }
784
    }
785
6612
}
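The per-bin multiplication in the middle of wiener_denoise() is an ordinary
complex multiply over the packed half-spectrum layout used here: slots [0]
and [1] hold the two purely real bins, slots [2n] and [2n + 1] hold the real
and imaginary parts of bin n. Factored out as a stand-alone sketch (calling
it with nbins = 64 reproduces the inline loop; the decoder keeps it inline):

static void mul_packed_spectra(float *sig, const float *flt, int nbins)
{
    int n;

    sig[0] *= flt[0];
    sig[1] *= flt[1];
    for (n = 1; n < nbins; n++) {
        float re = sig[2 * n], im = sig[2 * n + 1];

        sig[2 * n]     = re * flt[2 * n] - im * flt[2 * n + 1];
        sig[2 * n + 1] = im * flt[2 * n] + re * flt[2 * n + 1];
    }
}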
786
787
/**
788
 * Averaging projection filter, the postfilter used in WMAVoice.
789
 *
790
 * This uses the following steps:
791
 * - A zero-synthesis filter (generate excitation from synth signal)
792
 * - Kalman smoothing on excitation, based on pitch
793
 * - Re-synthesis of the smoothed excitation
794
 * - Iterative Wiener denoise filter
795
 * - Adaptive gain filter
796
 * - DC filter
797
 *
798
 * @param s WMAVoice decoding context
799
 * @param synth Speech synthesis output (before postfilter)
800
 * @param samples Output buffer for filtered samples
801
 * @param size Buffer size of synth & samples
802
 * @param lpcs Generated LPCs used for speech synthesis
803
 * @param zero_exc_pf destination for zero synthesis filter (16-byte aligned)
804
 * @param fcb_type Frame type (silence, hardcoded, AW-pulses or FCB-pulses)
805
 * @param pitch Pitch of the input signal
806
 */
807
6612
static void postfilter(WMAVoiceContext *s, const float *synth,
808
                       float *samples,    int size,
809
                       const float *lpcs, float *zero_exc_pf,
810
                       int fcb_type,      int pitch)
811
{
812
    float synth_filter_in_buf[MAX_FRAMESIZE / 2],
813
6612
          *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
814
6612
          *synth_filter_in = zero_exc_pf;
815
816
6612
    av_assert0(size <= MAX_FRAMESIZE / 2);
817
818
    /* generate excitation from input signal */
819
6612
    ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);
820
821

11682
    if (fcb_type >= FCB_TYPE_AW_PULSES &&
822
5070
        !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size))
823
5044
        synth_filter_in = synth_filter_in_buf;
824
825
    /* re-synthesize speech after smoothing, and keep history */
826
6612
    ff_celp_lp_synthesis_filterf(synth_pf, lpcs,
827
                                 synth_filter_in, size, s->lsps);
828
6612
    memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps],
829
6612
           sizeof(synth_pf[0]) * s->lsps);
830
831
6612
    wiener_denoise(s, fcb_type, synth_pf, size, lpcs);
832
833
6612
    adaptive_gain_control(samples, synth_pf, synth, size, 0.99,
834
                          &s->postfilter_agc);
835
836
6612
    if (s->dc_level > 8) {
837
        /* remove ultra-low frequency DC noise / highpass filter;
838
         * coefficients are identical to those used in SIPR decoding,
839
         * and very closely resemble those used in AMR-NB decoding. */
840
        ff_acelp_apply_order_2_transfer_function(samples, samples,
841
            (const float[2]) { -1.99997,      1.0 },
842
            (const float[2]) { -1.9330735188, 0.93589198496 },
843
            0.93980580475, s->dcf_mem, size);
844
    }
845
6612
}
846
/**
847
 * @}
848
 */
849
850
/**
851
 * Dequantize LSPs
852
 * @param lsps output pointer to the array that will hold the LSPs
853
 * @param num number of LSPs to be dequantized
854
 * @param values quantized values, contains n_stages values
855
 * @param sizes range (i.e. max value) of each quantized value
856
 * @param n_stages number of dequantization runs
857
 * @param table dequantization table to be used
858
 * @param mul_q LSF multiplier
859
 * @param base_q base (lowest) LSF values
860
 */
861
4404
static void dequant_lsps(double *lsps, int num,
862
                         const uint16_t *values,
863
                         const uint16_t *sizes,
864
                         int n_stages, const uint8_t *table,
865
                         const double *mul_q,
866
                         const double *base_q)
867
{
868
    int n, m;
869
870
4404
    memset(lsps, 0, num * sizeof(*lsps));
871
12668
    for (n = 0; n < n_stages; n++) {
872
8264
        const uint8_t *t_off = &table[values[n] * num];
873
8264
        double base = base_q[n], mul = mul_q[n];
874
875
95364
        for (m = 0; m < num; m++)
876
87100
            lsps[m] += base + mul * t_off[m];
877
878
8264
        table += sizes[n] * num;
879
    }
880
4404
}
881
882
/**
883
 * @name LSP dequantization routines
884
 * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
885
 * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
886
 * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
887
 * @{
888
 */
889
/**
890
 * Parse 10 independently-coded LSPs.
891
 */
892
552
static void dequant_lsp10i(GetBitContext *gb, double *lsps)
893
{
894
    static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
895
    static const double mul_lsf[4] = {
896
        5.2187144800e-3,    1.4626986422e-3,
897
        9.6179549166e-4,    1.1325736225e-3
898
    };
899
    static const double base_lsf[4] = {
900
        M_PI * -2.15522e-1, M_PI * -6.1646e-2,
901
        M_PI * -3.3486e-2,  M_PI * -5.7408e-2
902
    };
903
    uint16_t v[4];
904
905
552
    v[0] = get_bits(gb, 8);
906
552
    v[1] = get_bits(gb, 6);
907
552
    v[2] = get_bits(gb, 5);
908
552
    v[3] = get_bits(gb, 5);
909
910
552
    dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
911
                 mul_lsf, base_lsf);
912
552
}
913
914
/**
915
 * Parse 10 independently-coded LSPs, and then derive the tables to
916
 * generate LSPs for the other frames from them (residual coding).
917
 */
918
552
static void dequant_lsp10r(GetBitContext *gb,
919
                           double *i_lsps, const double *old,
920
                           double *a1, double *a2, int q_mode)
921
{
922
    static const uint16_t vec_sizes[3] = { 128, 64, 64 };
923
    static const double mul_lsf[3] = {
924
        2.5807601174e-3,    1.2354460219e-3,   1.1763821673e-3
925
    };
926
    static const double base_lsf[3] = {
927
        M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
928
    };
929
552
    const float (*ipol_tab)[2][10] = q_mode ?
930
552
        wmavoice_lsp10_intercoeff_b : wmavoice_lsp10_intercoeff_a;
931
    uint16_t interpol, v[3];
932
    int n;
933
934
552
    dequant_lsp10i(gb, i_lsps);
935
936
552
    interpol = get_bits(gb, 5);
937
552
    v[0]     = get_bits(gb, 7);
938
552
    v[1]     = get_bits(gb, 6);
939
552
    v[2]     = get_bits(gb, 6);
940
941
6072
    for (n = 0; n < 10; n++) {
942
5520
        double delta = old[n] - i_lsps[n];
943
5520
        a1[n]        = ipol_tab[interpol][0][n] * delta + i_lsps[n];
944
5520
        a1[10 + n]   = ipol_tab[interpol][1][n] * delta + i_lsps[n];
945
    }
946
947
552
    dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
948
                 mul_lsf, base_lsf);
949
552
}
950
951
/**
952
 * Parse 16 independently-coded LSPs.
953
 */
954
550
static void dequant_lsp16i(GetBitContext *gb, double *lsps)
955
{
956
    static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
957
    static const double mul_lsf[5] = {
958
        3.3439586280e-3,    6.9908173703e-4,
959
        3.3216608306e-3,    1.0334960326e-3,
960
        3.1899104283e-3
961
    };
962
    static const double base_lsf[5] = {
963
        M_PI * -1.27576e-1, M_PI * -2.4292e-2,
964
        M_PI * -1.28094e-1, M_PI * -3.2128e-2,
965
        M_PI * -1.29816e-1
966
    };
967
    uint16_t v[5];
968
969
550
    v[0] = get_bits(gb, 8);
970
550
    v[1] = get_bits(gb, 6);
971
550
    v[2] = get_bits(gb, 7);
972
550
    v[3] = get_bits(gb, 6);
973
550
    v[4] = get_bits(gb, 7);
974
975
550
    dequant_lsps( lsps,     5,  v,     vec_sizes,    2,
976
                 wmavoice_dq_lsp16i1,  mul_lsf,     base_lsf);
977
550
    dequant_lsps(&lsps[5],  5, &v[2], &vec_sizes[2], 2,
978
                 wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
979
550
    dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
980
                 wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
981
550
}
982
983
/**
984
 * Parse 16 independently-coded LSPs, and then derive the tables to
985
 * generate LSPs for the other frames from them (residual coding).
986
 */
987
550
static void dequant_lsp16r(GetBitContext *gb,
988
                           double *i_lsps, const double *old,
989
                           double *a1, double *a2, int q_mode)
990
{
991
    static const uint16_t vec_sizes[3] = { 128, 128, 128 };
992
    static const double mul_lsf[3] = {
993
        1.2232979501e-3,   1.4062241527e-3,   1.6114744851e-3
994
    };
995
    static const double base_lsf[3] = {
996
        M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
997
    };
998
550
    const float (*ipol_tab)[2][16] = q_mode ?
999
550
        wmavoice_lsp16_intercoeff_b : wmavoice_lsp16_intercoeff_a;
1000
    uint16_t interpol, v[3];
1001
    int n;
1002
1003
550
    dequant_lsp16i(gb, i_lsps);
1004
1005
550
    interpol = get_bits(gb, 5);
1006
550
    v[0]     = get_bits(gb, 7);
1007
550
    v[1]     = get_bits(gb, 7);
1008
550
    v[2]     = get_bits(gb, 7);
1009
1010
9350
    for (n = 0; n < 16; n++) {
1011
8800
        double delta = old[n] - i_lsps[n];
1012
8800
        a1[n]        = ipol_tab[interpol][0][n] * delta + i_lsps[n];
1013
8800
        a1[16 + n]   = ipol_tab[interpol][1][n] * delta + i_lsps[n];
1014
    }
1015
1016
550
    dequant_lsps( a2,     10,  v,     vec_sizes,    1,
1017
                 wmavoice_dq_lsp16r1,  mul_lsf,     base_lsf);
1018
550
    dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
1019
                 wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
1020
550
    dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
1021
                 wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
1022
550
}
1023
1024
/**
1025
 * @}
1026
 * @name Pitch-adaptive window coding functions
1027
 * The next few functions are for pitch-adaptive window coding.
1028
 * @{
1029
 */
1030
/**
1031
 * Parse the offset of the first pitch-adaptive window pulses, and
1032
 * the distribution of pulses between the two blocks in this frame.
1033
 * @param s WMA Voice decoding context private data
1034
 * @param gb bit I/O context
1035
 * @param pitch pitch for each block in this frame
1036
 */
1037
341
static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb,
1038
                            const int *pitch)
1039
{
1040
    static const int16_t start_offset[94] = {
1041
        -11,  -9,  -7,  -5,  -3,  -1,   1,   3,   5,   7,   9,  11,
1042
         13,  15,  18,  17,  19,  20,  21,  22,  23,  24,  25,  26,
1043
         27,  28,  29,  30,  31,  32,  33,  35,  37,  39,  41,  43,
1044
         45,  47,  49,  51,  53,  55,  57,  59,  61,  63,  65,  67,
1045
         69,  71,  73,  75,  77,  79,  81,  83,  85,  87,  89,  91,
1046
         93,  95,  97,  99, 101, 103, 105, 107, 109, 111, 113, 115,
1047
        117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
1048
        141, 143, 145, 147, 149, 151, 153, 155, 157, 159
1049
    };
1050
    int bits, offset;
1051
1052
    /* position of pulse */
1053
341
    s->aw_idx_is_ext = 0;
1054
341
    if ((bits = get_bits(gb, 6)) >= 54) {
1055
10
        s->aw_idx_is_ext = 1;
1056
10
        bits += (bits - 54) * 3 + get_bits(gb, 2);
1057
    }
1058
1059
    /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
1060
     * the distribution of the pulses in each block contained in this frame. */
1061
341
    s->aw_pulse_range        = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
1062
391
    for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
1063
341
    s->aw_n_pulses[0]        = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
1064
341
    s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
1065
341
    offset                  += s->aw_n_pulses[0] * pitch[0];
1066
341
    s->aw_n_pulses[1]        = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
1067
341
    s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;
1068
1069
    /* if continuing from a position before the block, reset position to
1070
     * start of block (when corrected for the range over which it can be
1071
     * spread in aw_pulse_set1()). */
1072
341
    if (start_offset[bits] < MAX_FRAMESIZE / 2) {
1073
387
        while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
1074
56
            s->aw_first_pulse_off[1] -= pitch[1];
1075
331
        if (start_offset[bits] < 0)
1076
100
            while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
1077
50
                s->aw_first_pulse_off[0] -= pitch[0];
1078
    }
1079
341
}
1080
1081
/**
1082
 * Apply second set of pitch-adaptive window pulses.
1083
 * @param s WMA Voice decoding context private data
1084
 * @param gb bit I/O context
1085
 * @param block_idx block index in frame [0, 1]
1086
 * @param fcb structure containing fixed codebook vector info
1087
 * @return -1 on error, 0 otherwise
1088
 */
1089
682
static int aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb,
1090
                         int block_idx, AMRFixed *fcb)
1091
{
1092
    uint16_t use_mask_mem[9]; // only 5 are used, rest is padding
1093
682
    uint16_t *use_mask = use_mask_mem + 2;
1094
    /* in this function, idx is the index in the 80-bit (+ padding) use_mask
1095
     * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
1096
     * of idx are the position of the bit within a particular item in the
1097
     * array (0 being the most significant bit, and 15 being the least
1098
     * significant bit), and the remainder (>> 4) is the index in the
1099
     * use_mask[]-array. This is faster and uses less memory than using a
1100
     * 80-byte/80-int array. */
1101
682
    int pulse_off = s->aw_first_pulse_off[block_idx],
1102
682
        pulse_start, n, idx, range, aidx, start_off = 0;
1103
1104
    /* set offset of first pulse to within this block */
1105
682
    if (s->aw_n_pulses[block_idx] > 0)
1106
657
        while (pulse_off + s->aw_pulse_range < 1)
1107
            pulse_off += fcb->pitch_lag;
1108
1109
    /* find range per pulse */
1110
682
    if (s->aw_n_pulses[0] > 0) {
1111
646
        if (block_idx == 0) {
1112
323
            range = 32;
1113
        } else /* block_idx = 1 */ {
1114
323
            range = 8;
1115
323
            if (s->aw_n_pulses[block_idx] > 0)
1116
316
                pulse_off = s->aw_next_pulse_off_cache;
1117
        }
1118
    } else
1119
36
        range = 16;
1120
682
    pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;
1121
1122
    /* aw_pulse_set1() already applies pulses around pulse_off (to be exact,
1123
     * in the range [pulse_off, pulse_off + s->aw_pulse_range]), and thus
1124
     * we exclude that range from being pulsed again in this function. */
1125
682
    memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0]));
1126
682
    memset( use_mask,   -1, 5 * sizeof(use_mask[0]));
1127
682
    memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
1128
682
    if (s->aw_n_pulses[block_idx] > 0)
1129
1568
        for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
1130
911
            int excl_range         = s->aw_pulse_range; // always 16 or 24
1131
911
            uint16_t *use_mask_ptr = &use_mask[idx >> 4];
1132
911
            int first_sh           = 16 - (idx & 15);
1133
911
            *use_mask_ptr++       &= 0xFFFFu << first_sh;
1134
911
            excl_range            -= first_sh;
1135
911
            if (excl_range >= 16) {
1136
468
                *use_mask_ptr++    = 0;
1137
468
                *use_mask_ptr     &= 0xFFFF >> (excl_range - 16);
1138
            } else
1139
443
                *use_mask_ptr     &= 0xFFFF >> excl_range;
1140
        }
1141
1142
    /* find the 'aidx'th offset that is not excluded */
1143
682
    aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
1144
16825
    for (n = 0; n <= aidx; pulse_start++) {
1145
18458
        for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
1146
16143
        if (idx >= MAX_FRAMESIZE / 2) { // find from zero
1147
538
            if (use_mask[0])      idx = 0x0F;
1148
123
            else if (use_mask[1]) idx = 0x1F;
1149
18
            else if (use_mask[2]) idx = 0x2F;
1150
            else if (use_mask[3]) idx = 0x3F;
1151
            else if (use_mask[4]) idx = 0x4F;
1152
            else return -1;
1153
538
            idx -= av_log2_16bit(use_mask[idx >> 4]);
1154
        }
1155
16143
        if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
1156
7465
            use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
1157
7465
            n++;
1158
7465
            start_off = idx;
1159
        }
1160
    }
1161
1162
682
    fcb->x[fcb->n] = start_off;
1163
682
    fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
1164
682
    fcb->n++;
1165
1166
    /* set offset for next block, relative to start of that block */
1167
682
    n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
1168
682
    s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
1169
682
    return 0;
1170
}
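The use_mask addressing convention described at the top of aw_pulse_set2()
(bit idx sits in 16-bit word idx >> 4, counted from the most significant bit
of that word) spelled out as two helpers; illustrative only, the function
keeps the expressions inline:

static int use_mask_test(const uint16_t *use_mask, int idx)
{
    return use_mask[idx >> 4] & (0x8000 >> (idx & 15));
}

static void use_mask_clear(uint16_t *use_mask, int idx)
{
    use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
}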
1171
1172
/**
1173
 * Apply first set of pitch-adaptive window pulses.
1174
 * @param s WMA Voice decoding context private data
1175
 * @param gb bit I/O context
1176
 * @param block_idx block index in frame [0, 1]
1177
 * @param fcb storage location for fixed codebook pulse info
1178
 */
1179
682
static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb,
1180
                          int block_idx, AMRFixed *fcb)
1181
{
1182

682
    int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
1183
    float v;
1184
1185
682
    if (s->aw_n_pulses[block_idx] > 0) {
1186
        int n, v_mask, i_mask, sh, n_pulses;
1187
1188
657
        if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
1189
652
            n_pulses = 3;
1190
652
            v_mask   = 8;
1191
652
            i_mask   = 7;
1192
652
            sh       = 4;
1193
        } else { // 4 pulses, 1:sign + 2:index each
1194
5
            n_pulses = 4;
1195
5
            v_mask   = 4;
1196
5
            i_mask   = 3;
1197
5
            sh       = 3;
1198
        }
1199
1200
2633
        for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
1201
1976
            fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
1202
1976
            fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
1203
1976
                                 s->aw_first_pulse_off[block_idx];
1204
2217
            while (fcb->x[fcb->n] < 0)
1205
241
                fcb->x[fcb->n] += fcb->pitch_lag;
1206
1976
            if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
1207
1959
                fcb->n++;
1208
        }
1209
    } else {
1210
25
        int num2 = (val & 0x1FF) >> 1, delta, idx;
1211
1212
25
        if (num2 < 1 * 79)      { delta = 1; idx = num2 + 1; }
1213
21
        else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
1214
15
        else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
1215
5
        else                    { delta = 7; idx = num2 + 1 - 3 * 75; }
1216
25
        v = (val & 0x200) ? -1.0 : 1.0;
1217
1218
25
        fcb->no_repeat_mask |= 3 << fcb->n;
1219
25
        fcb->x[fcb->n]       = idx - delta;
1220
25
        fcb->y[fcb->n]       = v;
1221
25
        fcb->x[fcb->n + 1]   = idx;
1222
25
        fcb->y[fcb->n + 1]   = (val & 1) ? -v : v;
1223
25
        fcb->n              += 2;
1224
    }
1225
682
}
1226
1227
/**
1228
 * @}
1229
 *
1230
 * Generate a random number from frame_cntr and block_idx, which will live
1231
 * in the range [0, 1000 - block_size] (so it can be used as an index in a
1232
 * table of size 1000 of which you want to read block_size entries).
1233
 *
1234
 * @param frame_cntr current frame number
1235
 * @param block_num current block index
1236
 * @param block_size number of entries we want to read from a table
1237
 *                   that has 1000 entries
1238
 * @return a (non-)random number in the [0, 1000 - block_size] range.
1239
 */
1240
499
static int pRNG(int frame_cntr, int block_num, int block_size)
1241
{
1242
    /* array to simplify the calculation of z:
1243
     * y = (x % 9) * 5 + 6;
1244
     * z = (49995 * x) / y;
1245
     * Since y only has 9 values, we can remove the division by using a
1246
     * LUT and FASTDIV-style divisions. For each of the 9 values
1247
     * of y, we can rewrite z as:
1248
     * z = x * (49995 / y) + x * ((49995 % y) / y)
1249
     * In this table, each col represents one possible value of y, the
1250
     * first number is 49995 / y, and the second is the FASTDIV variant
1251
     * of 49995 % y / y. */
1252
    static const unsigned int div_tbl[9][2] = {
1253
        { 8332,  3 * 715827883U }, // y =  6
1254
        { 4545,  0 * 390451573U }, // y = 11
1255
        { 3124, 11 * 268435456U }, // y = 16
1256
        { 2380, 15 * 204522253U }, // y = 21
1257
        { 1922, 23 * 165191050U }, // y = 26
1258
        { 1612, 23 * 138547333U }, // y = 31
1259
        { 1388, 27 * 119304648U }, // y = 36
1260
        { 1219, 16 * 104755300U }, // y = 41
1261
        { 1086, 39 *  93368855U }  // y = 46
1262
    };
1263
499
    unsigned int z, y, x = MUL16(block_num, 1877) + frame_cntr;
1264
499
    if (x >= 0xFFFF) x -= 0xFFFF;   // max value of x is 8*1877+0xFFFE=0x13AA6,
1265
                                    // so this is effectively a modulo (%)
1266
499
    y = x - 9 * MULH(477218589, x); // x % 9
1267
499
    z = (uint16_t) (x * div_tbl[y][0] + UMULH(x, div_tbl[y][1]));
1268
                                    // z = x * 49995 / (y * 5 + 6)
1269
499
    return z % (1000 - block_size);
1270
}
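
For reference, the table-based computation above is meant to be equivalent to the commented closed form; a minimal sketch using a plain 64-bit division (prng_reference is a hypothetical name, not an FFmpeg function):

#include <stdint.h>

/* Hypothetical reference for pRNG(): evaluates z = x * 49995 / ((x % 9) * 5 + 6)
 * directly, which the LUT plus FASTDIV-style multiplies above avoid doing. */
int prng_reference(int frame_cntr, int block_num, int block_size)
{
    unsigned int x = block_num * 1877 + frame_cntr;
    if (x >= 0xFFFF)
        x -= 0xFFFF;                      /* same cheap modulo as in pRNG() */
    unsigned int y = (x % 9) * 5 + 6;
    unsigned int z = (uint16_t)((uint64_t)x * 49995 / y);
    return z % (1000 - block_size);
}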
1271
1272
/**
1273
 * Parse hardcoded signal for a single block.
1274
 * @note see #synth_block().
1275
 */
1276
1043
static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb,
1277
                                 int block_idx, int size,
1278
                                 const struct frame_type_desc *frame_desc,
1279
                                 float *excitation)
1280
{
1281
    float gain;
1282
    int n, r_idx;
1283
1284
1043
    av_assert0(size <= MAX_FRAMESIZE);
1285
1286
    /* Set the offset from which we start reading wmavoice_std_codebook */
1287
1043
    if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1288
499
        r_idx = pRNG(s->frame_cntr, block_idx, size);
1289
499
        gain  = s->silence_gain;
1290
    } else /* FCB_TYPE_HARDCODED */ {
1291
544
        r_idx = get_bits(gb, 8);
1292
544
        gain  = wmavoice_gain_universal[get_bits(gb, 6)];
1293
    }
1294
1295
    /* Clear gain prediction parameters */
1296
1043
    memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
1297
1298
    /* Apply gain to hardcoded codebook and use that as excitation signal */
1299
124403
    for (n = 0; n < size; n++)
1300
123360
        excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
1301
1043
}
1302
1303
/**
1304
 * Parse FCB/ACB signal for a single block.
1305
 * @note see #synth_block().
1306
 */
1307
9740
static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb,
1308
                                int block_idx, int size,
1309
                                int block_pitch_sh2,
1310
                                const struct frame_type_desc *frame_desc,
1311
                                float *excitation)
1312
{
1313
    static const float gain_coeff[6] = {
1314
        0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
1315
    };
1316
    float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
1317
    int n, idx, gain_weight;
1318
    AMRFixed fcb;
1319
1320
9740
    av_assert0(size <= MAX_FRAMESIZE / 2);
1321
9740
    memset(pulses, 0, sizeof(*pulses) * size);
1322
1323
9740
    fcb.pitch_lag      = block_pitch_sh2 >> 2;
1324
9740
    fcb.pitch_fac      = 1.0;
1325
9740
    fcb.no_repeat_mask = 0;
1326
9740
    fcb.n              = 0;
1327
1328
    /* For the other frame types, this is where we apply the innovation
1329
     * (fixed) codebook pulses of the speech signal. */
1330
9740
    if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1331
682
        aw_pulse_set1(s, gb, block_idx, &fcb);
1332
682
        if (aw_pulse_set2(s, gb, block_idx, &fcb)) {
1333
            /* Conceal the block with silence and return.
1334
             * Skip the correct amount of bits to read the next
1335
             * block from the correct offset. */
1336
            int r_idx = pRNG(s->frame_cntr, block_idx, size);
1337
1338
            for (n = 0; n < size; n++)
1339
                excitation[n] =
1340
                    wmavoice_std_codebook[r_idx + n] * s->silence_gain;
1341
            skip_bits(gb, 7 + 1);
1342
            return;
1343
        }
1344
    } else /* FCB_TYPE_EXC_PULSES */ {
1345
9058
        int offset_nbits = 5 - frame_desc->log_n_blocks;
1346
1347
9058
        fcb.no_repeat_mask = -1;
1348
        /* similar to ff_decode_10_pulses_35bits(), but with single pulses
1349
         * (instead of double) for a subset of pulses */
1350
54348
        for (n = 0; n < 5; n++) {
1351
            float sign;
1352
            int pos1, pos2;
1353
1354
45290
            sign           = get_bits1(gb) ? 1.0 : -1.0;
1355
45290
            pos1           = get_bits(gb, offset_nbits);
1356
45290
            fcb.x[fcb.n]   = n + 5 * pos1;
1357
45290
            fcb.y[fcb.n++] = sign;
1358
45290
            if (n < frame_desc->dbl_pulses) {
1359
36270
                pos2           = get_bits(gb, offset_nbits);
1360
36270
                fcb.x[fcb.n]   = n + 5 * pos2;
1361
36270
                fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
1362
            }
1363
        }
1364
    }
1365
9740
    ff_set_fixed_vector(pulses, &fcb, 1.0, size);
1366
1367
    /* Calculate gain for adaptive & fixed codebook signal.
1368
     * see ff_amr_set_fixed_gain(). */
1369
9740
    idx = get_bits(gb, 7);
1370
9740
    fcb_gain = expf(avpriv_scalarproduct_float_c(s->gain_pred_err,
1371
9740
                                                 gain_coeff, 6) -
1372
9740
                    5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
1373
9740
    acb_gain = wmavoice_gain_codebook_acb[idx];
1374
9740
    pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
1375
                        -2.9957322736 /* log(0.05) */,
1376
                         1.6094379124 /* log(5.0)  */);
1377
1378
9740
    gain_weight = 8 >> frame_desc->log_n_blocks;
1379
9740
    memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
1380
9740
            sizeof(*s->gain_pred_err) * (6 - gain_weight));
1381
30020
    for (n = 0; n < gain_weight; n++)
1382
20280
        s->gain_pred_err[n] = pred_err;
1383
1384
    /* Calculation of adaptive codebook */
1385
9740
    if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1386
        int len;
1387
19152
        for (n = 0; n < size; n += len) {
1388
            int next_idx_sh16;
1389
17876
            int abs_idx    = block_idx * size + n;
1390
17876
            int pitch_sh16 = (s->last_pitch_val << 16) +
1391
17876
                             s->pitch_diff_sh16 * abs_idx;
1392
17876
            int pitch      = (pitch_sh16 + 0x6FFF) >> 16;
1393
17876
            int idx_sh16   = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
1394
17876
            idx            = idx_sh16 >> 16;
1395
17876
            if (s->pitch_diff_sh16) {
1396
17442
                if (s->pitch_diff_sh16 > 0) {
1397
10526
                    next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
1398
                } else
1399
6916
                    next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
1400
17442
                len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
1401
                              1, size - n);
1402
            } else
1403
434
                len = size;
1404
1405
17876
            ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
1406
                                  wmavoice_ipol1_coeffs, 17,
1407
                                  idx, 9, len);
1408
        }
1409
    } else /* ACB_TYPE_HAMMING */ {
1410
8464
        int block_pitch = block_pitch_sh2 >> 2;
1411
8464
        idx             = block_pitch_sh2 & 3;
1412
8464
        if (idx) {
1413
3652
            ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
1414
                                  wmavoice_ipol2_coeffs, 4,
1415
                                  idx, 8, size);
1416
        } else
1417
4812
            av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch,
1418
                              sizeof(float) * size);
1419
    }
1420
1421
    /* Interpolate ACB/FCB and use as excitation signal */
1422
9740
    ff_weighted_vector_sumf(excitation, excitation, pulses,
1423
                            acb_gain, fcb_gain, size);
1424
}
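
The gain handling in the middle of synth_block_fcb_acb() can be read in isolation as a 6-tap moving-average predictor in the log domain. Below is a minimal sketch with hypothetical names, the codebook entry passed in directly, and the assumption that gain_weight (8 >> log_n_blocks) does not exceed 6:

#include <math.h>
#include <string.h>

/* Sketch of the fixed-codebook gain prediction and error-memory update,
 * assuming gain_weight <= 6 so the memmove stays within the array. */
float predict_fcb_gain(float gain_pred_err[6], float cb_fcb_entry, int gain_weight)
{
    static const float gain_coeff[6] = {
        0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
    };
    float dot = 0.0f;
    for (int n = 0; n < 6; n++)            /* scalar product, as in the code above */
        dot += gain_pred_err[n] * gain_coeff[n];
    float fcb_gain = expf(dot - 5.2409161640f + cb_fcb_entry);

    /* clip the new error to [log(0.05), log(5.0)] and repeat it gain_weight
     * times at the front of the memory, pushing older entries back */
    float pred_err = fminf(fmaxf(cb_fcb_entry, -2.9957322736f), 1.6094379124f);
    memmove(&gain_pred_err[gain_weight], gain_pred_err,
            sizeof(*gain_pred_err) * (6 - gain_weight));
    for (int n = 0; n < gain_weight; n++)
        gain_pred_err[n] = pred_err;
    return fcb_gain;
}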
1425
1426
/**
1427
 * Parse data in a single block.
1428
 *
1429
 * @param s WMA Voice decoding context private data
1430
 * @param gb bit I/O context
1431
 * @param block_idx index of the to-be-read block
1432
 * @param size number of samples to be read in this block
1433
 * @param block_pitch_sh2 pitch for this block << 2
1434
 * @param lsps LSPs for (the end of) this frame
1435
 * @param prev_lsps LSPs for the last frame
1436
 * @param frame_desc frame type descriptor
1437
 * @param excitation target memory for the ACB+FCB interpolated signal
1438
 * @param synth target memory for the speech synthesis filter output
1439
1440
 */
1441
10783
static void synth_block(WMAVoiceContext *s, GetBitContext *gb,
1442
                        int block_idx, int size,
1443
                        int block_pitch_sh2,
1444
                        const double *lsps, const double *prev_lsps,
1445
                        const struct frame_type_desc *frame_desc,
1446
                        float *excitation, float *synth)
1447
{
1448
    double i_lsps[MAX_LSPS];
1449
    float lpcs[MAX_LSPS];
1450
    float fac;
1451
    int n;
1452
1453
10783
    if (frame_desc->acb_type == ACB_TYPE_NONE)
1454
1043
        synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
1455
    else
1456
9740
        synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
1457
                            frame_desc, excitation);
1458
1459
    /* convert interpolated LSPs to LPCs */
1460
10783
    fac = (block_idx + 0.5) / frame_desc->n_blocks;
1461
151559
    for (n = 0; n < s->lsps; n++) // LSF -> LSP
1462
140776
        i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
1463
10783
    ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1464
1465
    /* Speech synthesis */
1466
10783
    ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
1467
10783
}
1468
1469
/**
1470
 * Synthesize output samples for a single frame.
1471
 *
1472
 * @param ctx WMA Voice decoder context
1473
 * @param gb bit I/O context (s->gb or one for cross-packet superframes)
1474
 * @param frame_idx Frame number within superframe [0-2]
1475
 * @param samples pointer to output sample buffer, has space for at least 160
1476
 *                samples
1477
 * @param lsps LSP array
1478
 * @param prev_lsps array of previous frame's LSPs
1479
 * @param excitation target buffer for excitation signal
1480
 * @param synth target buffer for synthesized speech data
1481
 * @return 0 on success, <0 on error.
1482
 */
1483
3306
static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
1484
                       float *samples,
1485
                       const double *lsps, const double *prev_lsps,
1486
                       float *excitation, float *synth)
1487
{
1488
3306
    WMAVoiceContext *s = ctx->priv_data;
1489
3306
    int n, n_blocks_x2, log_n_blocks_x2, av_uninit(cur_pitch_val);
1490
3306
    int pitch[MAX_BLOCKS], av_uninit(last_block_pitch);
1491
1492
    /* Parse frame type ("frame header"), see frame_descs */
1493
3306
    int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)], block_nsamples;
1494
1495
3306
    if (bd_idx < 0) {
1496
        av_log(ctx, AV_LOG_ERROR,
1497
               "Invalid frame type VLC code, skipping\n");
1498
        return AVERROR_INVALIDDATA;
1499
    }
1500
1501
3306
    block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
1502
1503
    /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
1504
3306
    if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
1505
        /* Pitch is provided per frame, which is interpreted as the pitch of
1506
         * the last sample of the last block of this frame. We can interpolate
1507
         * the pitch of other blocks (and even pitch-per-sample) by gradually
1508
         * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
1509
560
        n_blocks_x2      = frame_descs[bd_idx].n_blocks << 1;
1510
560
        log_n_blocks_x2  = frame_descs[bd_idx].log_n_blocks + 1;
1511
560
        cur_pitch_val    = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
1512
560
        cur_pitch_val    = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
1513
560
        if (s->last_acb_type == ACB_TYPE_NONE ||
1514
524
            20 * abs(cur_pitch_val - s->last_pitch_val) >
1515
524
                (cur_pitch_val + s->last_pitch_val))
1516
138
            s->last_pitch_val = cur_pitch_val;
1517
1518
        /* pitch per block */
1519
1836
        for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1520
1276
            int fac = n * 2 + 1;
1521
1522
1276
            pitch[n] = (MUL16(fac,                 cur_pitch_val) +
1523
1276
                        MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
1524
1276
                        frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
1525
        }
1526
1527
        /* "pitch-diff-per-sample" for calculation of pitch per sample */
1528
560
        s->pitch_diff_sh16 =
1529
560
            (cur_pitch_val - s->last_pitch_val) * (1 << 16) / MAX_FRAMESIZE;
1530
    }
1531
1532
    /* Global gain (if silence) and pitch-adaptive window coordinates */
1533
3306
    switch (frame_descs[bd_idx].fcb_type) {
1534
499
    case FCB_TYPE_SILENCE:
1535
499
        s->silence_gain = wmavoice_gain_silence[get_bits(gb, 8)];
1536
499
        break;
1537
341
    case FCB_TYPE_AW_PULSES:
1538
341
        aw_parse_coords(s, gb, pitch);
1539
341
        break;
1540
    }
1541
1542
14089
    for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1543
        int bl_pitch_sh2;
1544
1545
        /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
1546
10783
        switch (frame_descs[bd_idx].acb_type) {
1547
8464
        case ACB_TYPE_HAMMING: {
1548
            /* Pitch is given per block. Per-block pitches are encoded as an
1549
             * absolute value for the first block, and then delta values
1550
             * (relative to this value) for all subsequent blocks. The scale of
1551
             * this pitch value is semi-logarithmic compared to its use in the
1552
             * decoder, so we convert it to normal scale also. */
1553
            int block_pitch,
1554
8464
                t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
1555
8464
                t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
1556
8464
                t3 =  s->block_conv_table[3] - s->block_conv_table[2] + 1;
1557
1558
8464
            if (n == 0) {
1559
1975
                block_pitch = get_bits(gb, s->block_pitch_nbits);
1560
            } else
1561
6489
                block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
1562
6489
                                 get_bits(gb, s->block_delta_pitch_nbits);
1563
            /* Clamp last_block_pitch so the next delta (within block_delta_pitch_hrange) stays in range */
1564
8464
            last_block_pitch = av_clip(block_pitch,
1565
                                       s->block_delta_pitch_hrange,
1566
8464
                                       s->block_pitch_range -
1567
8464
                                           s->block_delta_pitch_hrange);
1568
1569
            /* Convert semi-log-style scale back to normal scale */
1570
8464
            if (block_pitch < t1) {
1571
1491
                bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
1572
            } else {
1573
6973
                block_pitch -= t1;
1574
6973
                if (block_pitch < t2) {
1575
5712
                    bl_pitch_sh2 =
1576
5712
                        (s->block_conv_table[1] << 2) + (block_pitch << 1);
1577
                } else {
1578
1261
                    block_pitch -= t2;
1579
1261
                    if (block_pitch < t3) {
1580
1261
                        bl_pitch_sh2 =
1581
1261
                            (s->block_conv_table[2] + block_pitch) << 2;
1582
                    } else
1583
                        bl_pitch_sh2 = s->block_conv_table[3] << 2;
1584
                }
1585
            }
1586
8464
            pitch[n] = bl_pitch_sh2 >> 2;
1587
8464
            break;
1588
        }
1589
1590
1276
        case ACB_TYPE_ASYMMETRIC: {
1591
1276
            bl_pitch_sh2 = pitch[n] << 2;
1592
1276
            break;
1593
        }
1594
1595
1043
        default: // ACB_TYPE_NONE has no pitch
1596
1043
            bl_pitch_sh2 = 0;
1597
1043
            break;
1598
        }
1599
1600
10783
        synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
1601
                    lsps, prev_lsps, &frame_descs[bd_idx],
1602
10783
                    &excitation[n * block_nsamples],
1603
10783
                    &synth[n * block_nsamples]);
1604
    }
1605
1606
    /* Averaging projection filter, if applicable. Else, just copy samples
1607
     * from synthesis buffer */
1608
3306
    if (s->do_apf) {
1609
        double i_lsps[MAX_LSPS];
1610
        float lpcs[MAX_LSPS];
1611
1612
46266
        for (n = 0; n < s->lsps; n++) // LSF -> LSP
1613
42960
            i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
1614
3306
        ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1615
3306
        postfilter(s, synth, samples, 80, lpcs,
1616
3306
                   &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx],
1617
3306
                   frame_descs[bd_idx].fcb_type, pitch[0]);
1618
1619
46266
        for (n = 0; n < s->lsps; n++) // LSF -> LSP
1620
42960
            i_lsps[n] = cos(lsps[n]);
1621
3306
        ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1622
3306
        postfilter(s, &synth[80], &samples[80], 80, lpcs,
1623
3306
                   &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80],
1624
3306
                   frame_descs[bd_idx].fcb_type, pitch[0]);
1625
    } else
1626
        memcpy(samples, synth, 160 * sizeof(synth[0]));
1627
1628
    /* Cache values for next frame */
1629
3306
    s->frame_cntr++;
1630
3306
    if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
1631
3306
    s->last_acb_type = frame_descs[bd_idx].acb_type;
1632

3306
    switch (frame_descs[bd_idx].acb_type) {
1633
771
    case ACB_TYPE_NONE:
1634
771
        s->last_pitch_val = 0;
1635
771
        break;
1636
560
    case ACB_TYPE_ASYMMETRIC:
1637
560
        s->last_pitch_val = cur_pitch_val;
1638
560
        break;
1639
1975
    case ACB_TYPE_HAMMING:
1640
1975
        s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
1641
1975
        break;
1642
    }
1643
1644
3306
    return 0;
1645
}
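
The pitch conversion in the ACB_TYPE_HAMMING branch of synth_frame() is compact enough to restate on its own; a minimal sketch, where conv stands in for s->block_conv_table and the function name is hypothetical:

/* Standalone version of the semi-log to linear pitch mapping above: the first
 * t1 codes advance in quarter-sample steps, the next t2 in half-sample steps,
 * the next t3 in full-sample steps, and anything beyond saturates at
 * block_conv_table[3]. The return value is the block pitch << 2. */
int block_pitch_to_sh2(int block_pitch, const int conv[4])
{
    int t1 = (conv[1] - conv[0]) << 2;
    int t2 = (conv[2] - conv[1]) << 1;
    int t3 =  conv[3] - conv[2] + 1;

    if (block_pitch < t1)
        return (conv[0] << 2) + block_pitch;
    block_pitch -= t1;
    if (block_pitch < t2)
        return (conv[1] << 2) + (block_pitch << 1);
    block_pitch -= t2;
    if (block_pitch < t3)
        return (conv[2] + block_pitch) << 2;
    return conv[3] << 2;
}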
1646
1647
/**
1648
 * Ensure minimum value for first item, maximum value for last value,
1649
 * proper spacing between each value and proper ordering.
1650
 *
1651
 * @param lsps array of LSPs
1652
 * @param num size of LSP array
1653
 *
1654
 * @note basically a double version of #ff_acelp_reorder_lsf(); it might be
1655
 *       useful to put in a generic location later on. Parts are also
1656
 *       present in #ff_set_min_dist_lsf() + #ff_sort_nearly_sorted_floats(),
1657
 *       which operate on floats.
1658
 */
1659
3306
static void stabilize_lsps(double *lsps, int num)
1660
{
1661
    int n, m, l;
1662
1663
    /* set minimum value for first, maximum value for last and minimum
1664
     * spacing between LSF values.
1665
     * Very similar to ff_set_min_dist_lsf(), but in double. */
1666
3306
    lsps[0]       = FFMAX(lsps[0],       0.0015 * M_PI);
1667
42960
    for (n = 1; n < num; n++)
1668
39654
        lsps[n]   = FFMAX(lsps[n],       lsps[n - 1] + 0.0125 * M_PI);
1669
3306
    lsps[num - 1] = FFMIN(lsps[num - 1], 0.9985 * M_PI);
1670
1671
    /* reorder (looks like one-time / non-recursed bubblesort).
1672
     * Very similar to ff_sort_nearly_sorted_floats(), but in double. */
1673
42960
    for (n = 1; n < num; n++) {
1674
39654
        if (lsps[n] < lsps[n - 1]) {
1675
            for (m = 1; m < num; m++) {
1676
                double tmp = lsps[m];
1677
                for (l = m - 1; l >= 0; l--) {
1678
                    if (lsps[l] <= tmp) break;
1679
                    lsps[l + 1] = lsps[l];
1680
                }
1681
                lsps[l + 1] = tmp;
1682
            }
1683
            break;
1684
        }
1685
    }
1686
3306
}
1687
1688
/**
1689
 * Synthesize output samples for a single superframe. If we have any data
1690
 * cached in s->sframe_cache, that will be used instead of whatever is loaded
1691
 * in s->gb.
1692
 *
1693
 * WMA Voice superframes contain 3 frames, each containing 160 audio samples,
1694
 * to give a total of 480 samples per superframe. See #synth_frame() for frame
1695
 * parsing. In addition to 3 frames, superframes can also contain the LSPs
1696
 * (if these are specified globally for all frames, i.e. residually; they can
1697
 * also be specified individually per frame, see the s->has_residual_lsps
1698
 * option), and can specify the number of samples encoded in this superframe
1699
 * (if less than 480), usually used to prevent blanks at track boundaries.
1700
 *
1701
 * @param ctx WMA Voice decoder context
1702
 * @return 0 on success, <0 on error or 1 if there was not enough data to
1703
 *         fully parse the superframe
1704
 */
1705
1102
static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
1706
                            int *got_frame_ptr)
1707
{
1708
1102
    WMAVoiceContext *s = ctx->priv_data;
1709
1102
    GetBitContext *gb = &s->gb, s_gb;
1710
1102
    int n, res, n_samples = MAX_SFRAMESIZE;
1711
    double lsps[MAX_FRAMES][MAX_LSPS];
1712
2204
    const double *mean_lsf = s->lsps == 16 ?
1713
1102
        wmavoice_mean_lsf16[s->lsp_def_mode] : wmavoice_mean_lsf10[s->lsp_def_mode];
1714
    float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
1715
    float synth[MAX_LSPS + MAX_SFRAMESIZE];
1716
    float *samples;
1717
1718
1102
    memcpy(synth,      s->synth_history,
1719
1102
           s->lsps             * sizeof(*synth));
1720
1102
    memcpy(excitation, s->excitation_history,
1721
1102
           s->history_nsamples * sizeof(*excitation));
1722
1723
1102
    if (s->sframe_cache_size > 0) {
1724
185
        gb = &s_gb;
1725
185
        init_get_bits(gb, s->sframe_cache, s->sframe_cache_size);
1726
185
        s->sframe_cache_size = 0;
1727
    }
1728
1729
    /* The first bit is the speech/music bit; it differentiates between WMAVoice
1730
     * speech samples (the actual codec) and WMAVoice music samples, which
1731
     * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
1732
     * the wild yet. */
1733
1102
    if (!get_bits1(gb)) {
1734
        avpriv_request_sample(ctx, "WMAPro-in-WMAVoice");
1735
        return AVERROR_PATCHWELCOME;
1736
    }
1737
1738
    /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
1739
1102
    if (get_bits1(gb)) {
1740
3
        if ((n_samples = get_bits(gb, 12)) > MAX_SFRAMESIZE) {
1741
            av_log(ctx, AV_LOG_ERROR,
1742
                   "Superframe encodes > %d samples (%d), not allowed\n",
1743
                   MAX_SFRAMESIZE, n_samples);
1744
            return AVERROR_INVALIDDATA;
1745
        }
1746
    }
1747
1748
    /* Parse LSPs, if global for the superframe (can also be per-frame). */
1749
1102
    if (s->has_residual_lsps) {
1750
        double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
1751
1752
15422
        for (n = 0; n < s->lsps; n++)
1753
14320
            prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
1754
1755
1102
        if (s->lsps == 10) {
1756
552
            dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1757
        } else /* s->lsps == 16 */
1758
550
            dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1759
1760
15422
        for (n = 0; n < s->lsps; n++) {
1761
14320
            lsps[0][n]  = mean_lsf[n] + (a1[n]           - a2[n * 2]);
1762
14320
            lsps[1][n]  = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
1763
14320
            lsps[2][n] += mean_lsf[n];
1764
        }
1765
4408
        for (n = 0; n < 3; n++)
1766
3306
            stabilize_lsps(lsps[n], s->lsps);
1767
    }
1768
1769
    /* synth_superframe can run multiple times per packet
1770
     * free potential previous frame */
1771
1102
    av_frame_unref(frame);
1772
1773
    /* get output buffer */
1774
1102
    frame->nb_samples = MAX_SFRAMESIZE;
1775
1102
    if ((res = ff_get_buffer(ctx, frame, 0)) < 0)
1776
        return res;
1777
1102
    frame->nb_samples = n_samples;
1778
1102
    samples = (float *)frame->data[0];
1779
1780
    /* Parse frames, optionally preceded by per-frame (independent) LSPs. */
1781
4408
    for (n = 0; n < 3; n++) {
1782
3306
        if (!s->has_residual_lsps) {
1783
            int m;
1784
1785
            if (s->lsps == 10) {
1786
                dequant_lsp10i(gb, lsps[n]);
1787
            } else /* s->lsps == 16 */
1788
                dequant_lsp16i(gb, lsps[n]);
1789
1790
            for (m = 0; m < s->lsps; m++)
1791
                lsps[n][m] += mean_lsf[m];
1792
            stabilize_lsps(lsps[n], s->lsps);
1793
        }
1794
1795
4408
        if ((res = synth_frame(ctx, gb, n,
1796
3306
                               &samples[n * MAX_FRAMESIZE],
1797
3306
                               lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
1798
3306
                               &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
1799
3306
                               &synth[s->lsps + n * MAX_FRAMESIZE]))) {
1800
            *got_frame_ptr = 0;
1801
            return res;
1802
        }
1803
    }
1804
1805
    /* Statistics? FIXME - we don't check for length, a slight overrun
1806
     * will be caught by internal buffer padding, and anything else
1807
     * will be skipped, not read. */
1808
1102
    if (get_bits1(gb)) {
1809
        res = get_bits(gb, 4);
1810
        skip_bits(gb, 10 * (res + 1));
1811
    }
1812
1813
1102
    if (get_bits_left(gb) < 0) {
1814
        wmavoice_flush(ctx);
1815
        return AVERROR_INVALIDDATA;
1816
    }
1817
1818
1102
    *got_frame_ptr = 1;
1819
1820
    /* Update history */
1821
1102
    memcpy(s->prev_lsps,           lsps[2],
1822
1102
           s->lsps             * sizeof(*s->prev_lsps));
1823
1102
    memcpy(s->synth_history,      &synth[MAX_SFRAMESIZE],
1824
1102
           s->lsps             * sizeof(*synth));
1825
1102
    memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
1826
1102
           s->history_nsamples * sizeof(*excitation));
1827
1102
    if (s->do_apf)
1828
1102
        memmove(s->zero_exc_pf,       &s->zero_exc_pf[MAX_SFRAMESIZE],
1829
1102
                s->history_nsamples * sizeof(*s->zero_exc_pf));
1830
1831
1102
    return 0;
1832
}
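
For orientation, the superframe layout that synth_superframe() walks through can be summarized as follows; this is an annotated outline of the reads above, with sizes taken from the code, not an authoritative bitstream specification:

/*
 * Superframe layout as parsed above (field sizes in bits):
 *   1                  speech/music flag (only the speech path is implemented)
 *   1 [+ 12]           "fewer than 480 samples" flag, optional sample count
 *   variable           residual LSPs for the whole superframe
 *                      (only if s->has_residual_lsps; otherwise LSPs precede
 *                      each frame individually)
 *   3 x frame data     three frames of 160 samples each, see synth_frame()
 *   1 [+ 4 + 10*(n+1)] statistics flag, optional statistics block (skipped)
 */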
1833
1834
/**
1835
 * Parse the packet header at the start of each packet (input data to this
1836
 * decoder).
1837
 *
1838
 * @param s WMA Voice decoding context private data
1839
 * @return <0 on error, nb_superframes on success.
1840
 */
1841
186
static int parse_packet_header(WMAVoiceContext *s)
1842
{
1843
186
    GetBitContext *gb = &s->gb;
1844
186
    unsigned int res, n_superframes = 0;
1845
1846
186
    skip_bits(gb, 4);          // packet sequence number
1847
186
    s->has_residual_lsps = get_bits1(gb);
1848
    do {
1849
186
        if (get_bits_left(gb) < 6 + s->spillover_bitsize)
1850
            return AVERROR_INVALIDDATA;
1851
1852
186
        res = get_bits(gb, 6); // number of superframes per packet
1853
                               // (minus first one if there is spillover)
1854
186
        n_superframes += res;
1855
186
    } while (res == 0x3F);
1856
186
    s->spillover_nbits   = get_bits(gb, s->spillover_bitsize);
1857
1858
186
    return get_bits_left(gb) >= 0 ? n_superframes : AVERROR_INVALIDDATA;
1859
}
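
Similarly, the packet header read above amounts to the following fields; an annotated outline of the code, not a formal specification:

/*
 * Packet header layout as parsed above (field sizes in bits):
 *   4                        packet sequence number (skipped)
 *   1                        has_residual_lsps
 *   6, repeated while 0x3F   number of superframes in this packet (summed)
 *   spillover_bitsize        spillover_nbits: number of bits following this
 *                            header that belong to a superframe started in
 *                            the previous packet
 */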
1860
1861
/**
1862
 * Copy (unaligned) bits from gb/data/size to pb.
1863
 *
1864
 * @param pb target buffer to copy bits into
1865
 * @param data source buffer to copy bits from
1866
 * @param size size of the source data, in bytes
1867
 * @param gb bit I/O context specifying the current position in the source
1868
 *           data. This function might use this to align the bit position to
1869
 *           a whole-byte boundary before calling #ff_copy_bits() on aligned
1870
 *           source data
1871
 * @param nbits the number of bits to copy from source to target
1872
 *
1873
 * @note after calling this function, the current position in the input bit
1874
 *       I/O context is undefined.
1875
 */
1876
370
static void copy_bits(PutBitContext *pb,
1877
                      const uint8_t *data, int size,
1878
                      GetBitContext *gb, int nbits)
1879
{
1880
    int rmn_bytes, rmn_bits;
1881
1882
370
    rmn_bits = rmn_bytes = get_bits_left(gb);
1883
370
    if (rmn_bits < nbits)
1884
        return;
1885
370
    if (nbits > put_bits_left(pb))
1886
        return;
1887
370
    rmn_bits &= 7; rmn_bytes >>= 3;
1888
370
    if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
1889
290
        put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
1890
370
    ff_copy_bits(pb, data + size - rmn_bytes,
1891
370
                 FFMIN(nbits - rmn_bits, rmn_bytes << 3));
1892
}
1893
1894
/**
1895
 * Packet decoding: a packet is anything that the (ASF) demuxer contains,
1896
 * and we expect that the demuxer / application provides it to us as such
1897
 * (else you'll probably get garbage as output). Every packet has a size of
1898
 * ctx->block_align bytes, starts with a packet header (see
1899
 * #parse_packet_header()), and then a series of superframes. Superframe
1900
 * boundaries may exceed packets, i.e. superframes can split data over
1901
 * multiple (two) packets.
1902
 *
1903
 * For more information about frames, see #synth_superframe().
1904
 */
1905
1291
static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
1906
                                  int *got_frame_ptr, AVPacket *avpkt)
1907
{
1908
1291
    WMAVoiceContext *s = ctx->priv_data;
1909
1291
    GetBitContext *gb = &s->gb;
1910
    int size, res, pos;
1911
1912
    /* Packets are sometimes a multiple of ctx->block_align, with a packet
1913
     * header at each ctx->block_align bytes. However, FFmpeg's ASF demuxer
1914
     * feeds us ASF packets, which may concatenate multiple "codec" packets
1915
     * in a single "muxer" packet, so we artificially emulate that by
1916
     * capping the packet size at ctx->block_align. */
1917
1471
    for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
1918
1291
    init_get_bits8(&s->gb, avpkt->data, size);
1919
1920
    /* size == ctx->block_align is used to indicate whether we are dealing with
1921
     * a new packet or a packet of which we already read the packet header
1922
     * previously. */
1923
1291
    if (!(size % ctx->block_align)) { // new packet header
1924
191
        if (!size) {
1925
5
            s->spillover_nbits = 0;
1926
5
            s->nb_superframes = 0;
1927
        } else {
1928
186
            if ((res = parse_packet_header(s)) < 0)
1929
                return res;
1930
186
            s->nb_superframes = res;
1931
        }
1932
1933
        /* If the packet header specifies a s->spillover_nbits, then we want
1934
         * to push out all data of the previous packet (+ spillover) before
1935
         * continuing to parse new superframes in the current packet. */
1936
191
        if (s->sframe_cache_size > 0) {
1937
185
            int cnt = get_bits_count(gb);
1938
185
            if (cnt + s->spillover_nbits > avpkt->size * 8) {
1939
                s->spillover_nbits = avpkt->size * 8 - cnt;
1940
            }
1941
185
            copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
1942
185
            flush_put_bits(&s->pb);
1943
185
            s->sframe_cache_size += s->spillover_nbits;
1944
185
            if ((res = synth_superframe(ctx, data, got_frame_ptr)) == 0 &&
1945
185
                *got_frame_ptr) {
1946
185
                cnt += s->spillover_nbits;
1947
185
                s->skip_bits_next = cnt & 7;
1948
185
                res = cnt >> 3;
1949
185
                return res;
1950
            } else
1951
                skip_bits_long (gb, s->spillover_nbits - cnt +
1952
                                get_bits_count(gb)); // resync
1953
6
        } else if (s->spillover_nbits) {
1954
            skip_bits_long(gb, s->spillover_nbits);  // resync
1955
        }
1956
1100
    } else if (s->skip_bits_next)
1957
971
        skip_bits(gb, s->skip_bits_next);
1958
1959
    /* Try parsing superframes in current packet */
1960
1106
    s->sframe_cache_size = 0;
1961
1106
    s->skip_bits_next = 0;
1962
1106
    pos = get_bits_left(gb);
1963
1106
    if (s->nb_superframes-- == 0) {
1964
4
        *got_frame_ptr = 0;
1965
4
        return size;
1966
1102
    } else if (s->nb_superframes > 0) {
1967
917
        if ((res = synth_superframe(ctx, data, got_frame_ptr)) < 0) {
1968
            return res;
1969
917
        } else if (*got_frame_ptr) {
1970
917
            int cnt = get_bits_count(gb);
1971
917
            s->skip_bits_next = cnt & 7;
1972
917
            res = cnt >> 3;
1973
917
            return res;
1974
        }
1975
185
    } else if ((s->sframe_cache_size = pos) > 0) {
1976
        /* ... cache it for spillover in next packet */
1977
185
        init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE);
1978
185
        copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
1979
        // FIXME bad - just copy whole bytes and use the
1980
        // skip_bits_next field
1981
    }
1982
1983
185
    return size;
1984
}
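
The capping loop at the top of wmavoice_decode_packet() has a simple closed form that may make the header/continuation distinction easier to see; a hypothetical illustration, assuming avpkt->size > 0:

/* For avpkt->size > 0 the loop is equivalent to
 *   size = ((avpkt->size - 1) % ctx->block_align) + 1;
 * e.g. with block_align = 512:
 *   avpkt->size = 1536  ->  size = 512  (size % block_align == 0: parse a header)
 *   avpkt->size = 1500  ->  size = 476  (continuation of an already-read packet)
 * and avpkt->size = 0 leaves size = 0, which also takes the header branch. */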
1985
1986
8
static av_cold int wmavoice_decode_end(AVCodecContext *ctx)
1987
{
1988
8
    WMAVoiceContext *s = ctx->priv_data;
1989
1990
8
    if (s->do_apf) {
1991
8
        ff_rdft_end(&s->rdft);
1992
8
        ff_rdft_end(&s->irdft);
1993
8
        ff_dct_end(&s->dct);
1994
8
        ff_dct_end(&s->dst);
1995
    }
1996
1997
8
    return 0;
1998
}
1999
2000
AVCodec ff_wmavoice_decoder = {
2001
    .name             = "wmavoice",
2002
    .long_name        = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),
2003
    .type             = AVMEDIA_TYPE_AUDIO,
2004
    .id               = AV_CODEC_ID_WMAVOICE,
2005
    .priv_data_size   = sizeof(WMAVoiceContext),
2006
    .init             = wmavoice_decode_init,
2007
    .close            = wmavoice_decode_end,
2008
    .decode           = wmavoice_decode_packet,
2009
    .capabilities     = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
2010
    .caps_internal    = FF_CODEC_CAP_INIT_CLEANUP,
2011
    .flush            = wmavoice_flush,
2012
};