FFmpeg coverage


Directory: ../../../ffmpeg/
File: src/libavcodec/wmavoice.c
Date: 2024-02-29 09:57:37
Exec Total Coverage
Lines: 680 774 87.9%
Functions: 28 29 96.6%
Branches: 308 394 78.2%

Line Branch Exec Source
1 /*
2 * Windows Media Audio Voice decoder.
3 * Copyright (c) 2009 Ronald S. Bultje
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22 /**
23 * @file
24 * @brief Windows Media Audio Voice compatible decoder
25 * @author Ronald S. Bultje <rsbultje@gmail.com>
26 */
27
28 #include <math.h>
29
30 #include "libavutil/channel_layout.h"
31 #include "libavutil/float_dsp.h"
32 #include "libavutil/mem_internal.h"
33 #include "libavutil/thread.h"
34 #include "libavutil/tx.h"
35 #include "avcodec.h"
36 #include "codec_internal.h"
37 #include "decode.h"
38 #include "get_bits.h"
39 #include "put_bits.h"
40 #include "wmavoice_data.h"
41 #include "celp_filters.h"
42 #include "acelp_vectors.h"
43 #include "acelp_filters.h"
44 #include "lsp.h"
45 #include "sinewin.h"
46
/** Maximum number of blocks per frame. */
#define MAX_BLOCKS           8

/** Maximum filter order. */
#define MAX_LSPS             16

/**
 * Same as #MAX_LSPS; needs to be a multiple of 16 for ASM input buffer
 * alignment.
 */
#define MAX_LSPS_ALIGN16     16

/** Maximum number of frames per superframe. */
#define MAX_FRAMES           3

/** Maximum number of samples per frame. */
#define MAX_FRAMESIZE        160

/** Maximum excitation signal history. */
#define MAX_SIGNAL_HISTORY   416

/** Maximum number of samples per superframe. */
#define MAX_SFRAMESIZE       (MAX_FRAMESIZE * MAX_FRAMES)

/** Maximum cache size for frame data that was split over two packets. */
#define SFRAME_CACHE_MAXSIZE 256

/** Number of bits to read per VLC iteration. */
#define VLC_NBITS            6
59
/**
 * Frame type VLC coding.
 *
 * Static lookup table for the frame type VLC. It is filled by
 * wmavoice_init_static_data(), which builds it from a list of code lengths
 * using #VLC_NBITS as the table's bit width.
 */
static VLCElem frame_type_vlc[132];
64
/**
 * Adaptive codebook types.
 */
enum {
    /** No adaptive codebook (only hardcoded fixed). */
    ACB_TYPE_NONE       = 0,

    /**
     * Adaptive codebook with per-frame pitch, which we interpolate to get a
     * per-sample pitch. Signal is generated using an asymmetric sinc window
     * function.
     * @note see #wmavoice_ipol1_coeffs
     */
    ACB_TYPE_ASYMMETRIC = 1,

    /**
     * Per-block pitch with signal generation using a Hamming sinc window
     * function.
     * @note see #wmavoice_ipol2_coeffs
     */
    ACB_TYPE_HAMMING    = 2
};
79
/**
 * Fixed codebook types.
 */
enum {
    /**
     * Comfort noise during silence, generated from a hardcoded (fixed)
     * codebook with per-frame (low) gain values.
     */
    FCB_TYPE_SILENCE    = 0,

    /** Hardcoded (fixed) codebook with per-block gain values. */
    FCB_TYPE_HARDCODED  = 1,

    /**
     * Pitch-adaptive window (AW) pulse signals, used in particular for
     * low-bitrate streams.
     */
    FCB_TYPE_AW_PULSES  = 2,

    /**
     * Innovation (fixed) codebook pulse sets in combinations of either
     * single pulses or pulse pairs.
     */
    FCB_TYPE_EXC_PULSES = 3,
};
95
96 /**
97 * Description of frame types.
98 */
99 static const struct frame_type_desc {
100 uint8_t n_blocks; ///< amount of blocks per frame (each block
101 ///< (contains 160/#n_blocks samples)
102 uint8_t log_n_blocks; ///< log2(#n_blocks)
103 uint8_t acb_type; ///< Adaptive codebook type (ACB_TYPE_*)
104 uint8_t fcb_type; ///< Fixed codebook type (FCB_TYPE_*)
105 uint8_t dbl_pulses; ///< how many pulse vectors have pulse pairs
106 ///< (rather than just one single pulse)
107 ///< only if #fcb_type == #FCB_TYPE_EXC_PULSES
108 } frame_descs[17] = {
109 { 1, 0, ACB_TYPE_NONE, FCB_TYPE_SILENCE, 0 },
110 { 2, 1, ACB_TYPE_NONE, FCB_TYPE_HARDCODED, 0 },
111 { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES, 0 },
112 { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2 },
113 { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5 },
114 { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0 },
115 { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2 },
116 { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5 },
117 { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0 },
118 { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2 },
119 { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5 },
120 { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0 },
121 { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2 },
122 { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5 },
123 { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0 },
124 { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2 },
125 { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5 }
126 };
127
128 /**
129 * WMA Voice decoding context.
130 */
131 typedef struct WMAVoiceContext {
132 /**
133 * @name Global values specified in the stream header / extradata or used all over.
134 * @{
135 */
136 GetBitContext gb; ///< packet bitreader. During decoder init,
137 ///< it contains the extradata from the
138 ///< demuxer. During decoding, it contains
139 ///< packet data.
140 int8_t vbm_tree[25]; ///< converts VLC codes to frame type
141
142 int spillover_bitsize; ///< number of bits used to specify
143 ///< #spillover_nbits in the packet header
144 ///< = ceil(log2(ctx->block_align << 3))
145 int history_nsamples; ///< number of samples in history for signal
146 ///< prediction (through ACB)
147
148 /* postfilter specific values */
149 int do_apf; ///< whether to apply the averaged
150 ///< projection filter (APF)
151 int denoise_strength; ///< strength of denoising in Wiener filter
152 ///< [0-11]
153 int denoise_tilt_corr; ///< Whether to apply tilt correction to the
154 ///< Wiener filter coefficients (postfilter)
155 int dc_level; ///< Predicted amount of DC noise, based
156 ///< on which a DC removal filter is used
157
158 int lsps; ///< number of LSPs per frame [10 or 16]
159 int lsp_q_mode; ///< defines quantizer defaults [0, 1]
160 int lsp_def_mode; ///< defines different sets of LSP defaults
161 ///< [0, 1]
162
163 int min_pitch_val; ///< base value for pitch parsing code
164 int max_pitch_val; ///< max value + 1 for pitch parsing
165 int pitch_nbits; ///< number of bits used to specify the
166 ///< pitch value in the frame header
167 int block_pitch_nbits; ///< number of bits used to specify the
168 ///< first block's pitch value
169 int block_pitch_range; ///< range of the block pitch
170 int block_delta_pitch_nbits; ///< number of bits used to specify the
171 ///< delta pitch between this and the last
172 ///< block's pitch value, used in all but
173 ///< first block
174 int block_delta_pitch_hrange; ///< 1/2 range of the delta (full range is
175 ///< from -this to +this-1)
176 uint16_t block_conv_table[4]; ///< boundaries for block pitch unit/scale
177 ///< conversion
178
179 /**
180 * @}
181 *
182 * @name Packet values specified in the packet header or related to a packet.
183 *
184 * A packet is considered to be a single unit of data provided to this
185 * decoder by the demuxer.
186 * @{
187 */
188 int spillover_nbits; ///< number of bits of the previous packet's
189 ///< last superframe preceding this
190 ///< packet's first full superframe (useful
191 ///< for re-synchronization also)
192 int has_residual_lsps; ///< if set, superframes contain one set of
193 ///< LSPs that cover all frames, encoded as
194 ///< independent and residual LSPs; if not
195 ///< set, each frame contains its own, fully
196 ///< independent, LSPs
197 int skip_bits_next; ///< number of bits to skip at the next call
198 ///< to #wmavoice_decode_packet() (since
199 ///< they're part of the previous superframe)
200
201 uint8_t sframe_cache[SFRAME_CACHE_MAXSIZE + AV_INPUT_BUFFER_PADDING_SIZE];
202 ///< cache for superframe data split over
203 ///< multiple packets
204 int sframe_cache_size; ///< set to >0 if we have data from an
205 ///< (incomplete) superframe from a previous
206 ///< packet that spilled over in the current
207 ///< packet; specifies the amount of bits in
208 ///< #sframe_cache
209 PutBitContext pb; ///< bitstream writer for #sframe_cache
210
211 /**
212 * @}
213 *
214 * @name Frame and superframe values
215 * Superframe and frame data - these can change from frame to frame,
216 * although some of them do in that case serve as a cache / history for
217 * the next frame or superframe.
218 * @{
219 */
220 double prev_lsps[MAX_LSPS]; ///< LSPs of the last frame of the previous
221 ///< superframe
222 int last_pitch_val; ///< pitch value of the previous frame
223 int last_acb_type; ///< frame type [0-2] of the previous frame
224 int pitch_diff_sh16; ///< ((cur_pitch_val - #last_pitch_val)
225 ///< << 16) / #MAX_FRAMESIZE
226 float silence_gain; ///< set for use in blocks if #ACB_TYPE_NONE
227
228 int aw_idx_is_ext; ///< whether the AW index was encoded in
229 ///< 8 bits (instead of 6)
230 int aw_pulse_range; ///< the range over which #aw_pulse_set1()
231 ///< can apply the pulse, relative to the
232 ///< value in aw_first_pulse_off. The exact
233 ///< position of the first AW-pulse is within
234 ///< [pulse_off, pulse_off + this], and
235 ///< depends on bitstream values; [16 or 24]
236 int aw_n_pulses[2]; ///< number of AW-pulses in each block; note
237 ///< that this number can be negative (in
238 ///< which case it basically means "zero")
239 int aw_first_pulse_off[2]; ///< index of first sample to which to
240 ///< apply AW-pulses, or -0xff if unset
241 int aw_next_pulse_off_cache; ///< the position (relative to start of the
242 ///< second block) at which pulses should
243 ///< start to be positioned, serves as a
244 ///< cache for pitch-adaptive window pulses
245 ///< between blocks
246
247 int frame_cntr; ///< current frame index [0 - 0xFFFE]; is
248 ///< only used for comfort noise in #pRNG()
249 int nb_superframes; ///< number of superframes in current packet
250 float gain_pred_err[6]; ///< cache for gain prediction
251 float excitation_history[MAX_SIGNAL_HISTORY];
252 ///< cache of the signal of previous
253 ///< superframes, used as a history for
254 ///< signal generation
255 float synth_history[MAX_LSPS]; ///< see #excitation_history
256 /**
257 * @}
258 *
259 * @name Postfilter values
260 *
261 * Variables used for postfilter implementation, mostly history for
262 * smoothing and so on, and context variables for FFT/iFFT.
263 * @{
264 */
265 AVTXContext *rdft, *irdft; ///< contexts for FFT-calculation in the
266 av_tx_fn rdft_fn, irdft_fn; ///< postfilter (for denoise filter)
267 AVTXContext *dct, *dst; ///< contexts for phase shift (in Hilbert
268 av_tx_fn dct_fn, dst_fn; ///< transform, part of postfilter)
269 float sin[511], cos[511]; ///< 8-bit cosine/sine windows over [-pi,pi]
270 ///< range
271 float postfilter_agc; ///< gain control memory, used in
272 ///< #adaptive_gain_control()
273 float dcf_mem[2]; ///< DC filter history
274 float zero_exc_pf[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];
275 ///< zero filter output (i.e. excitation)
276 ///< by postfilter
277 float denoise_filter_cache[MAX_FRAMESIZE];
278 int denoise_filter_cache_size; ///< samples in #denoise_filter_cache
279 DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x82];
280 ///< aligned buffer for LPC tilting
281 DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x82];
282 ///< aligned buffer for denoise coefficients
283 DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
284 ///< aligned buffer for postfilter speech
285 ///< synthesis
286 /**
287 * @}
288 */
289 } WMAVoiceContext;
290
291 /**
292 * Set up the variable bit mode (VBM) tree from container extradata.
293 * @param gb bit I/O context.
294 * The bit context (s->gb) should be loaded with byte 23-46 of the
295 * container extradata (i.e. the ones containing the VBM tree).
296 * @param vbm_tree pointer to array to which the decoded VBM tree will be
297 * written.
298 * @return 0 on success, <0 on error.
299 */
300 8 static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
301 {
302 8 int cntr[8] = { 0 }, n, res;
303
304 8 memset(vbm_tree, 0xff, sizeof(vbm_tree[0]) * 25);
305
2/2
✓ Branch 0 taken 136 times.
✓ Branch 1 taken 8 times.
144 for (n = 0; n < 17; n++) {
306 136 res = get_bits(gb, 3);
307
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 136 times.
136 if (cntr[res] > 3) // should be >= 3 + (res == 7))
308 return -1;
309 136 vbm_tree[res * 3 + cntr[res]++] = n;
310 }
311 8 return 0;
312 }
313
314 5 static av_cold void wmavoice_init_static_data(void)
315 {
316 static const uint8_t bits[] = {
317 2, 2, 2, 4, 4, 4,
318 6, 6, 6, 8, 8, 8,
319 10, 10, 10, 12, 12, 12,
320 14, 14, 14, 14
321 };
322
323 5 VLC_INIT_STATIC_TABLE_FROM_LENGTHS(frame_type_vlc, VLC_NBITS,
324 FF_ARRAY_ELEMS(bits), bits,
325 1, NULL, 0, 0, 0, 0);
326 5 }
327
328 static av_cold void wmavoice_flush(AVCodecContext *ctx)
329 {
330 WMAVoiceContext *s = ctx->priv_data;
331 int n;
332
333 s->postfilter_agc = 0;
334 s->sframe_cache_size = 0;
335 s->skip_bits_next = 0;
336 for (n = 0; n < s->lsps; n++)
337 s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
338 memset(s->excitation_history, 0,
339 sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
340 memset(s->synth_history, 0,
341 sizeof(*s->synth_history) * MAX_LSPS);
342 memset(s->gain_pred_err, 0,
343 sizeof(s->gain_pred_err));
344
345 if (s->do_apf) {
346 memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
347 sizeof(*s->synth_filter_out_buf) * s->lsps);
348 memset(s->dcf_mem, 0,
349 sizeof(*s->dcf_mem) * 2);
350 memset(s->zero_exc_pf, 0,
351 sizeof(*s->zero_exc_pf) * s->history_nsamples);
352 memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
353 }
354 }
355
356 /**
357 * Set up decoder with parameters from demuxer (extradata etc.).
358 */
359 8 static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
360 {
361 static AVOnce init_static_once = AV_ONCE_INIT;
362 int n, flags, pitch_range, lsp16_flag, ret;
363 8 WMAVoiceContext *s = ctx->priv_data;
364
365 8 ff_thread_once(&init_static_once, wmavoice_init_static_data);
366
367 /**
368 * Extradata layout:
369 * - byte 0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
370 * - byte 19-22: flags field (annoyingly in LE; see below for known
371 * values),
372 * - byte 23-46: variable bitmode tree (really just 17 * 3 bits,
373 * rest is 0).
374 */
375
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 8 times.
8 if (ctx->extradata_size != 46) {
376 av_log(ctx, AV_LOG_ERROR,
377 "Invalid extradata size %d (should be 46)\n",
378 ctx->extradata_size);
379 return AVERROR_INVALIDDATA;
380 }
381
2/4
✓ Branch 0 taken 8 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 8 times.
8 if (ctx->block_align <= 0 || ctx->block_align > (1<<22)) {
382 av_log(ctx, AV_LOG_ERROR, "Invalid block alignment %d.\n", ctx->block_align);
383 return AVERROR_INVALIDDATA;
384 }
385
386 8 flags = AV_RL32(ctx->extradata + 18);
387 8 s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
388 8 s->do_apf = flags & 0x1;
389
1/2
✓ Branch 0 taken 8 times.
✗ Branch 1 not taken.
8 if (s->do_apf) {
390 8 float scale = 1.0f;
391
392 8 ret = av_tx_init(&s->rdft, &s->rdft_fn, AV_TX_FLOAT_RDFT, 0, 1 << 7, &scale, 0);
393
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 8 times.
8 if (ret < 0)
394 return ret;
395
396 8 ret = av_tx_init(&s->irdft, &s->irdft_fn, AV_TX_FLOAT_RDFT, 1, 1 << 7, &scale, 0);
397
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 8 times.
8 if (ret < 0)
398 return ret;
399
400 8 scale = 1.0 / (1 << 6);
401 8 ret = av_tx_init(&s->dct, &s->dct_fn, AV_TX_FLOAT_DCT_I, 0, 1 << 6, &scale, 0);
402
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 8 times.
8 if (ret < 0)
403 return ret;
404
405 8 scale = 1.0 / (1 << 6);
406 8 ret = av_tx_init(&s->dst, &s->dst_fn, AV_TX_FLOAT_DST_I, 0, 1 << 6, &scale, 0);
407
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 8 times.
8 if (ret < 0)
408 return ret;
409
410 8 ff_sine_window_init(s->cos, 256);
411 8 memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0]));
412
2/2
✓ Branch 0 taken 2040 times.
✓ Branch 1 taken 8 times.
2048 for (n = 0; n < 255; n++) {
413 2040 s->sin[n] = -s->sin[510 - n];
414 2040 s->cos[510 - n] = s->cos[n];
415 }
416 }
417 8 s->denoise_strength = (flags >> 2) & 0xF;
418
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 8 times.
8 if (s->denoise_strength >= 12) {
419 av_log(ctx, AV_LOG_ERROR,
420 "Invalid denoise filter strength %d (max=11)\n",
421 s->denoise_strength);
422 return AVERROR_INVALIDDATA;
423 }
424 8 s->denoise_tilt_corr = !!(flags & 0x40);
425 8 s->dc_level = (flags >> 7) & 0xF;
426 8 s->lsp_q_mode = !!(flags & 0x2000);
427 8 s->lsp_def_mode = !!(flags & 0x4000);
428 8 lsp16_flag = flags & 0x1000;
429
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
8 if (lsp16_flag) {
430 4 s->lsps = 16;
431 } else {
432 4 s->lsps = 10;
433 }
434
2/2
✓ Branch 0 taken 104 times.
✓ Branch 1 taken 8 times.
112 for (n = 0; n < s->lsps; n++)
435 104 s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
436
437 8 init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
438
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 8 times.
8 if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
439 av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
440 return AVERROR_INVALIDDATA;
441 }
442
443
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 8 times.
8 if (ctx->sample_rate >= INT_MAX / (256 * 37))
444 return AVERROR_INVALIDDATA;
445
446 8 s->min_pitch_val = ((ctx->sample_rate << 8) / 400 + 50) >> 8;
447 8 s->max_pitch_val = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
448 8 pitch_range = s->max_pitch_val - s->min_pitch_val;
449
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 8 times.
8 if (pitch_range <= 0) {
450 av_log(ctx, AV_LOG_ERROR, "Invalid pitch range; broken extradata?\n");
451 return AVERROR_INVALIDDATA;
452 }
453 8 s->pitch_nbits = av_ceil_log2(pitch_range);
454 8 s->last_pitch_val = 40;
455 8 s->last_acb_type = ACB_TYPE_NONE;
456 8 s->history_nsamples = s->max_pitch_val + 8;
457
458
2/4
✓ Branch 0 taken 8 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 8 times.
8 if (s->min_pitch_val < 1 || s->history_nsamples > MAX_SIGNAL_HISTORY) {
459 int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
460 max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;
461
462 av_log(ctx, AV_LOG_ERROR,
463 "Unsupported samplerate %d (min=%d, max=%d)\n",
464 ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz
465
466 return AVERROR(ENOSYS);
467 }
468
469 8 s->block_conv_table[0] = s->min_pitch_val;
470 8 s->block_conv_table[1] = (pitch_range * 25) >> 6;
471 8 s->block_conv_table[2] = (pitch_range * 44) >> 6;
472 8 s->block_conv_table[3] = s->max_pitch_val - 1;
473 8 s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
474
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 8 times.
8 if (s->block_delta_pitch_hrange <= 0) {
475 av_log(ctx, AV_LOG_ERROR, "Invalid delta pitch hrange; broken extradata?\n");
476 return AVERROR_INVALIDDATA;
477 }
478 8 s->block_delta_pitch_nbits = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
479 8 s->block_pitch_range = s->block_conv_table[2] +
480 8 s->block_conv_table[3] + 1 +
481 8 2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
482 8 s->block_pitch_nbits = av_ceil_log2(s->block_pitch_range);
483
484 8 av_channel_layout_uninit(&ctx->ch_layout);
485 8 ctx->ch_layout = (AVChannelLayout)AV_CHANNEL_LAYOUT_MONO;
486 8 ctx->sample_fmt = AV_SAMPLE_FMT_FLT;
487
488 8 return 0;
489 }
490
491 /**
492 * @name Postfilter functions
493 * Postfilter functions (gain control, wiener denoise filter, DC filter,
494 * kalman smoothening, plus surrounding code to wrap it)
495 * @{
496 */
/**
 * Adaptive gain control (as used in postfilter).
 *
 * Identical to #ff_adaptive_gain_control() in acelp_vectors.c, except
 * that the energy here is calculated using sum(abs(...)), whereas the
 * other codecs (e.g. AMR-NB, SIPRO) use sqrt(dotproduct(...)).
 *
 * The per-sample gain follows mem = alpha * mem + (1 - alpha) * ratio,
 * where ratio is sum(|speech_synth|) / sum(|in|), or 0 if the latter sum
 * is zero. Each output sample is the input sample times the current gain.
 *
 * @param out output buffer for filtered samples
 * @param in input buffer containing the samples as they are after the
 *           postfilter steps so far
 * @param speech_synth input buffer containing speech synth before postfilter
 * @param size input buffer size
 * @param alpha exponential filter factor
 * @param gain_mem pointer to filter memory (single float); updated with the
 *                 gain reached after the last sample
 */
static void adaptive_gain_control(float *out, const float *in,
                                  const float *speech_synth,
                                  int size, float alpha, float *gain_mem)
{
    float ref_level = 0.0, pf_level = 0.0, step;
    float gain = *gain_mem;
    int k;

    /* signal levels as sums of absolute values */
    k = 0;
    while (k < size) {
        ref_level += fabsf(speech_synth[k]);
        pf_level  += fabsf(in[k]);
        k++;
    }

    if (pf_level == 0.0)
        step = 0.0;
    else
        step = (1.0 - alpha) * ref_level / pf_level;

    /* smooth the gain sample by sample while applying it */
    k = 0;
    while (k < size) {
        gain   = alpha * gain + step;
        out[k] = in[k] * gain;
        k++;
    }

    *gain_mem = gain;
}
534
535 /**
536 * Kalman smoothing function.
537 *
538 * This function looks back pitch +/- 3 samples back into history to find
539 * the best fitting curve (that one giving the optimal gain of the two
540 * signals, i.e. the highest dot product between the two), and then
541 * uses that signal history to smoothen the output of the speech synthesis
542 * filter.
543 *
544 * @param s WMA Voice decoding context
545 * @param pitch pitch of the speech signal
546 * @param in input speech signal
547 * @param out output pointer for smoothened signal
548 * @param size input/output buffer size
549 *
550 * @returns -1 if no smoothening took place, e.g. because no optimal
551 * fit could be found, or 0 on success.
552 */
553 5070 static int kalman_smoothen(WMAVoiceContext *s, int pitch,
554 const float *in, float *out, int size)
555 {
556 int n;
557 5070 float optimal_gain = 0, dot;
558
2/2
✓ Branch 0 taken 44 times.
✓ Branch 1 taken 5026 times.
5070 const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
559 5070 *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
560 5070 *best_hist_ptr = NULL;
561
562 /* find best fitting point in history */
563 do {
564 35388 dot = avpriv_scalarproduct_float_c(in, ptr, size);
565
2/2
✓ Branch 0 taken 12328 times.
✓ Branch 1 taken 23060 times.
35388 if (dot > optimal_gain) {
566 12328 optimal_gain = dot;
567 12328 best_hist_ptr = ptr;
568 }
569
2/2
✓ Branch 0 taken 30318 times.
✓ Branch 1 taken 5070 times.
35388 } while (--ptr >= end);
570
571
2/2
✓ Branch 0 taken 26 times.
✓ Branch 1 taken 5044 times.
5070 if (optimal_gain <= 0)
572 26 return -1;
573 5044 dot = avpriv_scalarproduct_float_c(best_hist_ptr, best_hist_ptr, size);
574
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 5044 times.
5044 if (dot <= 0) // would be 1.0
575 return -1;
576
577
2/2
✓ Branch 0 taken 4872 times.
✓ Branch 1 taken 172 times.
5044 if (optimal_gain <= dot) {
578 4872 dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000
579 } else
580 172 dot = 0.625;
581
582 /* actual smoothing */
583
2/2
✓ Branch 0 taken 403520 times.
✓ Branch 1 taken 5044 times.
408564 for (n = 0; n < size; n++)
584 403520 out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]);
585
586 5044 return 0;
587 }
588
/**
 * Get the tilt factor of a formant filter from its transfer function
 * @see #tilt_factor() in amrnbdec.c, which does essentially the same,
 *      but somehow (??) it does a speech synthesis filter in the
 *      middle, which is missing here
 *
 * The result is rh1 / rh0, with rh0 = 1 + the scalar product of the LPCs
 * with themselves and rh1 = lpcs[0] + the scalar product of the LPCs with
 * themselves shifted by one position.
 *
 * @param lpcs LPC coefficients
 * @param n_lpcs Size of LPC buffer
 * @returns the tilt factor
 */
static float tilt_factor(const float *lpcs, int n_lpcs)
{
    const float lag0 = 1.0 + avpriv_scalarproduct_float_c(lpcs, lpcs, n_lpcs);
    const float lag1 = lpcs[0] +
                       avpriv_scalarproduct_float_c(lpcs, &lpcs[1], n_lpcs - 1);

    return lag1 / lag0;
}
608
609 /**
610 * Derive denoise filter coefficients (in real domain) from the LPCs.
611 */
612 5614 static void calc_input_response(WMAVoiceContext *s, float *lpcs_src,
613 int fcb_type, float *coeffs_dst, int remainder)
614 {
615 5614 float last_coeff, min = 15.0, max = -15.0;
616 float irange, angle_mul, gain_mul, range, sq;
617 5614 LOCAL_ALIGNED_32(float, coeffs, [0x82]);
618 5614 LOCAL_ALIGNED_32(float, lpcs, [0x82]);
619 5614 LOCAL_ALIGNED_32(float, lpcs_dct, [0x82]);
620 int n, idx;
621
622 5614 memcpy(coeffs, coeffs_dst, 0x82*sizeof(float));
623
624 /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */
625 5614 s->rdft_fn(s->rdft, lpcs, lpcs_src, sizeof(float));
626 #define log_range(var, assign) do { \
627 float tmp = log10f(assign); var = tmp; \
628 max = FFMAX(max, tmp); min = FFMIN(min, tmp); \
629 } while (0)
630
2/4
✗ Branch 0 not taken.
✓ Branch 1 taken 5614 times.
✓ Branch 2 taken 5614 times.
✗ Branch 3 not taken.
5614 log_range(last_coeff, lpcs[64] * lpcs[64]);
631
2/2
✓ Branch 0 taken 353682 times.
✓ Branch 1 taken 5614 times.
359296 for (n = 1; n < 64; n++)
632
4/4
✓ Branch 0 taken 286139 times.
✓ Branch 1 taken 67543 times.
✓ Branch 2 taken 16276 times.
✓ Branch 3 taken 337406 times.
353682 log_range(lpcs[n], lpcs[n * 2] * lpcs[n * 2] +
633 lpcs[n * 2 + 1] * lpcs[n * 2 + 1]);
634
4/4
✓ Branch 0 taken 5604 times.
✓ Branch 1 taken 10 times.
✓ Branch 2 taken 1233 times.
✓ Branch 3 taken 4381 times.
5614 log_range(lpcs[0], lpcs[0] * lpcs[0]);
635 #undef log_range
636 5614 range = max - min;
637 5614 lpcs[64] = last_coeff;
638
639 /* Now, use this spectrum to pick out these frequencies with higher
640 * (relative) power/energy (which we then take to be "not noise"),
641 * and set up a table (still in lpc[]) of (relative) gains per frequency.
642 * These frequencies will be maintained, while others ("noise") will be
643 * decreased in the filter output. */
644 5614 irange = 64.0 / range; // so irange*(max-value) is in the range [0, 63]
645
2/2
✓ Branch 0 taken 544 times.
✓ Branch 1 taken 5070 times.
5614 gain_mul = range * (fcb_type == FCB_TYPE_HARDCODED ? (5.0 / 13.0) :
646 (5.0 / 14.7));
647 5614 angle_mul = gain_mul * (8.0 * M_LN10 / M_PI);
648
2/2
✓ Branch 0 taken 364910 times.
✓ Branch 1 taken 5614 times.
370524 for (n = 0; n <= 64; n++) {
649 float pwr;
650
651 364910 idx = lrint((max - lpcs[n]) * irange - 1);
652 364910 idx = FFMAX(0, idx);
653 364910 pwr = wmavoice_denoise_power_table[s->denoise_strength][idx];
654 364910 lpcs[n] = angle_mul * pwr;
655
656 /* 70.57 =~ 1/log10(1.0331663) */
657 364910 idx = av_clipf((pwr * gain_mul - 0.0295) * 70.570526123, 0, INT_MAX / 2);
658
659
2/2
✓ Branch 0 taken 9151 times.
✓ Branch 1 taken 355759 times.
364910 if (idx > 127) { // fall back if index falls outside table range
660 9151 coeffs[n] = wmavoice_energy_table[127] *
661 9151 powf(1.0331663, idx - 127);
662 } else
663 355759 coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];
664 }
665
666 /* calculate the Hilbert transform of the gains, which we do (since this
667 * is a sine input) by doing a phase shift (in theory, H(sin())=cos()).
668 * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the
669 * "moment" of the LPCs in this filter. */
670 5614 s->dct_fn(s->dct, lpcs_dct, lpcs, sizeof(float));
671 5614 s->dst_fn(s->dst, lpcs, lpcs_dct, sizeof(float));
672
673 /* Split out the coefficient indexes into phase/magnitude pairs */
674 5614 idx = 255 + av_clip(lpcs[64], -255, 255);
675 5614 coeffs[0] = coeffs[0] * s->cos[idx];
676 5614 idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255);
677 5614 last_coeff = coeffs[64] * s->cos[idx];
678 5614 for (n = 63;; n--) {
679 353682 idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255);
680 179648 coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
681 179648 coeffs[n * 2] = coeffs[n] * s->cos[idx];
682
683
2/2
✓ Branch 0 taken 5614 times.
✓ Branch 1 taken 174034 times.
179648 if (!--n) break;
684
685 174034 idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255);
686 174034 coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
687 174034 coeffs[n * 2] = coeffs[n] * s->cos[idx];
688 }
689 5614 coeffs[64] = last_coeff;
690
691 /* move into real domain */
692 5614 s->irdft_fn(s->irdft, coeffs_dst, coeffs, sizeof(AVComplexFloat));
693
694 /* tilt correction and normalize scale */
695 5614 memset(&coeffs_dst[remainder], 0, sizeof(coeffs_dst[0]) * (128 - remainder));
696
2/2
✓ Branch 0 taken 1484 times.
✓ Branch 1 taken 4130 times.
5614 if (s->denoise_tilt_corr) {
697 1484 float tilt_mem = 0;
698
699 1484 coeffs_dst[remainder - 1] = 0;
700 1484 ff_tilt_compensation(&tilt_mem,
701 1484 -1.8 * tilt_factor(coeffs_dst, remainder - 1),
702 coeffs_dst, remainder);
703 }
704 5614 sq = (1.0 / 64.0) * sqrtf(1 / avpriv_scalarproduct_float_c(coeffs_dst, coeffs_dst,
705 remainder));
706
2/2
✓ Branch 0 taken 263858 times.
✓ Branch 1 taken 5614 times.
269472 for (n = 0; n < remainder; n++)
707 263858 coeffs_dst[n] *= sq;
708 5614 }
709
710 /**
711 * This function applies a Wiener filter on the (noisy) speech signal as
712 * a means to denoise it.
713 *
714 * - take RDFT of LPCs to get the power spectrum of the noise + speech;
715 * - using this power spectrum, calculate (for each frequency) the Wiener
716 * filter gain, which depends on the frequency power and desired level
717 * of noise subtraction (when set too high, this leads to artifacts)
718 * We can do this symmetrically over the X-axis (so 0-4kHz is the inverse
719 * of 4-8kHz);
720 * - by doing a phase shift, calculate the Hilbert transform of this array
721 * of per-frequency filter-gains to get the filtering coefficients;
722 * - smoothen/normalize/de-tilt these filter coefficients as desired;
723 * - take RDFT of noisy sound, apply the coefficients and take its IRDFT
724 * to get the denoised speech signal;
725 * - the leftover (i.e. output of the IRDFT on denoised speech data beyond
726 * the frame boundary) are saved and applied to subsequent frames by an
727 * overlap-add method (otherwise you get clicking-artifacts).
728 *
729 * @param s WMA Voice decoding context
730 * @param fcb_type Frame (codebook) type
731 * @param synth_pf input: the noisy speech signal, output: denoised speech
732 * data; should be 16-byte aligned (for ASM purposes)
733 * @param size size of the speech data
734 * @param lpcs LPCs used to synthesize this frame's speech data
735 */
736 6612 static void wiener_denoise(WMAVoiceContext *s, int fcb_type,
737 float *synth_pf, int size,
738 const float *lpcs)
739 {
740 int remainder, lim, n;
741
742
2/2
✓ Branch 0 taken 5614 times.
✓ Branch 1 taken 998 times.
6612 if (fcb_type != FCB_TYPE_SILENCE) {
743 5614 LOCAL_ALIGNED_32(float, coeffs_f, [0x82]);
744 5614 LOCAL_ALIGNED_32(float, synth_f, [0x82]);
745 5614 float *tilted_lpcs = s->tilted_lpcs_pf,
746 5614 *coeffs = s->denoise_coeffs_pf, tilt_mem = 0;
747
748 5614 tilted_lpcs[0] = 1.0;
749 5614 memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps);
750 5614 memset(&tilted_lpcs[s->lsps + 1], 0,
751 5614 sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1));
752 5614 ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps),
753 5614 tilted_lpcs, s->lsps + 2);
754
755 /* The IRDFT output (127 samples for 7-bit filter) beyond the frame
756 * size is applied to the next frame. All input beyond this is zero,
757 * and thus all output beyond this will go towards zero, hence we can
758 * limit to min(size-1, 127-size) as a performance consideration. */
759
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 5614 times.
5614 remainder = FFMIN(127 - size, size - 1);
760 5614 calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder);
761
762 /* apply coefficients (in frequency spectrum domain), i.e. complex
763 * number multiplication */
764 5614 memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size));
765 5614 s->rdft_fn(s->rdft, synth_f, synth_pf, sizeof(float));
766 5614 s->rdft_fn(s->rdft, coeffs_f, coeffs, sizeof(float));
767 5614 synth_f[0] *= coeffs_f[0];
768 5614 synth_f[1] *= coeffs_f[1];
769
2/2
✓ Branch 0 taken 359296 times.
✓ Branch 1 taken 5614 times.
364910 for (n = 1; n <= 64; n++) {
770 359296 float v1 = synth_f[n * 2], v2 = synth_f[n * 2 + 1];
771 359296 synth_f[n * 2] = v1 * coeffs_f[n * 2] - v2 * coeffs_f[n * 2 + 1];
772 359296 synth_f[n * 2 + 1] = v2 * coeffs_f[n * 2] + v1 * coeffs_f[n * 2 + 1];
773 }
774 5614 s->irdft_fn(s->irdft, synth_pf, synth_f, sizeof(AVComplexFloat));
775 }
776
777 /* merge filter output with the history of previous runs */
778
2/2
✓ Branch 0 taken 5612 times.
✓ Branch 1 taken 1000 times.
6612 if (s->denoise_filter_cache_size) {
779 5612 lim = FFMIN(s->denoise_filter_cache_size, size);
780
2/2
✓ Branch 0 taken 263764 times.
✓ Branch 1 taken 5612 times.
269376 for (n = 0; n < lim; n++)
781 263764 synth_pf[n] += s->denoise_filter_cache[n];
782 5612 s->denoise_filter_cache_size -= lim;
783 5612 memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size],
784 5612 sizeof(s->denoise_filter_cache[0]) * s->denoise_filter_cache_size);
785 }
786
787 /* move remainder of filter output into a cache for future runs */
788
2/2
✓ Branch 0 taken 5614 times.
✓ Branch 1 taken 998 times.
6612 if (fcb_type != FCB_TYPE_SILENCE) {
789 5614 lim = FFMIN(remainder, s->denoise_filter_cache_size);
790
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 5614 times.
5614 for (n = 0; n < lim; n++)
791 s->denoise_filter_cache[n] += synth_pf[size + n];
792
1/2
✓ Branch 0 taken 5614 times.
✗ Branch 1 not taken.
5614 if (lim < remainder) {
793 5614 memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim],
794 5614 sizeof(s->denoise_filter_cache[0]) * (remainder - lim));
795 5614 s->denoise_filter_cache_size = remainder;
796 }
797 }
798 6612 }
799
800 /**
801 * Averaging projection filter, the postfilter used in WMAVoice.
802 *
803 * This uses the following steps:
804 * - A zero-synthesis filter (generate excitation from synth signal)
805 * - Kalman smoothing on excitation, based on pitch
806 * - Re-synthesized smoothened output
807 * - Iterative Wiener denoise filter
808 * - Adaptive gain filter
809 * - DC filter
810 *
811 * @param s WMAVoice decoding context
812 * @param synth Speech synthesis output (before postfilter)
813 * @param samples Output buffer for filtered samples
814 * @param size Buffer size of synth & samples
815 * @param lpcs Generated LPCs used for speech synthesis
816 * @param zero_exc_pf destination for zero synthesis filter (16-byte aligned)
817 * @param fcb_type Frame type (silence, hardcoded, AW-pulses or FCB-pulses)
818 * @param pitch Pitch of the input signal
819 */
820 6612 static void postfilter(WMAVoiceContext *s, const float *synth,
821 float *samples, int size,
822 const float *lpcs, float *zero_exc_pf,
823 int fcb_type, int pitch)
824 {
825 float synth_filter_in_buf[MAX_FRAMESIZE / 2],
826 6612 *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
827 6612 *synth_filter_in = zero_exc_pf;
828
829
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6612 times.
6612 av_assert0(size <= MAX_FRAMESIZE / 2);
830
831 /* generate excitation from input signal */
832 6612 ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);
833
834
4/4
✓ Branch 0 taken 5070 times.
✓ Branch 1 taken 1542 times.
✓ Branch 2 taken 5044 times.
✓ Branch 3 taken 26 times.
11682 if (fcb_type >= FCB_TYPE_AW_PULSES &&
835 5070 !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size))
836 5044 synth_filter_in = synth_filter_in_buf;
837
838 /* re-synthesize speech after smoothening, and keep history */
839 6612 ff_celp_lp_synthesis_filterf(synth_pf, lpcs,
840 synth_filter_in, size, s->lsps);
841 6612 memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps],
842 6612 sizeof(synth_pf[0]) * s->lsps);
843
844 6612 wiener_denoise(s, fcb_type, synth_pf, size, lpcs);
845
846 6612 adaptive_gain_control(samples, synth_pf, synth, size, 0.99,
847 &s->postfilter_agc);
848
849
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6612 times.
6612 if (s->dc_level > 8) {
850 /* remove ultra-low frequency DC noise / highpass filter;
851 * coefficients are identical to those used in SIPR decoding,
852 * and very closely resemble those used in AMR-NB decoding. */
853 ff_acelp_apply_order_2_transfer_function(samples, samples,
854 (const float[2]) { -1.99997, 1.0 },
855 (const float[2]) { -1.9330735188, 0.93589198496 },
856 0.93980580475, s->dcf_mem, size);
857 }
858 6612 }
859 /**
860 * @}
861 */
862
/**
 * Dequantize LSPs by summing the contributions of several codebook stages.
 *
 * For every stage, one row of @p num bytes is selected from that stage's
 * sub-table and added to the output as base + mul * byte. The sub-tables
 * of consecutive stages are stored back to back in @p table.
 *
 * @param lsps output pointer to the array that will hold the LSPs
 * @param num number of LSPs to be dequantized
 * @param values quantized values (row indices), contains n_stages values
 * @param sizes number of rows in each stage's sub-table
 * @param n_stages number of dequantization runs
 * @param table dequantization table to be used
 * @param mul_q LSF multiplier, one per stage
 * @param base_q base (lowest) LSF value, one per stage
 */
static void dequant_lsps(double *lsps, int num,
                         const uint16_t *values,
                         const uint16_t *sizes,
                         int n_stages, const uint8_t *table,
                         const double *mul_q,
                         const double *base_q)
{
    const uint8_t *stage_table = table;
    int stage;

    memset(lsps, 0, num * sizeof(*lsps));

    for (stage = 0; stage < n_stages; stage++) {
        const uint8_t *row  = stage_table + values[stage] * num;
        const double offset = base_q[stage];
        const double scale  = mul_q[stage];
        int k = 0;

        while (k < num) {
            lsps[k] += offset + scale * row[k];
            k++;
        }

        /* step over all rows of this stage to reach the next sub-table */
        stage_table += sizes[stage] * num;
    }
}
894
895 /**
896 * @name LSP dequantization routines
897 * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
898 * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
899 * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
900 * @{
901 */
902 /**
903 * Parse 10 independently-coded LSPs.
904 */
905 552 static void dequant_lsp10i(GetBitContext *gb, double *lsps)
906 {
907 static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
908 static const double mul_lsf[4] = {
909 5.2187144800e-3, 1.4626986422e-3,
910 9.6179549166e-4, 1.1325736225e-3
911 };
912 static const double base_lsf[4] = {
913 M_PI * -2.15522e-1, M_PI * -6.1646e-2,
914 M_PI * -3.3486e-2, M_PI * -5.7408e-2
915 };
916 uint16_t v[4];
917
918 552 v[0] = get_bits(gb, 8);
919 552 v[1] = get_bits(gb, 6);
920 552 v[2] = get_bits(gb, 5);
921 552 v[3] = get_bits(gb, 5);
922
923 552 dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
924 mul_lsf, base_lsf);
925 552 }
926
927 /**
928 * Parse 10 independently-coded LSPs, and then derive the tables to
929 * generate LSPs for the other frames from them (residual coding).
930 */
931 552 static void dequant_lsp10r(GetBitContext *gb,
932 double *i_lsps, const double *old,
933 double *a1, double *a2, int q_mode)
934 {
935 static const uint16_t vec_sizes[3] = { 128, 64, 64 };
936 static const double mul_lsf[3] = {
937 2.5807601174e-3, 1.2354460219e-3, 1.1763821673e-3
938 };
939 static const double base_lsf[3] = {
940 M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
941 };
942 552 const float (*ipol_tab)[2][10] = q_mode ?
943
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 552 times.
552 wmavoice_lsp10_intercoeff_b : wmavoice_lsp10_intercoeff_a;
944 uint16_t interpol, v[3];
945 int n;
946
947 552 dequant_lsp10i(gb, i_lsps);
948
949 552 interpol = get_bits(gb, 5);
950 552 v[0] = get_bits(gb, 7);
951 552 v[1] = get_bits(gb, 6);
952 552 v[2] = get_bits(gb, 6);
953
954
2/2
✓ Branch 0 taken 5520 times.
✓ Branch 1 taken 552 times.
6072 for (n = 0; n < 10; n++) {
955 5520 double delta = old[n] - i_lsps[n];
956 5520 a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
957 5520 a1[10 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
958 }
959
960 552 dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
961 mul_lsf, base_lsf);
962 552 }
963
964 /**
965 * Parse 16 independently-coded LSPs.
966 */
967 550 static void dequant_lsp16i(GetBitContext *gb, double *lsps)
968 {
969 static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
970 static const double mul_lsf[5] = {
971 3.3439586280e-3, 6.9908173703e-4,
972 3.3216608306e-3, 1.0334960326e-3,
973 3.1899104283e-3
974 };
975 static const double base_lsf[5] = {
976 M_PI * -1.27576e-1, M_PI * -2.4292e-2,
977 M_PI * -1.28094e-1, M_PI * -3.2128e-2,
978 M_PI * -1.29816e-1
979 };
980 uint16_t v[5];
981
982 550 v[0] = get_bits(gb, 8);
983 550 v[1] = get_bits(gb, 6);
984 550 v[2] = get_bits(gb, 7);
985 550 v[3] = get_bits(gb, 6);
986 550 v[4] = get_bits(gb, 7);
987
988 550 dequant_lsps( lsps, 5, v, vec_sizes, 2,
989 wmavoice_dq_lsp16i1, mul_lsf, base_lsf);
990 550 dequant_lsps(&lsps[5], 5, &v[2], &vec_sizes[2], 2,
991 wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
992 550 dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
993 wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
994 550 }
995
996 /**
997 * Parse 16 independently-coded LSPs, and then derive the tables to
998 * generate LSPs for the other frames from them (residual coding).
999 */
1000 550 static void dequant_lsp16r(GetBitContext *gb,
1001 double *i_lsps, const double *old,
1002 double *a1, double *a2, int q_mode)
1003 {
1004 static const uint16_t vec_sizes[3] = { 128, 128, 128 };
1005 static const double mul_lsf[3] = {
1006 1.2232979501e-3, 1.4062241527e-3, 1.6114744851e-3
1007 };
1008 static const double base_lsf[3] = {
1009 M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
1010 };
1011 550 const float (*ipol_tab)[2][16] = q_mode ?
1012
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 550 times.
550 wmavoice_lsp16_intercoeff_b : wmavoice_lsp16_intercoeff_a;
1013 uint16_t interpol, v[3];
1014 int n;
1015
1016 550 dequant_lsp16i(gb, i_lsps);
1017
1018 550 interpol = get_bits(gb, 5);
1019 550 v[0] = get_bits(gb, 7);
1020 550 v[1] = get_bits(gb, 7);
1021 550 v[2] = get_bits(gb, 7);
1022
1023
2/2
✓ Branch 0 taken 8800 times.
✓ Branch 1 taken 550 times.
9350 for (n = 0; n < 16; n++) {
1024 8800 double delta = old[n] - i_lsps[n];
1025 8800 a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
1026 8800 a1[16 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
1027 }
1028
1029 550 dequant_lsps( a2, 10, v, vec_sizes, 1,
1030 wmavoice_dq_lsp16r1, mul_lsf, base_lsf);
1031 550 dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
1032 wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
1033 550 dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
1034 wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
1035 550 }
1036
1037 /**
1038 * @}
1039 * @name Pitch-adaptive window coding functions
1040 * The next few functions are for pitch-adaptive window coding.
1041 * @{
1042 */
1043 /**
1044 * Parse the offset of the first pitch-adaptive window pulses, and
1045 * the distribution of pulses between the two blocks in this frame.
1046 * @param s WMA Voice decoding context private data
1047 * @param gb bit I/O context
1048 * @param pitch pitch for each block in this frame
1049 */
1050 341 static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb,
1051 const int *pitch)
1052 {
1053 static const int16_t start_offset[94] = {
1054 -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11,
1055 13, 15, 18, 17, 19, 20, 21, 22, 23, 24, 25, 26,
1056 27, 28, 29, 30, 31, 32, 33, 35, 37, 39, 41, 43,
1057 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67,
1058 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91,
1059 93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113, 115,
1060 117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
1061 141, 143, 145, 147, 149, 151, 153, 155, 157, 159
1062 };
1063 int bits, offset;
1064
1065 /* position of pulse */
1066 341 s->aw_idx_is_ext = 0;
1067
2/2
✓ Branch 1 taken 10 times.
✓ Branch 2 taken 331 times.
341 if ((bits = get_bits(gb, 6)) >= 54) {
1068 10 s->aw_idx_is_ext = 1;
1069 10 bits += (bits - 54) * 3 + get_bits(gb, 2);
1070 }
1071
1072 /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
1073 * the distribution of the pulses in each block contained in this frame. */
1074
2/2
✓ Branch 0 taken 338 times.
✓ Branch 1 taken 3 times.
341 s->aw_pulse_range = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
1075
2/2
✓ Branch 0 taken 50 times.
✓ Branch 1 taken 341 times.
391 for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
1076 341 s->aw_n_pulses[0] = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
1077 341 s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
1078 341 offset += s->aw_n_pulses[0] * pitch[0];
1079 341 s->aw_n_pulses[1] = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
1080 341 s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;
1081
1082 /* if continuing from a position before the block, reset position to
1083 * start of block (when corrected for the range over which it can be
1084 * spread in aw_pulse_set1()). */
1085
2/2
✓ Branch 0 taken 331 times.
✓ Branch 1 taken 10 times.
341 if (start_offset[bits] < MAX_FRAMESIZE / 2) {
1086
2/2
✓ Branch 0 taken 56 times.
✓ Branch 1 taken 331 times.
387 while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
1087 56 s->aw_first_pulse_off[1] -= pitch[1];
1088
2/2
✓ Branch 0 taken 50 times.
✓ Branch 1 taken 281 times.
331 if (start_offset[bits] < 0)
1089
2/2
✓ Branch 0 taken 50 times.
✓ Branch 1 taken 50 times.
100 while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
1090 50 s->aw_first_pulse_off[0] -= pitch[0];
1091 }
1092 341 }
1093
1094 /**
1095 * Apply second set of pitch-adaptive window pulses.
1096 * @param s WMA Voice decoding context private data
1097 * @param gb bit I/O context
1098 * @param block_idx block index in frame [0, 1]
1099 * @param fcb structure containing fixed codebook vector info
1100 * @return -1 on error, 0 otherwise
1101 */
1102 682 static int aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb,
1103 int block_idx, AMRFixed *fcb)
1104 {
1105 uint16_t use_mask_mem[9]; // only 5 are used, rest is padding
1106 682 uint16_t *use_mask = use_mask_mem + 2;
1107 /* in this function, idx is the index in the 80-bit (+ padding) use_mask
1108 * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
1109 * of idx are the position of the bit within a particular item in the
1110 * array (0 being the most significant bit, and 15 being the least
1111 * significant bit), and the remainder (>> 4) is the index in the
1112 * use_mask[]-array. This is faster and uses less memory than using a
1113 * 80-byte/80-int array. */
1114 682 int pulse_off = s->aw_first_pulse_off[block_idx],
1115 682 pulse_start, n, idx, range, aidx, start_off = 0;
1116
1117 /* set offset of first pulse to within this block */
1118
2/2
✓ Branch 0 taken 657 times.
✓ Branch 1 taken 25 times.
682 if (s->aw_n_pulses[block_idx] > 0)
1119
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 657 times.
657 while (pulse_off + s->aw_pulse_range < 1)
1120 pulse_off += fcb->pitch_lag;
1121
1122 /* find range per pulse */
1123
2/2
✓ Branch 0 taken 646 times.
✓ Branch 1 taken 36 times.
682 if (s->aw_n_pulses[0] > 0) {
1124
2/2
✓ Branch 0 taken 323 times.
✓ Branch 1 taken 323 times.
646 if (block_idx == 0) {
1125 323 range = 32;
1126 } else /* block_idx = 1 */ {
1127 323 range = 8;
1128
2/2
✓ Branch 0 taken 316 times.
✓ Branch 1 taken 7 times.
323 if (s->aw_n_pulses[block_idx] > 0)
1129 316 pulse_off = s->aw_next_pulse_off_cache;
1130 }
1131 } else
1132 36 range = 16;
1133
2/2
✓ Branch 0 taken 657 times.
✓ Branch 1 taken 25 times.
682 pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;
1134
1135 /* aw_pulse_set1() already applies pulses around pulse_off (to be exactly,
1136 * in the range of [pulse_off, pulse_off + s->aw_pulse_range], and thus
1137 * we exclude that range from being pulsed again in this function. */
1138 682 memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0]));
1139 682 memset( use_mask, -1, 5 * sizeof(use_mask[0]));
1140 682 memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
1141
2/2
✓ Branch 0 taken 657 times.
✓ Branch 1 taken 25 times.
682 if (s->aw_n_pulses[block_idx] > 0)
1142
2/2
✓ Branch 0 taken 911 times.
✓ Branch 1 taken 657 times.
1568 for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
1143 911 int excl_range = s->aw_pulse_range; // always 16 or 24
1144 911 uint16_t *use_mask_ptr = &use_mask[idx >> 4];
1145 911 int first_sh = 16 - (idx & 15);
1146 911 *use_mask_ptr++ &= 0xFFFFu << first_sh;
1147 911 excl_range -= first_sh;
1148
2/2
✓ Branch 0 taken 468 times.
✓ Branch 1 taken 443 times.
911 if (excl_range >= 16) {
1149 468 *use_mask_ptr++ = 0;
1150 468 *use_mask_ptr &= 0xFFFF >> (excl_range - 16);
1151 } else
1152 443 *use_mask_ptr &= 0xFFFF >> excl_range;
1153 }
1154
1155 /* find the 'aidx'th offset that is not excluded */
1156
2/2
✓ Branch 0 taken 646 times.
✓ Branch 1 taken 36 times.
682 aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
1157
2/2
✓ Branch 0 taken 16143 times.
✓ Branch 1 taken 682 times.
16825 for (n = 0; n <= aidx; pulse_start++) {
1158
2/2
✓ Branch 0 taken 2315 times.
✓ Branch 1 taken 16143 times.
18458 for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
1159
2/2
✓ Branch 0 taken 538 times.
✓ Branch 1 taken 15605 times.
16143 if (idx >= MAX_FRAMESIZE / 2) { // find from zero
1160
2/2
✓ Branch 0 taken 415 times.
✓ Branch 1 taken 123 times.
538 if (use_mask[0]) idx = 0x0F;
1161
2/2
✓ Branch 0 taken 105 times.
✓ Branch 1 taken 18 times.
123 else if (use_mask[1]) idx = 0x1F;
1162
1/2
✓ Branch 0 taken 18 times.
✗ Branch 1 not taken.
18 else if (use_mask[2]) idx = 0x2F;
1163 else if (use_mask[3]) idx = 0x3F;
1164 else if (use_mask[4]) idx = 0x4F;
1165 else return -1;
1166 538 idx -= av_log2_16bit(use_mask[idx >> 4]);
1167 }
1168
2/2
✓ Branch 0 taken 7465 times.
✓ Branch 1 taken 8678 times.
16143 if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
1169 7465 use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
1170 7465 n++;
1171 7465 start_off = idx;
1172 }
1173 }
1174
1175 682 fcb->x[fcb->n] = start_off;
1176
2/2
✓ Branch 1 taken 365 times.
✓ Branch 2 taken 317 times.
682 fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
1177 682 fcb->n++;
1178
1179 /* set offset for next block, relative to start of that block */
1180 682 n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
1181
2/2
✓ Branch 0 taken 676 times.
✓ Branch 1 taken 6 times.
682 s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
1182 682 return 0;
1183 }
1184
1185 /**
1186 * Apply first set of pitch-adaptive window pulses.
1187 * @param s WMA Voice decoding context private data
1188 * @param gb bit I/O context
1189 * @param block_idx block index in frame [0, 1]
1190 * @param fcb storage location for fixed codebook pulse info
1191 */
1192 682 static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb,
1193 int block_idx, AMRFixed *fcb)
1194 {
1195
4/4
✓ Branch 0 taken 20 times.
✓ Branch 1 taken 662 times.
✓ Branch 2 taken 10 times.
✓ Branch 3 taken 10 times.
682 int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
1196 float v;
1197
1198
2/2
✓ Branch 0 taken 657 times.
✓ Branch 1 taken 25 times.
682 if (s->aw_n_pulses[block_idx] > 0) {
1199 int n, v_mask, i_mask, sh, n_pulses;
1200
1201
2/2
✓ Branch 0 taken 652 times.
✓ Branch 1 taken 5 times.
657 if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
1202 652 n_pulses = 3;
1203 652 v_mask = 8;
1204 652 i_mask = 7;
1205 652 sh = 4;
1206 } else { // 4 pulses, 1:sign + 2:index each
1207 5 n_pulses = 4;
1208 5 v_mask = 4;
1209 5 i_mask = 3;
1210 5 sh = 3;
1211 }
1212
1213
2/2
✓ Branch 0 taken 1976 times.
✓ Branch 1 taken 657 times.
2633 for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
1214
2/2
✓ Branch 0 taken 950 times.
✓ Branch 1 taken 1026 times.
1976 fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
1215 1976 fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
1216 1976 s->aw_first_pulse_off[block_idx];
1217
2/2
✓ Branch 0 taken 241 times.
✓ Branch 1 taken 1976 times.
2217 while (fcb->x[fcb->n] < 0)
1218 241 fcb->x[fcb->n] += fcb->pitch_lag;
1219
2/2
✓ Branch 0 taken 1959 times.
✓ Branch 1 taken 17 times.
1976 if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
1220 1959 fcb->n++;
1221 }
1222 } else {
1223 25 int num2 = (val & 0x1FF) >> 1, delta, idx;
1224
1225
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 21 times.
25 if (num2 < 1 * 79) { delta = 1; idx = num2 + 1; }
1226
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 15 times.
21 else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
1227
2/2
✓ Branch 0 taken 10 times.
✓ Branch 1 taken 5 times.
15 else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
1228 5 else { delta = 7; idx = num2 + 1 - 3 * 75; }
1229
2/2
✓ Branch 0 taken 11 times.
✓ Branch 1 taken 14 times.
25 v = (val & 0x200) ? -1.0 : 1.0;
1230
1231 25 fcb->no_repeat_mask |= 3 << fcb->n;
1232 25 fcb->x[fcb->n] = idx - delta;
1233 25 fcb->y[fcb->n] = v;
1234 25 fcb->x[fcb->n + 1] = idx;
1235
2/2
✓ Branch 0 taken 7 times.
✓ Branch 1 taken 18 times.
25 fcb->y[fcb->n + 1] = (val & 1) ? -v : v;
1236 25 fcb->n += 2;
1237 }
1238 682 }
1239
1240 /**
1241 * @}
1242 *
1243 * Generate a random number from frame_cntr and block_num, which will live
1244 * in the range [0, 1000 - block_size) (so it can be used as an index in a
1245 * table of size 1000 of which you want to read block_size entries).
1246 *
1247 * @param frame_cntr current frame number
1248 * @param block_num current block index
1249 * @param block_size amount of entries we want to read from a table
1250 * that has 1000 entries
1251 * @return a (non-)random number in the [0, 1000 - block_size) range.
1252 */
static int pRNG(int frame_cntr, int block_num, int block_size)
{
    /* Division-free evaluation of
     *   y = (x % 9) * 5 + 6;
     *   z = (49995 * x) / y;
     * y can only take 9 different values, so z is rewritten as
     *   z = x * (49995 / y) + x * ((49995 % y) / y)
     * Each row below belongs to one value of y: the first entry is
     * 49995 / y, the second is the FASTDIV-style multiplier standing in
     * for (49995 % y) / y. */
    static const unsigned int div_tbl[9][2] = {
        { 8332,  3 * 715827883U }, // y =  6
        { 4545,  0 * 390451573U }, // y = 11
        { 3124, 11 * 268435456U }, // y = 16
        { 2380, 15 * 204522253U }, // y = 21
        { 1922, 23 * 165191050U }, // y = 26
        { 1612, 23 * 138547333U }, // y = 31
        { 1388, 27 * 119304648U }, // y = 36
        { 1219, 16 * 104755300U }, // y = 41
        { 1086, 39 *  93368855U }  // y = 46
    };
    unsigned int seed = MUL16(block_num, 1877) + frame_cntr;
    const unsigned int *row;
    unsigned int mod9, scaled;

    /* max value of seed is 8*1877+0xFFFE=0x13AA6, so a single subtraction
     * is effectively a modulo (%) */
    if (seed >= 0xFFFF)
        seed -= 0xFFFF;

    mod9   = seed - 9 * MULH(477218589, seed);  // seed % 9
    row    = div_tbl[mod9];
    scaled = (uint16_t) (seed * row[0] + UMULH(seed, row[1]));
                                                // seed * 49995 / (mod9 * 5 + 6)
    return scaled % (1000 - block_size);
}
1284
1285 /**
1286 * Parse hardcoded signal for a single block.
1287 * @note see #synth_block().
1288 */
1289 1043 static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb,
1290 int block_idx, int size,
1291 const struct frame_type_desc *frame_desc,
1292 float *excitation)
1293 {
1294 float gain;
1295 int n, r_idx;
1296
1297
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1043 times.
1043 av_assert0(size <= MAX_FRAMESIZE);
1298
1299 /* Set the offset from which we start reading wmavoice_std_codebook */
1300
2/2
✓ Branch 0 taken 499 times.
✓ Branch 1 taken 544 times.
1043 if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1301 499 r_idx = pRNG(s->frame_cntr, block_idx, size);
1302 499 gain = s->silence_gain;
1303 } else /* FCB_TYPE_HARDCODED */ {
1304 544 r_idx = get_bits(gb, 8);
1305 544 gain = wmavoice_gain_universal[get_bits(gb, 6)];
1306 }
1307
1308 /* Clear gain prediction parameters */
1309 1043 memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
1310
1311 /* Apply gain to hardcoded codebook and use that as excitation signal */
1312
2/2
✓ Branch 0 taken 123360 times.
✓ Branch 1 taken 1043 times.
124403 for (n = 0; n < size; n++)
1313 123360 excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
1314 1043 }
1315
1316 /**
1317 * Parse FCB/ACB signal for a single block.
1318 * @note see #synth_block().
1319 */
1320 9740 static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb,
1321 int block_idx, int size,
1322 int block_pitch_sh2,
1323 const struct frame_type_desc *frame_desc,
1324 float *excitation)
1325 {
1326 static const float gain_coeff[6] = {
1327 0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
1328 };
1329 float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
1330 int n, idx, gain_weight;
1331 AMRFixed fcb;
1332
1333
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 9740 times.
9740 av_assert0(size <= MAX_FRAMESIZE / 2);
1334 9740 memset(pulses, 0, sizeof(*pulses) * size);
1335
1336 9740 fcb.pitch_lag = block_pitch_sh2 >> 2;
1337 9740 fcb.pitch_fac = 1.0;
1338 9740 fcb.no_repeat_mask = 0;
1339 9740 fcb.n = 0;
1340
1341 /* For the other frame types, this is where we apply the innovation
1342 * (fixed) codebook pulses of the speech signal. */
1343
2/2
✓ Branch 0 taken 682 times.
✓ Branch 1 taken 9058 times.
9740 if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1344 682 aw_pulse_set1(s, gb, block_idx, &fcb);
1345
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 682 times.
682 if (aw_pulse_set2(s, gb, block_idx, &fcb)) {
1346 /* Conceal the block with silence and return.
1347 * Skip the correct amount of bits to read the next
1348 * block from the correct offset. */
1349 int r_idx = pRNG(s->frame_cntr, block_idx, size);
1350
1351 for (n = 0; n < size; n++)
1352 excitation[n] =
1353 wmavoice_std_codebook[r_idx + n] * s->silence_gain;
1354 skip_bits(gb, 7 + 1);
1355 return;
1356 }
1357 } else /* FCB_TYPE_EXC_PULSES */ {
1358 9058 int offset_nbits = 5 - frame_desc->log_n_blocks;
1359
1360 9058 fcb.no_repeat_mask = -1;
1361 /* similar to ff_decode_10_pulses_35bits(), but with single pulses
1362 * (instead of double) for a subset of pulses */
1363
2/2
✓ Branch 0 taken 45290 times.
✓ Branch 1 taken 9058 times.
54348 for (n = 0; n < 5; n++) {
1364 float sign;
1365 int pos1, pos2;
1366
1367
2/2
✓ Branch 1 taken 22708 times.
✓ Branch 2 taken 22582 times.
45290 sign = get_bits1(gb) ? 1.0 : -1.0;
1368 45290 pos1 = get_bits(gb, offset_nbits);
1369 45290 fcb.x[fcb.n] = n + 5 * pos1;
1370 45290 fcb.y[fcb.n++] = sign;
1371
2/2
✓ Branch 0 taken 36270 times.
✓ Branch 1 taken 9020 times.
45290 if (n < frame_desc->dbl_pulses) {
1372 36270 pos2 = get_bits(gb, offset_nbits);
1373 36270 fcb.x[fcb.n] = n + 5 * pos2;
1374
2/2
✓ Branch 0 taken 21490 times.
✓ Branch 1 taken 14780 times.
36270 fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
1375 }
1376 }
1377 }
1378 9740 ff_set_fixed_vector(pulses, &fcb, 1.0, size);
1379
1380 /* Calculate gain for adaptive & fixed codebook signal.
1381 * see ff_amr_set_fixed_gain(). */
1382 9740 idx = get_bits(gb, 7);
1383 9740 fcb_gain = expf(avpriv_scalarproduct_float_c(s->gain_pred_err,
1384 9740 gain_coeff, 6) -
1385 9740 5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
1386 9740 acb_gain = wmavoice_gain_codebook_acb[idx];
1387 9740 pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
1388 -2.9957322736 /* log(0.05) */,
1389 1.6094379124 /* log(5.0) */);
1390
1391 9740 gain_weight = 8 >> frame_desc->log_n_blocks;
1392 9740 memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
1393 9740 sizeof(*s->gain_pred_err) * (6 - gain_weight));
1394
2/2
✓ Branch 0 taken 20280 times.
✓ Branch 1 taken 9740 times.
30020 for (n = 0; n < gain_weight; n++)
1395 20280 s->gain_pred_err[n] = pred_err;
1396
1397 /* Calculation of adaptive codebook */
1398
2/2
✓ Branch 0 taken 1276 times.
✓ Branch 1 taken 8464 times.
9740 if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1399 int len;
1400
2/2
✓ Branch 0 taken 17876 times.
✓ Branch 1 taken 1276 times.
19152 for (n = 0; n < size; n += len) {
1401 int next_idx_sh16;
1402 17876 int abs_idx = block_idx * size + n;
1403 17876 int pitch_sh16 = (s->last_pitch_val << 16) +
1404 17876 s->pitch_diff_sh16 * abs_idx;
1405 17876 int pitch = (pitch_sh16 + 0x6FFF) >> 16;
1406 17876 int idx_sh16 = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
1407 17876 idx = idx_sh16 >> 16;
1408
2/2
✓ Branch 0 taken 17442 times.
✓ Branch 1 taken 434 times.
17876 if (s->pitch_diff_sh16) {
1409
2/2
✓ Branch 0 taken 10526 times.
✓ Branch 1 taken 6916 times.
17442 if (s->pitch_diff_sh16 > 0) {
1410 10526 next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
1411 } else
1412 6916 next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
1413 17442 len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
1414 1, size - n);
1415 } else
1416 434 len = size;
1417
1418 17876 ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
1419 wmavoice_ipol1_coeffs, 17,
1420 idx, 9, len);
1421 }
1422 } else /* ACB_TYPE_HAMMING */ {
1423 8464 int block_pitch = block_pitch_sh2 >> 2;
1424 8464 idx = block_pitch_sh2 & 3;
1425
2/2
✓ Branch 0 taken 3652 times.
✓ Branch 1 taken 4812 times.
8464 if (idx) {
1426 3652 ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
1427 wmavoice_ipol2_coeffs, 4,
1428 idx, 8, size);
1429 } else
1430 4812 av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch,
1431 sizeof(float) * size);
1432 }
1433
1434 /* Interpolate ACB/FCB and use as excitation signal */
1435 9740 ff_weighted_vector_sumf(excitation, excitation, pulses,
1436 acb_gain, fcb_gain, size);
1437 }
1438
1439 /**
1440 * Parse data in a single block.
1441 *
1442 * @param s WMA Voice decoding context private data
1443 * @param gb bit I/O context
1444 * @param block_idx index of the to-be-read block
1445 * @param size amount of samples to be read in this block
1446 * @param block_pitch_sh2 pitch for this block << 2
1447 * @param lsps LSPs for (the end of) this frame
1448 * @param prev_lsps LSPs for the last frame
1449 * @param frame_desc frame type descriptor
1450 * @param excitation target memory for the ACB+FCB interpolated signal
1451 * @param synth target memory for the speech synthesis filter output
1452 * @return 0 on success, <0 on error.
1453 */
1454 10783 static void synth_block(WMAVoiceContext *s, GetBitContext *gb,
1455 int block_idx, int size,
1456 int block_pitch_sh2,
1457 const double *lsps, const double *prev_lsps,
1458 const struct frame_type_desc *frame_desc,
1459 float *excitation, float *synth)
1460 {
1461 double i_lsps[MAX_LSPS];
1462 float lpcs[MAX_LSPS];
1463 float fac;
1464 int n;
1465
1466
2/2
✓ Branch 0 taken 1043 times.
✓ Branch 1 taken 9740 times.
10783 if (frame_desc->acb_type == ACB_TYPE_NONE)
1467 1043 synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
1468 else
1469 9740 synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
1470 frame_desc, excitation);
1471
1472 /* convert interpolated LSPs to LPCs */
1473 10783 fac = (block_idx + 0.5) / frame_desc->n_blocks;
1474
2/2
✓ Branch 0 taken 140776 times.
✓ Branch 1 taken 10783 times.
151559 for (n = 0; n < s->lsps; n++) // LSF -> LSP
1475 140776 i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
1476 10783 ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1477
1478 /* Speech synthesis */
1479 10783 ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
1480 10783 }
1481
1482 /**
1483 * Synthesize output samples for a single frame.
1484 *
1485 * @param ctx WMA Voice decoder context
1486 * @param gb bit I/O context (s->gb or one for cross-packet superframes)
1487 * @param frame_idx Frame number within superframe [0-2]
1488 * @param samples pointer to output sample buffer, has space for at least 160
1489 * samples
1490 * @param lsps LSP array
1491 * @param prev_lsps array of previous frame's LSPs
1492 * @param excitation target buffer for excitation signal
1493 * @param synth target buffer for synthesized speech data
1494 * @return 0 on success, <0 on error.
1495 */
1496 3306 static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
1497 float *samples,
1498 const double *lsps, const double *prev_lsps,
1499 float *excitation, float *synth)
1500 {
1501 3306 WMAVoiceContext *s = ctx->priv_data;
1502 3306 int n, n_blocks_x2, log_n_blocks_x2, av_uninit(cur_pitch_val);
1503 3306 int pitch[MAX_BLOCKS], av_uninit(last_block_pitch);
1504
1505 /* Parse frame type ("frame header"), see frame_descs */
1506 3306 int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc, 6, 3)], block_nsamples;
1507
1508
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3306 times.
3306 if (bd_idx < 0) {
1509 av_log(ctx, AV_LOG_ERROR,
1510 "Invalid frame type VLC code, skipping\n");
1511 return AVERROR_INVALIDDATA;
1512 }
1513
1514 3306 block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
1515
1516 /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
1517
2/2
✓ Branch 0 taken 560 times.
✓ Branch 1 taken 2746 times.
3306 if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
1518 /* Pitch is provided per frame, which is interpreted as the pitch of
1519 * the last sample of the last block of this frame. We can interpolate
1520 * the pitch of other blocks (and even pitch-per-sample) by gradually
1521 * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
1522 560 n_blocks_x2 = frame_descs[bd_idx].n_blocks << 1;
1523 560 log_n_blocks_x2 = frame_descs[bd_idx].log_n_blocks + 1;
1524 560 cur_pitch_val = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
1525
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 560 times.
560 cur_pitch_val = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
1526
2/2
✓ Branch 0 taken 524 times.
✓ Branch 1 taken 36 times.
560 if (s->last_acb_type == ACB_TYPE_NONE ||
1527 524 20 * abs(cur_pitch_val - s->last_pitch_val) >
1528
2/2
✓ Branch 0 taken 102 times.
✓ Branch 1 taken 422 times.
524 (cur_pitch_val + s->last_pitch_val))
1529 138 s->last_pitch_val = cur_pitch_val;
1530
1531 /* pitch per block */
1532
2/2
✓ Branch 0 taken 1276 times.
✓ Branch 1 taken 560 times.
1836 for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1533 1276 int fac = n * 2 + 1;
1534
1535 1276 pitch[n] = (MUL16(fac, cur_pitch_val) +
1536 1276 MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
1537 1276 frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
1538 }
1539
1540 /* "pitch-diff-per-sample" for calculation of pitch per sample */
1541 560 s->pitch_diff_sh16 =
1542 560 (cur_pitch_val - s->last_pitch_val) * (1 << 16) / MAX_FRAMESIZE;
1543 }
1544
1545 /* Global gain (if silence) and pitch-adaptive window coordinates */
1546
3/3
✓ Branch 0 taken 499 times.
✓ Branch 1 taken 341 times.
✓ Branch 2 taken 2466 times.
3306 switch (frame_descs[bd_idx].fcb_type) {
1547 499 case FCB_TYPE_SILENCE:
1548 499 s->silence_gain = wmavoice_gain_silence[get_bits(gb, 8)];
1549 499 break;
1550 341 case FCB_TYPE_AW_PULSES:
1551 341 aw_parse_coords(s, gb, pitch);
1552 341 break;
1553 }
1554
1555
2/2
✓ Branch 0 taken 10783 times.
✓ Branch 1 taken 3306 times.
14089 for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1556 int bl_pitch_sh2;
1557
1558 /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
1559
3/3
✓ Branch 0 taken 8464 times.
✓ Branch 1 taken 1276 times.
✓ Branch 2 taken 1043 times.
10783 switch (frame_descs[bd_idx].acb_type) {
1560 8464 case ACB_TYPE_HAMMING: {
1561 /* Pitch is given per block. Per-block pitches are encoded as an
1562 * absolute value for the first block, and then as delta values
1563 * (relative to the previous block) for all subsequent blocks. The scale of
1564 * this pitch value is semi-logarithmic compared to its use in the
1565 * decoder, so we convert it to normal scale also. */
1566 int block_pitch,
1567 8464 t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
1568 8464 t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
1569 8464 t3 = s->block_conv_table[3] - s->block_conv_table[2] + 1;
1570
1571
2/2
✓ Branch 0 taken 1975 times.
✓ Branch 1 taken 6489 times.
8464 if (n == 0) {
1572 1975 block_pitch = get_bits(gb, s->block_pitch_nbits);
1573 } else
1574 6489 block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
1575 6489 get_bits(gb, s->block_delta_pitch_nbits);
1576 /* Clip last_block_pitch so that any next delta stays within block_pitch_range */
1577 8464 last_block_pitch = av_clip(block_pitch,
1578 s->block_delta_pitch_hrange,
1579 8464 s->block_pitch_range -
1580 8464 s->block_delta_pitch_hrange);
1581
1582 /* Convert semi-log-style scale back to normal scale */
1583
2/2
✓ Branch 0 taken 1491 times.
✓ Branch 1 taken 6973 times.
8464 if (block_pitch < t1) {
1584 1491 bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
1585 } else {
1586 6973 block_pitch -= t1;
1587
2/2
✓ Branch 0 taken 5712 times.
✓ Branch 1 taken 1261 times.
6973 if (block_pitch < t2) {
1588 5712 bl_pitch_sh2 =
1589 5712 (s->block_conv_table[1] << 2) + (block_pitch << 1);
1590 } else {
1591 1261 block_pitch -= t2;
1592
1/2
✓ Branch 0 taken 1261 times.
✗ Branch 1 not taken.
1261 if (block_pitch < t3) {
1593 1261 bl_pitch_sh2 =
1594 1261 (s->block_conv_table[2] + block_pitch) << 2;
1595 } else
1596 bl_pitch_sh2 = s->block_conv_table[3] << 2;
1597 }
1598 }
1599 8464 pitch[n] = bl_pitch_sh2 >> 2;
1600 8464 break;
1601 }
1602
1603 1276 case ACB_TYPE_ASYMMETRIC: {
1604 1276 bl_pitch_sh2 = pitch[n] << 2;
1605 1276 break;
1606 }
1607
1608 1043 default: // ACB_TYPE_NONE has no pitch
1609 1043 bl_pitch_sh2 = 0;
1610 1043 break;
1611 }
1612
1613 10783 synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
1614 lsps, prev_lsps, &frame_descs[bd_idx],
1615 10783 &excitation[n * block_nsamples],
1616 10783 &synth[n * block_nsamples]);
1617 }
1618
1619 /* Averaging projection filter, if applicable. Else, just copy samples
1620 * from synthesis buffer */
1621
1/2
✓ Branch 0 taken 3306 times.
✗ Branch 1 not taken.
3306 if (s->do_apf) {
1622 double i_lsps[MAX_LSPS];
1623 float lpcs[MAX_LSPS];
1624
1625
2/2
✓ Branch 0 taken 42960 times.
✓ Branch 1 taken 3306 times.
46266 for (n = 0; n < s->lsps; n++) // LSF -> LSP
1626 42960 i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
1627 3306 ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1628 3306 postfilter(s, synth, samples, 80, lpcs,
1629 3306 &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx],
1630 3306 frame_descs[bd_idx].fcb_type, pitch[0]);
1631
1632
2/2
✓ Branch 0 taken 42960 times.
✓ Branch 1 taken 3306 times.
46266 for (n = 0; n < s->lsps; n++) // LSF -> LSP
1633 42960 i_lsps[n] = cos(lsps[n]);
1634 3306 ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1635 3306 postfilter(s, &synth[80], &samples[80], 80, lpcs,
1636 3306 &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80],
1637 3306 frame_descs[bd_idx].fcb_type, pitch[0]);
1638 } else
1639 memcpy(samples, synth, 160 * sizeof(synth[0]));
1640
1641 /* Cache values for next frame */
1642 3306 s->frame_cntr++;
1643
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3306 times.
3306 if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
1644 3306 s->last_acb_type = frame_descs[bd_idx].acb_type;
1645
3/4
✓ Branch 0 taken 771 times.
✓ Branch 1 taken 560 times.
✓ Branch 2 taken 1975 times.
✗ Branch 3 not taken.
3306 switch (frame_descs[bd_idx].acb_type) {
1646 771 case ACB_TYPE_NONE:
1647 771 s->last_pitch_val = 0;
1648 771 break;
1649 560 case ACB_TYPE_ASYMMETRIC:
1650 560 s->last_pitch_val = cur_pitch_val;
1651 560 break;
1652 1975 case ACB_TYPE_HAMMING:
1653 1975 s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
1654 1975 break;
1655 }
1656
1657 3306 return 0;
1658 }
1659
1660 /**
1661 * Ensure minimum value for first item, maximum value for last value,
1662 * proper spacing between each value and proper ordering.
1663 *
1664 * @param lsps array of LSPs
1665 * @param num size of LSP array
1666 *
1667 * @note basically a double version of #ff_acelp_reorder_lsf(), might be
1668 * useful to put in a generic location later on. Parts are also
1669 * present in #ff_set_min_dist_lsf() + #ff_sort_nearly_sorted_floats(),
1670 * which is in float.
1671 */
1672 3306 static void stabilize_lsps(double *lsps, int num)
1673 {
1674 int n, m, l;
1675
1676 /* set minimum value for first, maximum value for last and minimum
1677 * spacing between LSF values.
1678 * Very similar to ff_set_min_dist_lsf(), but in double. */
1679
1/2
✓ Branch 0 taken 3306 times.
✗ Branch 1 not taken.
3306 lsps[0] = FFMAX(lsps[0], 0.0015 * M_PI);
1680
2/2
✓ Branch 0 taken 39654 times.
✓ Branch 1 taken 3306 times.
42960 for (n = 1; n < num; n++)
1681
2/2
✓ Branch 0 taken 39138 times.
✓ Branch 1 taken 516 times.
39654 lsps[n] = FFMAX(lsps[n], lsps[n - 1] + 0.0125 * M_PI);
1682
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3306 times.
3306 lsps[num - 1] = FFMIN(lsps[num - 1], 0.9985 * M_PI);
1683
1684 /* reorder (one insertion sort over the array, run only when needed).
1685 * Very similar to ff_sort_nearly_sorted_floats(), but in double. */
1686
2/2
✓ Branch 0 taken 39654 times.
✓ Branch 1 taken 3306 times.
42960 for (n = 1; n < num; n++) {
1687
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 39654 times.
39654 if (lsps[n] < lsps[n - 1]) {
1688 for (m = 1; m < num; m++) {
1689 double tmp = lsps[m];
1690 for (l = m - 1; l >= 0; l--) {
1691 if (lsps[l] <= tmp) break;
1692 lsps[l + 1] = lsps[l];
1693 }
1694 lsps[l + 1] = tmp;
1695 }
1696 break;
1697 }
1698 }
1699 3306 }
1700
1701 /**
1702 * Synthesize output samples for a single superframe. If we have any data
1703 * cached in s->sframe_cache, that will be used instead of whatever is loaded
1704 * in s->gb.
1705 *
1706 * WMA Voice superframes contain 3 frames, each containing 160 audio samples,
1707 * to give a total of 480 samples per superframe. See #synth_frame() for frame
1708 * parsing. In addition to 3 frames, superframes can also contain the LSPs
1709 * (if these are globally specified for all frames (residually); they can
1710 * also be specified individually per-frame. See the s->has_residual_lsps
1711 * option), and can specify the number of samples encoded in this superframe
1712 * (if less than 480), usually used to prevent blanks at track boundaries.
1713 *
1714 * @param ctx WMA Voice decoder context
1715 * @return 0 on success (with *got_frame_ptr set to 1), <0 on error
1716 * (e.g. when the superframe data runs past the end of the bit buffer)
1717 */
1718 1102 static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
1719 int *got_frame_ptr)
1720 {
1721 1102 WMAVoiceContext *s = ctx->priv_data;
1722 1102 GetBitContext *gb = &s->gb, s_gb;
1723 1102 int n, res, n_samples = MAX_SFRAMESIZE;
1724 double lsps[MAX_FRAMES][MAX_LSPS];
1725 2204 const double *mean_lsf = s->lsps == 16 ?
1726
2/2
✓ Branch 0 taken 550 times.
✓ Branch 1 taken 552 times.
1102 wmavoice_mean_lsf16[s->lsp_def_mode] : wmavoice_mean_lsf10[s->lsp_def_mode];
1727 float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
1728 float synth[MAX_LSPS + MAX_SFRAMESIZE];
1729 float *samples;
1730
1731 1102 memcpy(synth, s->synth_history,
1732 1102 s->lsps * sizeof(*synth));
1733 1102 memcpy(excitation, s->excitation_history,
1734 1102 s->history_nsamples * sizeof(*excitation));
1735
1736
2/2
✓ Branch 0 taken 185 times.
✓ Branch 1 taken 917 times.
1102 if (s->sframe_cache_size > 0) {
1737 185 gb = &s_gb;
1738 185 init_get_bits(gb, s->sframe_cache, s->sframe_cache_size);
1739 185 s->sframe_cache_size = 0;
1740 }
1741
1742 /* First bit is speech/music bit, it differentiates between WMAVoice
1743 * speech samples (the actual codec) and WMAVoice music samples, which
1744 * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
1745 * the wild yet. */
1746
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 1102 times.
1102 if (!get_bits1(gb)) {
1747 avpriv_request_sample(ctx, "WMAPro-in-WMAVoice");
1748 return AVERROR_PATCHWELCOME;
1749 }
1750
1751 /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
1752
2/2
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 1099 times.
1102 if (get_bits1(gb)) {
1753
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 3 times.
3 if ((n_samples = get_bits(gb, 12)) > MAX_SFRAMESIZE) {
1754 av_log(ctx, AV_LOG_ERROR,
1755 "Superframe encodes > %d samples (%d), not allowed\n",
1756 MAX_SFRAMESIZE, n_samples);
1757 return AVERROR_INVALIDDATA;
1758 }
1759 }
1760
1761 /* Parse LSPs, if global for the superframe (can also be per-frame). */
1762
1/2
✓ Branch 0 taken 1102 times.
✗ Branch 1 not taken.
1102 if (s->has_residual_lsps) {
1763 double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
1764
1765
2/2
✓ Branch 0 taken 14320 times.
✓ Branch 1 taken 1102 times.
15422 for (n = 0; n < s->lsps; n++)
1766 14320 prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
1767
1768
2/2
✓ Branch 0 taken 552 times.
✓ Branch 1 taken 550 times.
1102 if (s->lsps == 10) {
1769 552 dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1770 } else /* s->lsps == 16 */
1771 550 dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1772
1773
2/2
✓ Branch 0 taken 14320 times.
✓ Branch 1 taken 1102 times.
15422 for (n = 0; n < s->lsps; n++) {
1774 14320 lsps[0][n] = mean_lsf[n] + (a1[n] - a2[n * 2]);
1775 14320 lsps[1][n] = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
1776 14320 lsps[2][n] += mean_lsf[n];
1777 }
1778
2/2
✓ Branch 0 taken 3306 times.
✓ Branch 1 taken 1102 times.
4408 for (n = 0; n < 3; n++)
1779 3306 stabilize_lsps(lsps[n], s->lsps);
1780 }
1781
1782 /* synth_superframe can run multiple times per packet, so
1783 * free any frame left over from a previous run */
1784 1102 av_frame_unref(frame);
1785
1786 /* get output buffer */
1787 1102 frame->nb_samples = MAX_SFRAMESIZE;
1788
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 1102 times.
1102 if ((res = ff_get_buffer(ctx, frame, 0)) < 0)
1789 return res;
1790 1102 frame->nb_samples = n_samples;
1791 1102 samples = (float *)frame->data[0];
1792
1793 /* Parse frames, optionally preceded by per-frame (independent) LSPs. */
1794
2/2
✓ Branch 0 taken 3306 times.
✓ Branch 1 taken 1102 times.
4408 for (n = 0; n < 3; n++) {
1795
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3306 times.
3306 if (!s->has_residual_lsps) {
1796 int m;
1797
1798 if (s->lsps == 10) {
1799 dequant_lsp10i(gb, lsps[n]);
1800 } else /* s->lsps == 16 */
1801 dequant_lsp16i(gb, lsps[n]);
1802
1803 for (m = 0; m < s->lsps; m++)
1804 lsps[n][m] += mean_lsf[m];
1805 stabilize_lsps(lsps[n], s->lsps);
1806 }
1807
1808
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3306 times.
4408 if ((res = synth_frame(ctx, gb, n,
1809 3306 &samples[n * MAX_FRAMESIZE],
1810 3306 lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
1811 3306 &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
1812
2/2
✓ Branch 0 taken 1102 times.
✓ Branch 1 taken 2204 times.
3306 &synth[s->lsps + n * MAX_FRAMESIZE]))) {
1813 *got_frame_ptr = 0;
1814 return res;
1815 }
1816 }
1817
1818 /* Statistics? FIXME - we don't check for length, a slight overrun
1819 * will be caught by internal buffer padding, and anything else
1820 * will be skipped, not read. */
1821
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 1102 times.
1102 if (get_bits1(gb)) {
1822 res = get_bits(gb, 4);
1823 skip_bits(gb, 10 * (res + 1));
1824 }
1825
1826
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 1102 times.
1102 if (get_bits_left(gb) < 0) {
1827 wmavoice_flush(ctx);
1828 return AVERROR_INVALIDDATA;
1829 }
1830
1831 1102 *got_frame_ptr = 1;
1832
1833 /* Update history */
1834 1102 memcpy(s->prev_lsps, lsps[2],
1835 1102 s->lsps * sizeof(*s->prev_lsps));
1836 1102 memcpy(s->synth_history, &synth[MAX_SFRAMESIZE],
1837 1102 s->lsps * sizeof(*synth));
1838 1102 memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
1839 1102 s->history_nsamples * sizeof(*excitation));
1840
1/2
✓ Branch 0 taken 1102 times.
✗ Branch 1 not taken.
1102 if (s->do_apf)
1841 1102 memmove(s->zero_exc_pf, &s->zero_exc_pf[MAX_SFRAMESIZE],
1842 1102 s->history_nsamples * sizeof(*s->zero_exc_pf));
1843
1844 1102 return 0;
1845 }
1846
1847 /**
1848 * Parse the packet header at the start of each packet (input data to this
1849 * decoder).
1850 *
1851 * @param s WMA Voice decoding context private data
1852 * @return <0 on error, nb_superframes on success.
1853 */
1854 186 static int parse_packet_header(WMAVoiceContext *s)
1855 {
1856 186 GetBitContext *gb = &s->gb;
1857 186 unsigned int res, n_superframes = 0;
1858
1859 186 skip_bits(gb, 4); // packet sequence number
1860 186 s->has_residual_lsps = get_bits1(gb);
1861 do {
1862
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 186 times.
186 if (get_bits_left(gb) < 6 + s->spillover_bitsize)
1863 return AVERROR_INVALIDDATA;
1864
1865 186 res = get_bits(gb, 6); // number of superframes per packet
1866 // (minus first one if there is spillover)
1867 186 n_superframes += res;
1868
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 186 times.
186 } while (res == 0x3F);
1869 186 s->spillover_nbits = get_bits(gb, s->spillover_bitsize);
1870
1871
1/2
✓ Branch 1 taken 186 times.
✗ Branch 2 not taken.
186 return get_bits_left(gb) >= 0 ? n_superframes : AVERROR_INVALIDDATA;
1872 }
1873
1874 /**
1875 * Copy (unaligned) bits from gb/data/size to pb.
1876 *
1877 * @param pb target buffer to copy bits into
1878 * @param data source buffer to copy bits from
1879 * @param size size of the source data, in bytes
1880 * @param gb bit I/O context specifying the current position in the source
1881 * data. This function might use this to align the bit position to
1882 * a whole-byte boundary before calling #ff_copy_bits() on aligned
1883 * source data
1884 * @param nbits the amount of bits to copy from source to target
1885 *
1886 * @note after calling this function, the current position in the input bit
1887 * I/O context is undefined.
1888 */
1889 370 static void copy_bits(PutBitContext *pb,
1890 const uint8_t *data, int size,
1891 GetBitContext *gb, int nbits)
1892 {
1893 int rmn_bytes, rmn_bits;
1894
1895 370 rmn_bits = rmn_bytes = get_bits_left(gb);
1896
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 370 times.
370 if (rmn_bits < nbits)
1897 return;
1898
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 370 times.
370 if (nbits > put_bits_left(pb))
1899 return;
1900 370 rmn_bits &= 7; rmn_bytes >>= 3;
1901
2/2
✓ Branch 0 taken 290 times.
✓ Branch 1 taken 80 times.
370 if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
1902 290 put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
1903 370 ff_copy_bits(pb, data + size - rmn_bytes,
1904 370 FFMIN(nbits - rmn_bits, rmn_bytes << 3));
1905 }
1906
1907 /**
1908 * Packet decoding: a packet is anything that the (ASF) demuxer contains,
1909 * and we expect that the demuxer / application provides it to us as such
1910 * (else you'll probably get garbage as output). Every packet has a size of
1911 * ctx->block_align bytes, starts with a packet header (see
1912 * #parse_packet_header()), and then a series of superframes. Superframe
1913 * boundaries may exceed packets, i.e. superframes can split data over
1914 * multiple (two) packets.
1915 *
1916 * For more information about frames, see #synth_superframe().
1917 */
1918 1291 static int wmavoice_decode_packet(AVCodecContext *ctx, AVFrame *frame,
1919 int *got_frame_ptr, AVPacket *avpkt)
1920 {
1921 1291 WMAVoiceContext *s = ctx->priv_data;
1922 1291 GetBitContext *gb = &s->gb;
1923 1291 const uint8_t *buf = avpkt->data;
1924 uint8_t dummy[1];
1925 int size, res, pos;
1926
1927 /* Packets are sometimes a multiple of ctx->block_align, with a packet
1928 * header at each ctx->block_align bytes. However, FFmpeg's ASF demuxer
1929 * feeds us ASF packets, which may concatenate multiple "codec" packets
1930 * in a single "muxer" packet, so we artificially emulate that by
1931 * capping the packet size at ctx->block_align. */
1932
2/2
✓ Branch 0 taken 180 times.
✓ Branch 1 taken 1291 times.
1471 for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
1933
2/2
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 1286 times.
1291 buf = size ? buf : dummy;
1934 1291 res = init_get_bits8(&s->gb, buf, size);
1935
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1291 times.
1291 if (res < 0)
1936 return res;
1937
1938 /* size == ctx->block_align is used to indicate whether we are dealing with
1939 * a new packet or a packet of which we already read the packet header
1940 * previously. */
1941
2/2
✓ Branch 0 taken 191 times.
✓ Branch 1 taken 1100 times.
1291 if (!(size % ctx->block_align)) { // new packet header
1942
2/2
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 186 times.
191 if (!size) {
1943 5 s->spillover_nbits = 0;
1944 5 s->nb_superframes = 0;
1945 } else {
1946
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 186 times.
186 if ((res = parse_packet_header(s)) < 0)
1947 return res;
1948 186 s->nb_superframes = res;
1949 }
1950
1951 /* If the packet header specifies a s->spillover_nbits, then we want
1952 * to push out all data of the previous packet (+ spillover) before
1953 * continuing to parse new superframes in the current packet. */
1954
2/2
✓ Branch 0 taken 185 times.
✓ Branch 1 taken 6 times.
191 if (s->sframe_cache_size > 0) {
1955 185 int cnt = get_bits_count(gb);
1956
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 185 times.
185 if (cnt + s->spillover_nbits > avpkt->size * 8) {
1957 s->spillover_nbits = avpkt->size * 8 - cnt;
1958 }
1959 185 copy_bits(&s->pb, buf, size, gb, s->spillover_nbits);
1960 185 flush_put_bits(&s->pb);
1961 185 s->sframe_cache_size += s->spillover_nbits;
1962
1/2
✓ Branch 1 taken 185 times.
✗ Branch 2 not taken.
185 if ((res = synth_superframe(ctx, frame, got_frame_ptr)) == 0 &&
1963
1/2
✓ Branch 0 taken 185 times.
✗ Branch 1 not taken.
185 *got_frame_ptr) {
1964 185 cnt += s->spillover_nbits;
1965 185 s->skip_bits_next = cnt & 7;
1966 185 res = cnt >> 3;
1967 185 return res;
1968 } else
1969 skip_bits_long (gb, s->spillover_nbits - cnt +
1970 get_bits_count(gb)); // resync
1971
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
6 } else if (s->spillover_nbits) {
1972 skip_bits_long(gb, s->spillover_nbits); // resync
1973 }
1974
2/2
✓ Branch 0 taken 971 times.
✓ Branch 1 taken 129 times.
1100 } else if (s->skip_bits_next)
1975 971 skip_bits(gb, s->skip_bits_next);
1976
1977 /* Try parsing superframes in current packet */
1978 1106 s->sframe_cache_size = 0;
1979 1106 s->skip_bits_next = 0;
1980 1106 pos = get_bits_left(gb);
1981
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 1102 times.
1106 if (s->nb_superframes-- == 0) {
1982 4 *got_frame_ptr = 0;
1983 4 return size;
1984
2/2
✓ Branch 0 taken 917 times.
✓ Branch 1 taken 185 times.
1102 } else if (s->nb_superframes > 0) {
1985
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 917 times.
917 if ((res = synth_superframe(ctx, frame, got_frame_ptr)) < 0) {
1986 return res;
1987
1/2
✓ Branch 0 taken 917 times.
✗ Branch 1 not taken.
917 } else if (*got_frame_ptr) {
1988 917 int cnt = get_bits_count(gb);
1989 917 s->skip_bits_next = cnt & 7;
1990 917 res = cnt >> 3;
1991 917 return res;
1992 }
1993
1/2
✓ Branch 0 taken 185 times.
✗ Branch 1 not taken.
185 } else if ((s->sframe_cache_size = pos) > 0) {
1994 /* ... cache it for spillover in next packet */
1995 185 init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE);
1996 185 copy_bits(&s->pb, buf, size, gb, s->sframe_cache_size);
1997 // FIXME bad - just copy bytes as a whole and use the
1998 // skip_bits_next field
1999 }
2000
2001 185 return size;
2002 }
2003
2004 8 static av_cold int wmavoice_decode_end(AVCodecContext *ctx)
2005 {
2006 8 WMAVoiceContext *s = ctx->priv_data;
2007
2008
1/2
✓ Branch 0 taken 8 times.
✗ Branch 1 not taken.
8 if (s->do_apf) {
2009 8 av_tx_uninit(&s->rdft);
2010 8 av_tx_uninit(&s->irdft);
2011 8 av_tx_uninit(&s->dct);
2012 8 av_tx_uninit(&s->dst);
2013 }
2014
2015 8 return 0;
2016 }
2017
2018 const FFCodec ff_wmavoice_decoder = {
2019 .p.name = "wmavoice",
2020 CODEC_LONG_NAME("Windows Media Audio Voice"),
2021 .p.type = AVMEDIA_TYPE_AUDIO,
2022 .p.id = AV_CODEC_ID_WMAVOICE,
2023 .priv_data_size = sizeof(WMAVoiceContext),
2024 .init = wmavoice_decode_init,
2025 .close = wmavoice_decode_end,
2026 FF_CODEC_DECODE_CB(wmavoice_decode_packet),
2027 .p.capabilities =
2028 #if FF_API_SUBFRAMES
2029 AV_CODEC_CAP_SUBFRAMES |
2030 #endif
2031 AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
2032 .caps_internal = FF_CODEC_CAP_INIT_CLEANUP,
2033 .flush = wmavoice_flush,
2034 };
2035