GCC Code Coverage Report
Directory: ../../../ffmpeg/ Exec Total Coverage
File: src/libavcodec/vp9dsp_template.c Lines: 1360 1391 97.8 %
Date: 2020-09-25 23:16:12 Branches: 326 337 96.7 %

Line Branch Exec Source
1
/*
2
 * VP9 compatible video decoder
3
 *
4
 * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
5
 * Copyright (C) 2013 Clément Bœsch <u pkh me>
6
 *
7
 * This file is part of FFmpeg.
8
 *
9
 * FFmpeg is free software; you can redistribute it and/or
10
 * modify it under the terms of the GNU Lesser General Public
11
 * License as published by the Free Software Foundation; either
12
 * version 2.1 of the License, or (at your option) any later version.
13
 *
14
 * FFmpeg is distributed in the hope that it will be useful,
15
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
 * Lesser General Public License for more details.
18
 *
19
 * You should have received a copy of the GNU Lesser General Public
20
 * License along with FFmpeg; if not, write to the Free Software
21
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
 */
23
24
#include "libavutil/common.h"
25
#include "bit_depth_template.c"
26
#include "vp9dsp.h"
27
28
#if BIT_DEPTH != 12
29
30
// FIXME see whether we can merge parts of this (perhaps at least 4x4 and 8x8)
31
// back with h264pred.[ch]
32
33
124261
static void vert_4x4_c(uint8_t *_dst, ptrdiff_t stride,
34
                       const uint8_t *left, const uint8_t *_top)
35
{
36
124261
    pixel *dst = (pixel *) _dst;
37
124261
    const pixel *top = (const pixel *) _top;
38
124261
    pixel4 p4 = AV_RN4PA(top);
39
40
124261
    stride /= sizeof(pixel);
41
124261
    AV_WN4PA(dst + stride * 0, p4);
42
124261
    AV_WN4PA(dst + stride * 1, p4);
43
124261
    AV_WN4PA(dst + stride * 2, p4);
44
124261
    AV_WN4PA(dst + stride * 3, p4);
45
124261
}
46
47
24839
static void vert_8x8_c(uint8_t *_dst, ptrdiff_t stride,
48
                       const uint8_t *left, const uint8_t *_top)
49
{
50
24839
    pixel *dst = (pixel *) _dst;
51
24839
    const pixel *top = (const pixel *) _top;
52
24839
    pixel4 p4a = AV_RN4PA(top + 0);
53
24839
    pixel4 p4b = AV_RN4PA(top + 4);
54
    int y;
55
56
24839
    stride /= sizeof(pixel);
57
223551
    for (y = 0; y < 8; y++) {
58
198712
        AV_WN4PA(dst + 0, p4a);
59
198712
        AV_WN4PA(dst + 4, p4b);
60
198712
        dst += stride;
61
    }
62
24839
}
63
64
5710
static void vert_16x16_c(uint8_t *_dst, ptrdiff_t stride,
65
                         const uint8_t *left, const uint8_t *_top)
66
{
67
5710
    pixel *dst = (pixel *) _dst;
68
5710
    const pixel *top = (const pixel *) _top;
69
5710
    pixel4 p4a = AV_RN4PA(top +  0);
70
5710
    pixel4 p4b = AV_RN4PA(top +  4);
71
5710
    pixel4 p4c = AV_RN4PA(top +  8);
72
5710
    pixel4 p4d = AV_RN4PA(top + 12);
73
    int y;
74
75
5710
    stride /= sizeof(pixel);
76
97070
    for (y = 0; y < 16; y++) {
77
91360
        AV_WN4PA(dst +  0, p4a);
78
91360
        AV_WN4PA(dst +  4, p4b);
79
91360
        AV_WN4PA(dst +  8, p4c);
80
91360
        AV_WN4PA(dst + 12, p4d);
81
91360
        dst += stride;
82
    }
83
5710
}
84
85
756
static void vert_32x32_c(uint8_t *_dst, ptrdiff_t stride,
86
                         const uint8_t *left, const uint8_t *_top)
87
{
88
756
    pixel *dst = (pixel *) _dst;
89
756
    const pixel *top = (const pixel *) _top;
90
756
    pixel4 p4a = AV_RN4PA(top +  0);
91
756
    pixel4 p4b = AV_RN4PA(top +  4);
92
756
    pixel4 p4c = AV_RN4PA(top +  8);
93
756
    pixel4 p4d = AV_RN4PA(top + 12);
94
756
    pixel4 p4e = AV_RN4PA(top + 16);
95
756
    pixel4 p4f = AV_RN4PA(top + 20);
96
756
    pixel4 p4g = AV_RN4PA(top + 24);
97
756
    pixel4 p4h = AV_RN4PA(top + 28);
98
    int y;
99
100
756
    stride /= sizeof(pixel);
101
24948
    for (y = 0; y < 32; y++) {
102
24192
        AV_WN4PA(dst +  0, p4a);
103
24192
        AV_WN4PA(dst +  4, p4b);
104
24192
        AV_WN4PA(dst +  8, p4c);
105
24192
        AV_WN4PA(dst + 12, p4d);
106
24192
        AV_WN4PA(dst + 16, p4e);
107
24192
        AV_WN4PA(dst + 20, p4f);
108
24192
        AV_WN4PA(dst + 24, p4g);
109
24192
        AV_WN4PA(dst + 28, p4h);
110
24192
        dst += stride;
111
    }
112
756
}
113
114
275802
static void hor_4x4_c(uint8_t *_dst, ptrdiff_t stride,
115
                      const uint8_t *_left, const uint8_t *top)
116
{
117
275802
    pixel *dst = (pixel *) _dst;
118
275802
    const pixel *left = (const pixel *) _left;
119
120
275802
    stride /= sizeof(pixel);
121
275802
    AV_WN4PA(dst + stride * 0, PIXEL_SPLAT_X4(left[3]));
122
275802
    AV_WN4PA(dst + stride * 1, PIXEL_SPLAT_X4(left[2]));
123
275802
    AV_WN4PA(dst + stride * 2, PIXEL_SPLAT_X4(left[1]));
124
275802
    AV_WN4PA(dst + stride * 3, PIXEL_SPLAT_X4(left[0]));
125
275802
}
126
127
71361
static void hor_8x8_c(uint8_t *_dst, ptrdiff_t stride,
128
                      const uint8_t *_left, const uint8_t *top)
129
{
130
71361
    pixel *dst = (pixel *) _dst;
131
71361
    const pixel *left = (const pixel *) _left;
132
    int y;
133
134
71361
    stride /= sizeof(pixel);
135
642249
    for (y = 0; y < 8; y++) {
136
570888
        pixel4 p4 = PIXEL_SPLAT_X4(left[7 - y]);
137
138
570888
        AV_WN4PA(dst + 0, p4);
139
570888
        AV_WN4PA(dst + 4, p4);
140
570888
        dst += stride;
141
    }
142
71361
}
143
144
12721
static void hor_16x16_c(uint8_t *_dst, ptrdiff_t stride,
145
                        const uint8_t *_left, const uint8_t *top)
146
{
147
12721
    pixel *dst = (pixel *) _dst;
148
12721
    const pixel *left = (const pixel *) _left;
149
    int y;
150
151
12721
    stride /= sizeof(pixel);
152
216257
    for (y = 0; y < 16; y++) {
153
203536
        pixel4 p4 = PIXEL_SPLAT_X4(left[15 - y]);
154
155
203536
        AV_WN4PA(dst +  0, p4);
156
203536
        AV_WN4PA(dst +  4, p4);
157
203536
        AV_WN4PA(dst +  8, p4);
158
203536
        AV_WN4PA(dst + 12, p4);
159
203536
        dst += stride;
160
    }
161
12721
}
162
163
965
static void hor_32x32_c(uint8_t *_dst, ptrdiff_t stride,
164
                        const uint8_t *_left, const uint8_t *top)
165
{
166
965
    pixel *dst = (pixel *) _dst;
167
965
    const pixel *left = (const pixel *) _left;
168
    int y;
169
170
965
    stride /= sizeof(pixel);
171
31845
    for (y = 0; y < 32; y++) {
172
30880
        pixel4 p4 = PIXEL_SPLAT_X4(left[31 - y]);
173
174
30880
        AV_WN4PA(dst +  0, p4);
175
30880
        AV_WN4PA(dst +  4, p4);
176
30880
        AV_WN4PA(dst +  8, p4);
177
30880
        AV_WN4PA(dst + 12, p4);
178
30880
        AV_WN4PA(dst + 16, p4);
179
30880
        AV_WN4PA(dst + 20, p4);
180
30880
        AV_WN4PA(dst + 24, p4);
181
30880
        AV_WN4PA(dst + 28, p4);
182
30880
        dst += stride;
183
    }
184
965
}
185
186
#endif /* BIT_DEPTH != 12 */
187
188
76973
static void tm_4x4_c(uint8_t *_dst, ptrdiff_t stride,
189
                     const uint8_t *_left, const uint8_t *_top)
190
{
191
76973
    pixel *dst = (pixel *) _dst;
192
76973
    const pixel *left = (const pixel *) _left;
193
76973
    const pixel *top = (const pixel *) _top;
194
76973
    int y, tl = top[-1];
195
196
76973
    stride /= sizeof(pixel);
197
384865
    for (y = 0; y < 4; y++) {
198
307892
        int l_m_tl = left[3 - y] - tl;
199
200
307892
        dst[0] = av_clip_pixel(top[0] + l_m_tl);
201
307892
        dst[1] = av_clip_pixel(top[1] + l_m_tl);
202
307892
        dst[2] = av_clip_pixel(top[2] + l_m_tl);
203
307892
        dst[3] = av_clip_pixel(top[3] + l_m_tl);
204
307892
        dst += stride;
205
    }
206
76973
}
207
208
19067
static void tm_8x8_c(uint8_t *_dst, ptrdiff_t stride,
209
                     const uint8_t *_left, const uint8_t *_top)
210
{
211
19067
    pixel *dst = (pixel *) _dst;
212
19067
    const pixel *left = (const pixel *) _left;
213
19067
    const pixel *top = (const pixel *) _top;
214
19067
    int y, tl = top[-1];
215
216
19067
    stride /= sizeof(pixel);
217
171603
    for (y = 0; y < 8; y++) {
218
152536
        int l_m_tl = left[7 - y] - tl;
219
220
152536
        dst[0] = av_clip_pixel(top[0] + l_m_tl);
221
152536
        dst[1] = av_clip_pixel(top[1] + l_m_tl);
222
152536
        dst[2] = av_clip_pixel(top[2] + l_m_tl);
223
152536
        dst[3] = av_clip_pixel(top[3] + l_m_tl);
224
152536
        dst[4] = av_clip_pixel(top[4] + l_m_tl);
225
152536
        dst[5] = av_clip_pixel(top[5] + l_m_tl);
226
152536
        dst[6] = av_clip_pixel(top[6] + l_m_tl);
227
152536
        dst[7] = av_clip_pixel(top[7] + l_m_tl);
228
152536
        dst += stride;
229
    }
230
19067
}
231
232
2367
static void tm_16x16_c(uint8_t *_dst, ptrdiff_t stride,
233
                       const uint8_t *_left, const uint8_t *_top)
234
{
235
2367
    pixel *dst = (pixel *) _dst;
236
2367
    const pixel *left = (const pixel *) _left;
237
2367
    const pixel *top = (const pixel *) _top;
238
2367
    int y, tl = top[-1];
239
240
2367
    stride /= sizeof(pixel);
241
40239
    for (y = 0; y < 16; y++) {
242
37872
        int l_m_tl = left[15 - y] - tl;
243
244
37872
        dst[ 0] = av_clip_pixel(top[ 0] + l_m_tl);
245
37872
        dst[ 1] = av_clip_pixel(top[ 1] + l_m_tl);
246
37872
        dst[ 2] = av_clip_pixel(top[ 2] + l_m_tl);
247
37872
        dst[ 3] = av_clip_pixel(top[ 3] + l_m_tl);
248
37872
        dst[ 4] = av_clip_pixel(top[ 4] + l_m_tl);
249
37872
        dst[ 5] = av_clip_pixel(top[ 5] + l_m_tl);
250
37872
        dst[ 6] = av_clip_pixel(top[ 6] + l_m_tl);
251
37872
        dst[ 7] = av_clip_pixel(top[ 7] + l_m_tl);
252
37872
        dst[ 8] = av_clip_pixel(top[ 8] + l_m_tl);
253
37872
        dst[ 9] = av_clip_pixel(top[ 9] + l_m_tl);
254
37872
        dst[10] = av_clip_pixel(top[10] + l_m_tl);
255
37872
        dst[11] = av_clip_pixel(top[11] + l_m_tl);
256
37872
        dst[12] = av_clip_pixel(top[12] + l_m_tl);
257
37872
        dst[13] = av_clip_pixel(top[13] + l_m_tl);
258
37872
        dst[14] = av_clip_pixel(top[14] + l_m_tl);
259
37872
        dst[15] = av_clip_pixel(top[15] + l_m_tl);
260
37872
        dst += stride;
261
    }
262
2367
}
263
264
184
static void tm_32x32_c(uint8_t *_dst, ptrdiff_t stride,
265
                       const uint8_t *_left, const uint8_t *_top)
266
{
267
184
    pixel *dst = (pixel *) _dst;
268
184
    const pixel *left = (const pixel *) _left;
269
184
    const pixel *top = (const pixel *) _top;
270
184
    int y, tl = top[-1];
271
272
184
    stride /= sizeof(pixel);
273
6072
    for (y = 0; y < 32; y++) {
274
5888
        int l_m_tl = left[31 - y] - tl;
275
276
5888
        dst[ 0] = av_clip_pixel(top[ 0] + l_m_tl);
277
5888
        dst[ 1] = av_clip_pixel(top[ 1] + l_m_tl);
278
5888
        dst[ 2] = av_clip_pixel(top[ 2] + l_m_tl);
279
5888
        dst[ 3] = av_clip_pixel(top[ 3] + l_m_tl);
280
5888
        dst[ 4] = av_clip_pixel(top[ 4] + l_m_tl);
281
5888
        dst[ 5] = av_clip_pixel(top[ 5] + l_m_tl);
282
5888
        dst[ 6] = av_clip_pixel(top[ 6] + l_m_tl);
283
5888
        dst[ 7] = av_clip_pixel(top[ 7] + l_m_tl);
284
5888
        dst[ 8] = av_clip_pixel(top[ 8] + l_m_tl);
285
5888
        dst[ 9] = av_clip_pixel(top[ 9] + l_m_tl);
286
5888
        dst[10] = av_clip_pixel(top[10] + l_m_tl);
287
5888
        dst[11] = av_clip_pixel(top[11] + l_m_tl);
288
5888
        dst[12] = av_clip_pixel(top[12] + l_m_tl);
289
5888
        dst[13] = av_clip_pixel(top[13] + l_m_tl);
290
5888
        dst[14] = av_clip_pixel(top[14] + l_m_tl);
291
5888
        dst[15] = av_clip_pixel(top[15] + l_m_tl);
292
5888
        dst[16] = av_clip_pixel(top[16] + l_m_tl);
293
5888
        dst[17] = av_clip_pixel(top[17] + l_m_tl);
294
5888
        dst[18] = av_clip_pixel(top[18] + l_m_tl);
295
5888
        dst[19] = av_clip_pixel(top[19] + l_m_tl);
296
5888
        dst[20] = av_clip_pixel(top[20] + l_m_tl);
297
5888
        dst[21] = av_clip_pixel(top[21] + l_m_tl);
298
5888
        dst[22] = av_clip_pixel(top[22] + l_m_tl);
299
5888
        dst[23] = av_clip_pixel(top[23] + l_m_tl);
300
5888
        dst[24] = av_clip_pixel(top[24] + l_m_tl);
301
5888
        dst[25] = av_clip_pixel(top[25] + l_m_tl);
302
5888
        dst[26] = av_clip_pixel(top[26] + l_m_tl);
303
5888
        dst[27] = av_clip_pixel(top[27] + l_m_tl);
304
5888
        dst[28] = av_clip_pixel(top[28] + l_m_tl);
305
5888
        dst[29] = av_clip_pixel(top[29] + l_m_tl);
306
5888
        dst[30] = av_clip_pixel(top[30] + l_m_tl);
307
5888
        dst[31] = av_clip_pixel(top[31] + l_m_tl);
308
5888
        dst += stride;
309
    }
310
184
}
311
312
#if BIT_DEPTH != 12
313
314
328506
static void dc_4x4_c(uint8_t *_dst, ptrdiff_t stride,
315
                     const uint8_t *_left, const uint8_t *_top)
316
{
317
328506
    pixel *dst = (pixel *) _dst;
318
328506
    const pixel *left = (const pixel *) _left;
319
328506
    const pixel *top = (const pixel *) _top;
320
328506
    pixel4 dc = PIXEL_SPLAT_X4((left[0] + left[1] + left[2] + left[3] +
321
                                top[0] + top[1] + top[2] + top[3] + 4) >> 3);
322
323
328506
    stride /= sizeof(pixel);
324
328506
    AV_WN4PA(dst + stride * 0, dc);
325
328506
    AV_WN4PA(dst + stride * 1, dc);
326
328506
    AV_WN4PA(dst + stride * 2, dc);
327
328506
    AV_WN4PA(dst + stride * 3, dc);
328
328506
}
329
330
131949
static void dc_8x8_c(uint8_t *_dst, ptrdiff_t stride,
331
                     const uint8_t *_left, const uint8_t *_top)
332
{
333
131949
    pixel *dst = (pixel *) _dst;
334
131949
    const pixel *left = (const pixel *) _left;
335
131949
    const pixel *top = (const pixel *) _top;
336
131949
    pixel4 dc = PIXEL_SPLAT_X4
337
        ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] +
338
          left[6] + left[7] + top[0] + top[1] + top[2] + top[3] +
339
          top[4] + top[5] + top[6] + top[7] + 8) >> 4);
340
    int y;
341
342
131949
    stride /= sizeof(pixel);
343
1187541
    for (y = 0; y < 8; y++) {
344
1055592
        AV_WN4PA(dst + 0, dc);
345
1055592
        AV_WN4PA(dst + 4, dc);
346
1055592
        dst += stride;
347
    }
348
131949
}
349
350
17317
static void dc_16x16_c(uint8_t *_dst, ptrdiff_t stride,
351
                       const uint8_t *_left, const uint8_t *_top)
352
{
353
17317
    pixel *dst = (pixel *) _dst;
354
17317
    const pixel *left = (const pixel *) _left;
355
17317
    const pixel *top = (const pixel *) _top;
356
17317
    pixel4 dc = PIXEL_SPLAT_X4
357
        ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] + left[6] +
358
          left[7] + left[8] + left[9] + left[10] + left[11] + left[12] +
359
          left[13] + left[14] + left[15] + top[0] + top[1] + top[2] + top[3] +
360
          top[4] + top[5] + top[6] + top[7] + top[8] + top[9] + top[10] +
361
          top[11] + top[12] + top[13] + top[14] + top[15] + 16) >> 5);
362
    int y;
363
364
17317
    stride /= sizeof(pixel);
365
294389
    for (y = 0; y < 16; y++) {
366
277072
        AV_WN4PA(dst +  0, dc);
367
277072
        AV_WN4PA(dst +  4, dc);
368
277072
        AV_WN4PA(dst +  8, dc);
369
277072
        AV_WN4PA(dst + 12, dc);
370
277072
        dst += stride;
371
    }
372
17317
}
373
374
4572
static void dc_32x32_c(uint8_t *_dst, ptrdiff_t stride,
375
                       const uint8_t *_left, const uint8_t *_top)
376
{
377
4572
    pixel *dst = (pixel *) _dst;
378
4572
    const pixel *left = (const pixel *) _left;
379
4572
    const pixel *top = (const pixel *) _top;
380
4572
    pixel4 dc = PIXEL_SPLAT_X4
381
        ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] + left[6] +
382
          left[7] + left[8] + left[9] + left[10] + left[11] + left[12] +
383
          left[13] + left[14] + left[15] + left[16] + left[17] + left[18] +
384
          left[19] + left[20] + left[21] + left[22] + left[23] + left[24] +
385
          left[25] + left[26] + left[27] + left[28] + left[29] + left[30] +
386
          left[31] + top[0] + top[1] + top[2] + top[3] + top[4] + top[5] +
387
          top[6] + top[7] + top[8] + top[9] + top[10] + top[11] + top[12] +
388
          top[13] + top[14] + top[15] + top[16] + top[17] + top[18] + top[19] +
389
          top[20] + top[21] + top[22] + top[23] + top[24] + top[25] + top[26] +
390
          top[27] + top[28] + top[29] + top[30] + top[31] + 32) >> 6);
391
    int y;
392
393
4572
    stride /= sizeof(pixel);
394
150876
    for (y = 0; y < 32; y++) {
395
146304
        AV_WN4PA(dst +  0, dc);
396
146304
        AV_WN4PA(dst +  4, dc);
397
146304
        AV_WN4PA(dst +  8, dc);
398
146304
        AV_WN4PA(dst + 12, dc);
399
146304
        AV_WN4PA(dst + 16, dc);
400
146304
        AV_WN4PA(dst + 20, dc);
401
146304
        AV_WN4PA(dst + 24, dc);
402
146304
        AV_WN4PA(dst + 28, dc);
403
146304
        dst += stride;
404
    }
405
4572
}
406
407
9050
static void dc_left_4x4_c(uint8_t *_dst, ptrdiff_t stride,
408
                          const uint8_t *_left, const uint8_t *top)
409
{
410
9050
    pixel *dst = (pixel *) _dst;
411
9050
    const pixel *left = (const pixel *) _left;
412
9050
    pixel4 dc = PIXEL_SPLAT_X4((left[0] + left[1] + left[2] + left[3] + 2) >> 2);
413
414
9050
    stride /= sizeof(pixel);
415
9050
    AV_WN4PA(dst + stride * 0, dc);
416
9050
    AV_WN4PA(dst + stride * 1, dc);
417
9050
    AV_WN4PA(dst + stride * 2, dc);
418
9050
    AV_WN4PA(dst + stride * 3, dc);
419
9050
}
420
421
3251
static void dc_left_8x8_c(uint8_t *_dst, ptrdiff_t stride,
422
                          const uint8_t *_left, const uint8_t *top)
423
{
424
3251
    pixel *dst = (pixel *) _dst;
425
3251
    const pixel *left = (const pixel *) _left;
426
3251
    pixel4 dc = PIXEL_SPLAT_X4
427
        ((left[0] + left[1] + left[2] + left[3] +
428
          left[4] + left[5] + left[6] + left[7] + 4) >> 3);
429
    int y;
430
431
3251
    stride /= sizeof(pixel);
432
29259
    for (y = 0; y < 8; y++) {
433
26008
        AV_WN4PA(dst + 0, dc);
434
26008
        AV_WN4PA(dst + 4, dc);
435
26008
        dst += stride;
436
    }
437
3251
}
438
439
1050
static void dc_left_16x16_c(uint8_t *_dst, ptrdiff_t stride,
440
                            const uint8_t *_left, const uint8_t *top)
441
{
442
1050
    pixel *dst = (pixel *) _dst;
443
1050
    const pixel *left = (const pixel *) _left;
444
1050
    pixel4 dc = PIXEL_SPLAT_X4
445
        ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] +
446
          left[6] + left[7] + left[8] + left[9] + left[10] + left[11] +
447
          left[12] + left[13] + left[14] + left[15] + 8) >> 4);
448
    int y;
449
450
1050
    stride /= sizeof(pixel);
451
17850
    for (y = 0; y < 16; y++) {
452
16800
        AV_WN4PA(dst +  0, dc);
453
16800
        AV_WN4PA(dst +  4, dc);
454
16800
        AV_WN4PA(dst +  8, dc);
455
16800
        AV_WN4PA(dst + 12, dc);
456
16800
        dst += stride;
457
    }
458
1050
}
459
460
1219
static void dc_left_32x32_c(uint8_t *_dst, ptrdiff_t stride,
461
                            const uint8_t *_left, const uint8_t *top)
462
{
463
1219
    pixel *dst = (pixel *) _dst;
464
1219
    const pixel *left = (const pixel *) _left;
465
1219
    pixel4 dc = PIXEL_SPLAT_X4
466
        ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] +
467
          left[6] + left[7] + left[8] + left[9] + left[10] + left[11] +
468
          left[12] + left[13] + left[14] + left[15] + left[16] + left[17] +
469
          left[18] + left[19] + left[20] + left[21] + left[22] + left[23] +
470
          left[24] + left[25] + left[26] + left[27] + left[28] + left[29] +
471
          left[30] + left[31] + 16) >> 5);
472
    int y;
473
474
1219
    stride /= sizeof(pixel);
475
40227
    for (y = 0; y < 32; y++) {
476
39008
        AV_WN4PA(dst +  0, dc);
477
39008
        AV_WN4PA(dst +  4, dc);
478
39008
        AV_WN4PA(dst +  8, dc);
479
39008
        AV_WN4PA(dst + 12, dc);
480
39008
        AV_WN4PA(dst + 16, dc);
481
39008
        AV_WN4PA(dst + 20, dc);
482
39008
        AV_WN4PA(dst + 24, dc);
483
39008
        AV_WN4PA(dst + 28, dc);
484
39008
        dst += stride;
485
    }
486
1219
}
487
488
10302
static void dc_top_4x4_c(uint8_t *_dst, ptrdiff_t stride,
489
                         const uint8_t *left, const uint8_t *_top)
490
{
491
10302
    pixel *dst = (pixel *) _dst;
492
10302
    const pixel *top = (const pixel *) _top;
493
10302
    pixel4 dc = PIXEL_SPLAT_X4((top[0] + top[1] + top[2] + top[3] + 2) >> 2);
494
495
10302
    stride /= sizeof(pixel);
496
10302
    AV_WN4PA(dst + stride * 0, dc);
497
10302
    AV_WN4PA(dst + stride * 1, dc);
498
10302
    AV_WN4PA(dst + stride * 2, dc);
499
10302
    AV_WN4PA(dst + stride * 3, dc);
500
10302
}
501
502
6789
static void dc_top_8x8_c(uint8_t *_dst, ptrdiff_t stride,
503
                         const uint8_t *left, const uint8_t *_top)
504
{
505
6789
    pixel *dst = (pixel *) _dst;
506
6789
    const pixel *top = (const pixel *) _top;
507
6789
    pixel4 dc = PIXEL_SPLAT_X4
508
        ((top[0] + top[1] + top[2] + top[3] +
509
          top[4] + top[5] + top[6] + top[7] + 4) >> 3);
510
    int y;
511
512
6789
    stride /= sizeof(pixel);
513
61101
    for (y = 0; y < 8; y++) {
514
54312
        AV_WN4PA(dst + 0, dc);
515
54312
        AV_WN4PA(dst + 4, dc);
516
54312
        dst += stride;
517
    }
518
6789
}
519
520
2291
static void dc_top_16x16_c(uint8_t *_dst, ptrdiff_t stride,
521
                           const uint8_t *left, const uint8_t *_top)
522
{
523
2291
    pixel *dst = (pixel *) _dst;
524
2291
    const pixel *top = (const pixel *) _top;
525
2291
    pixel4 dc = PIXEL_SPLAT_X4
526
        ((top[0] + top[1] + top[2] + top[3] + top[4] + top[5] +
527
          top[6] + top[7] + top[8] + top[9] + top[10] + top[11] +
528
          top[12] + top[13] + top[14] + top[15] + 8) >> 4);
529
    int y;
530
531
2291
    stride /= sizeof(pixel);
532
38947
    for (y = 0; y < 16; y++) {
533
36656
        AV_WN4PA(dst +  0, dc);
534
36656
        AV_WN4PA(dst +  4, dc);
535
36656
        AV_WN4PA(dst +  8, dc);
536
36656
        AV_WN4PA(dst + 12, dc);
537
36656
        dst += stride;
538
    }
539
2291
}
540
541
547
static void dc_top_32x32_c(uint8_t *_dst, ptrdiff_t stride,
542
                           const uint8_t *left, const uint8_t *_top)
543
{
544
547
    pixel *dst = (pixel *) _dst;
545
547
    const pixel *top = (const pixel *) _top;
546
547
    pixel4 dc = PIXEL_SPLAT_X4
547
        ((top[0] + top[1] + top[2] + top[3] + top[4] + top[5] +
548
          top[6] + top[7] + top[8] + top[9] + top[10] + top[11] +
549
          top[12] + top[13] + top[14] + top[15] + top[16] + top[17] +
550
          top[18] + top[19] + top[20] + top[21] + top[22] + top[23] +
551
          top[24] + top[25] + top[26] + top[27] + top[28] + top[29] +
552
          top[30] + top[31] + 16) >> 5);
553
    int y;
554
555
547
    stride /= sizeof(pixel);
556
18051
    for (y = 0; y < 32; y++) {
557
17504
        AV_WN4PA(dst +  0, dc);
558
17504
        AV_WN4PA(dst +  4, dc);
559
17504
        AV_WN4PA(dst +  8, dc);
560
17504
        AV_WN4PA(dst + 12, dc);
561
17504
        AV_WN4PA(dst + 16, dc);
562
17504
        AV_WN4PA(dst + 20, dc);
563
17504
        AV_WN4PA(dst + 24, dc);
564
17504
        AV_WN4PA(dst + 28, dc);
565
17504
        dst += stride;
566
    }
567
547
}
568
569
#endif /* BIT_DEPTH != 12 */
570
571
378
static void dc_128_4x4_c(uint8_t *_dst, ptrdiff_t stride,
572
                         const uint8_t *left, const uint8_t *top)
573
{
574
378
    pixel *dst = (pixel *) _dst;
575
378
    pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
576
577
378
    stride /= sizeof(pixel);
578
378
    AV_WN4PA(dst + stride * 0, val);
579
378
    AV_WN4PA(dst + stride * 1, val);
580
378
    AV_WN4PA(dst + stride * 2, val);
581
378
    AV_WN4PA(dst + stride * 3, val);
582
378
}
583
584
220
static void dc_128_8x8_c(uint8_t *_dst, ptrdiff_t stride,
585
                         const uint8_t *left, const uint8_t *top)
586
{
587
220
    pixel *dst = (pixel *) _dst;
588
220
    pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
589
    int y;
590
591
220
    stride /= sizeof(pixel);
592
1980
    for (y = 0; y < 8; y++) {
593
1760
        AV_WN4PA(dst + 0, val);
594
1760
        AV_WN4PA(dst + 4, val);
595
1760
        dst += stride;
596
    }
597
220
}
598
599
118
static void dc_128_16x16_c(uint8_t *_dst, ptrdiff_t stride,
600
                           const uint8_t *left, const uint8_t *top)
601
{
602
118
    pixel *dst = (pixel *) _dst;
603
118
    pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
604
    int y;
605
606
118
    stride /= sizeof(pixel);
607
2006
    for (y = 0; y < 16; y++) {
608
1888
        AV_WN4PA(dst +  0, val);
609
1888
        AV_WN4PA(dst +  4, val);
610
1888
        AV_WN4PA(dst +  8, val);
611
1888
        AV_WN4PA(dst + 12, val);
612
1888
        dst += stride;
613
    }
614
118
}
615
616
85
static void dc_128_32x32_c(uint8_t *_dst, ptrdiff_t stride,
617
                           const uint8_t *left, const uint8_t *top)
618
{
619
85
    pixel *dst = (pixel *) _dst;
620
85
    pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
621
    int y;
622
623
85
    stride /= sizeof(pixel);
624
2805
    for (y = 0; y < 32; y++) {
625
2720
        AV_WN4PA(dst +  0, val);
626
2720
        AV_WN4PA(dst +  4, val);
627
2720
        AV_WN4PA(dst +  8, val);
628
2720
        AV_WN4PA(dst + 12, val);
629
2720
        AV_WN4PA(dst + 16, val);
630
2720
        AV_WN4PA(dst + 20, val);
631
2720
        AV_WN4PA(dst + 24, val);
632
2720
        AV_WN4PA(dst + 28, val);
633
2720
        dst += stride;
634
    }
635
85
}
636
637
2482
static void dc_127_4x4_c(uint8_t *_dst, ptrdiff_t stride,
638
                         const uint8_t *left, const uint8_t *top)
639
{
640
2482
    pixel *dst = (pixel *) _dst;
641
2482
    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
642
643
2482
    stride /= sizeof(pixel);
644
2482
    AV_WN4PA(dst + stride * 0, val);
645
2482
    AV_WN4PA(dst + stride * 1, val);
646
2482
    AV_WN4PA(dst + stride * 2, val);
647
2482
    AV_WN4PA(dst + stride * 3, val);}
648
649
328
static void dc_127_8x8_c(uint8_t *_dst, ptrdiff_t stride,
650
                         const uint8_t *left, const uint8_t *top)
651
{
652
328
    pixel *dst = (pixel *) _dst;
653
328
    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
654
    int y;
655
656
328
    stride /= sizeof(pixel);
657
2952
    for (y = 0; y < 8; y++) {
658
2624
        AV_WN4PA(dst + 0, val);
659
2624
        AV_WN4PA(dst + 4, val);
660
2624
        dst += stride;
661
    }
662
328
}
663
664
154
static void dc_127_16x16_c(uint8_t *_dst, ptrdiff_t stride,
665
                           const uint8_t *left, const uint8_t *top)
666
{
667
154
    pixel *dst = (pixel *) _dst;
668
154
    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
669
    int y;
670
671
154
    stride /= sizeof(pixel);
672
2618
    for (y = 0; y < 16; y++) {
673
2464
        AV_WN4PA(dst +  0, val);
674
2464
        AV_WN4PA(dst +  4, val);
675
2464
        AV_WN4PA(dst +  8, val);
676
2464
        AV_WN4PA(dst + 12, val);
677
2464
        dst += stride;
678
    }
679
154
}
680
681
83
static void dc_127_32x32_c(uint8_t *_dst, ptrdiff_t stride,
682
                           const uint8_t *left, const uint8_t *top)
683
{
684
83
    pixel *dst = (pixel *) _dst;
685
83
    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
686
    int y;
687
688
83
    stride /= sizeof(pixel);
689
2739
    for (y = 0; y < 32; y++) {
690
2656
        AV_WN4PA(dst +  0, val);
691
2656
        AV_WN4PA(dst +  4, val);
692
2656
        AV_WN4PA(dst +  8, val);
693
2656
        AV_WN4PA(dst + 12, val);
694
2656
        AV_WN4PA(dst + 16, val);
695
2656
        AV_WN4PA(dst + 20, val);
696
2656
        AV_WN4PA(dst + 24, val);
697
2656
        AV_WN4PA(dst + 28, val);
698
2656
        dst += stride;
699
    }
700
83
}
701
702
2184
static void dc_129_4x4_c(uint8_t *_dst, ptrdiff_t stride,
703
                         const uint8_t *left, const uint8_t *top)
704
{
705
2184
    pixel *dst = (pixel *) _dst;
706
2184
    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
707
708
2184
    stride /= sizeof(pixel);
709
2184
    AV_WN4PA(dst + stride * 0, val);
710
2184
    AV_WN4PA(dst + stride * 1, val);
711
2184
    AV_WN4PA(dst + stride * 2, val);
712
2184
    AV_WN4PA(dst + stride * 3, val);
713
2184
}
714
715
282
static void dc_129_8x8_c(uint8_t *_dst, ptrdiff_t stride,
716
                         const uint8_t *left, const uint8_t *top)
717
{
718
282
    pixel *dst = (pixel *) _dst;
719
282
    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
720
    int y;
721
722
282
    stride /= sizeof(pixel);
723
2538
    for (y = 0; y < 8; y++) {
724
2256
        AV_WN4PA(dst + 0, val);
725
2256
        AV_WN4PA(dst + 4, val);
726
2256
        dst += stride;
727
    }
728
282
}
729
730
250
static void dc_129_16x16_c(uint8_t *_dst, ptrdiff_t stride,
731
                           const uint8_t *left, const uint8_t *top)
732
{
733
250
    pixel *dst = (pixel *) _dst;
734
250
    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
735
    int y;
736
737
250
    stride /= sizeof(pixel);
738
4250
    for (y = 0; y < 16; y++) {
739
4000
        AV_WN4PA(dst +  0, val);
740
4000
        AV_WN4PA(dst +  4, val);
741
4000
        AV_WN4PA(dst +  8, val);
742
4000
        AV_WN4PA(dst + 12, val);
743
4000
        dst += stride;
744
    }
745
250
}
746
747
80
static void dc_129_32x32_c(uint8_t *_dst, ptrdiff_t stride,
748
                           const uint8_t *left, const uint8_t *top)
749
{
750
80
    pixel *dst = (pixel *) _dst;
751
80
    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
752
    int y;
753
754
80
    stride /= sizeof(pixel);
755
2640
    for (y = 0; y < 32; y++) {
756
2560
        AV_WN4PA(dst +  0, val);
757
2560
        AV_WN4PA(dst +  4, val);
758
2560
        AV_WN4PA(dst +  8, val);
759
2560
        AV_WN4PA(dst + 12, val);
760
2560
        AV_WN4PA(dst + 16, val);
761
2560
        AV_WN4PA(dst + 20, val);
762
2560
        AV_WN4PA(dst + 24, val);
763
2560
        AV_WN4PA(dst + 28, val);
764
2560
        dst += stride;
765
    }
766
80
}
767
768
#if BIT_DEPTH != 12
769
770
#if BIT_DEPTH == 8
#define memset_bpc memset
#else
/* For pixel formats wider than 8 bits, memset() would fill byte-by-byte,
 * so fill the uint16_t pixels one word at a time instead. */
static inline void memset_bpc(uint16_t *dst, int val, int len) {
    int n;
    for (n = 0; n < len; n++) {
        dst[n] = val;
    }
}
#endif
780
781
#define DST(x, y) dst[(x) + (y) * stride]
782
783
/* Down-left diagonal prediction, 4x4: each output pixel is a (1,2,1)/4
 * smoothed tap of three consecutive top-edge pixels; anti-diagonals share
 * one value, which is why several DST() cells are chained per statement.
 * The order of the unrolled writes matters only for readability — every
 * cell is written exactly once. */
static void diag_downleft_4x4_c(uint8_t *_dst, ptrdiff_t stride,
                                const uint8_t *left, const uint8_t *_top)
{
    pixel *dst = (pixel *) _dst;
    const pixel *top = (const pixel *) _top;
    /* a0..a7: the 8 pixels above and above-right of the block */
    int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
        a4 = top[4], a5 = top[5], a6 = top[6], a7 = top[7];

    stride /= sizeof(pixel);  /* byte stride -> pixel stride */
    DST(0,0) = (a0 + a1 * 2 + a2 + 2) >> 2;
    DST(1,0) = DST(0,1) = (a1 + a2 * 2 + a3 + 2) >> 2;
    DST(2,0) = DST(1,1) = DST(0,2) = (a2 + a3 * 2 + a4 + 2) >> 2;
    DST(3,0) = DST(2,1) = DST(1,2) = DST(0,3) = (a3 + a4 * 2 + a5 + 2) >> 2;
    DST(3,1) = DST(2,2) = DST(1,3) = (a4 + a5 * 2 + a6 + 2) >> 2;
    DST(3,2) = DST(2,3) = (a5 + a6 * 2 + a7 + 2) >> 2;
    DST(3,3) = a7;  // note: this is different from vp8 and such
}
800
801
/* Generates diag_downleft_{size}x{size}_c for the larger block sizes.
 * A (1,2,1)/4-smoothed vector v[] is computed from the top edge once;
 * row j then copies v shifted by j pixels and pads the remaining j+1
 * positions with the last top pixel (memset_bpc handles >8-bit pixels). */
#define def_diag_downleft(size) \
static void diag_downleft_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
                                              const uint8_t *left, const uint8_t *_top) \
{ \
    pixel *dst = (pixel *) _dst; \
    const pixel *top = (const pixel *) _top; \
    int i, j; \
    pixel v[size - 1]; \
\
    stride /= sizeof(pixel); \
    for (i = 0; i < size - 2; i++) \
        v[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2; \
    v[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2; \
\
    for (j = 0; j < size; j++) { \
        memcpy(dst + j*stride, v + j, (size - 1 - j) * sizeof(pixel)); \
        memset_bpc(dst + j*stride + size - 1 - j, top[size - 1], j + 1); \
    } \
}

def_diag_downleft(8)
def_diag_downleft(16)
def_diag_downleft(32)
824
825
56436
static void diag_downright_4x4_c(uint8_t *_dst, ptrdiff_t stride,
826
                                 const uint8_t *_left, const uint8_t *_top)
827
{
828
56436
    pixel *dst = (pixel *) _dst;
829
56436
    const pixel *top = (const pixel *) _top;
830
56436
    const pixel *left = (const pixel *) _left;
831
56436
    int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
832
56436
        l0 = left[3], l1 = left[2], l2 = left[1], l3 = left[0];
833
834
56436
    stride /= sizeof(pixel);
835
56436
    DST(0,3) = (l1 + l2 * 2 + l3 + 2) >> 2;
836
56436
    DST(0,2) = DST(1,3) = (l0 + l1 * 2 + l2 + 2) >> 2;
837
56436
    DST(0,1) = DST(1,2) = DST(2,3) = (tl + l0 * 2 + l1 + 2) >> 2;
838
56436
    DST(0,0) = DST(1,1) = DST(2,2) = DST(3,3) = (l0 + tl * 2 + a0 + 2) >> 2;
839
56436
    DST(1,0) = DST(2,1) = DST(3,2) = (tl + a0 * 2 + a1 + 2) >> 2;
840
56436
    DST(2,0) = DST(3,1) = (a0 + a1 * 2 + a2 + 2) >> 2;
841
56436
    DST(3,0) = (a1 + a2 * 2 + a3 + 2) >> 2;
842
56436
}
843
844
/* Generic diagonal-down-right predictor for size x size blocks (8/16/32).
 * v[] holds the (1,2,1)-filtered combined edge (left column bottom-up,
 * the corner, then the top row); each output row copies a size-wide
 * window of v[] shifted one position per row. */
#define def_diag_downright(size) \
static void diag_downright_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
                                               const uint8_t *_left, const uint8_t *_top) \
{ \
    pixel *dst = (pixel *) _dst; \
    const pixel *top = (const pixel *) _top; \
    const pixel *left = (const pixel *) _left; \
    int i, j; \
    pixel v[size + size - 1]; \
\
    stride /= sizeof(pixel); \
    for (i = 0; i < size - 2; i++) { \
        v[i           ] = (left[i] + left[i + 1] * 2 + left[i + 2] + 2) >> 2; \
        v[size + 1 + i] = (top[i]  + top[i + 1]  * 2 + top[i + 2]  + 2) >> 2; \
    } \
    v[size - 2] = (left[size - 2] + left[size - 1] * 2 + top[-1] + 2) >> 2; \
    v[size - 1] = (left[size - 1] + top[-1] * 2 + top[ 0] + 2) >> 2; \
    v[size    ] = (top[-1] + top[0]  * 2 + top[ 1] + 2) >> 2; \
\
    for (j = 0; j < size; j++) \
        memcpy(dst + j*stride, v + size - 1 - j, size * sizeof(pixel)); \
}

def_diag_downright(8)
def_diag_downright(16)
def_diag_downright(32)
870
871
43285
static void vert_right_4x4_c(uint8_t *_dst, ptrdiff_t stride,
872
                             const uint8_t *_left, const uint8_t *_top)
873
{
874
43285
    pixel *dst = (pixel *) _dst;
875
43285
    const pixel *top = (const pixel *) _top;
876
43285
    const pixel *left = (const pixel *) _left;
877
43285
    int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
878
43285
        l0 = left[3], l1 = left[2], l2 = left[1];
879
880
43285
    stride /= sizeof(pixel);
881
43285
    DST(0,3) = (l0 + l1 * 2 + l2 + 2) >> 2;
882
43285
    DST(0,2) = (tl + l0 * 2 + l1 + 2) >> 2;
883
43285
    DST(0,0) = DST(1,2) = (tl + a0 + 1) >> 1;
884
43285
    DST(0,1) = DST(1,3) = (l0 + tl * 2 + a0 + 2) >> 2;
885
43285
    DST(1,0) = DST(2,2) = (a0 + a1 + 1) >> 1;
886
43285
    DST(1,1) = DST(2,3) = (tl + a0 * 2 + a1 + 2) >> 2;
887
43285
    DST(2,0) = DST(3,2) = (a1 + a2 + 1) >> 1;
888
43285
    DST(2,1) = DST(3,3) = (a0 + a1 * 2 + a2 + 2) >> 2;
889
43285
    DST(3,0) = (a2 + a3 + 1) >> 1;
890
43285
    DST(3,1) = (a1 + a2 * 2 + a3 + 2) >> 2;
891
43285
}
892
893
/* Generic vertical-right predictor for size x size blocks (8/16/32).
 * ve[] holds the 2-tap averages used on even rows, vo[] the 3-tap values
 * used on odd rows; every pair of rows copies a size-wide window shifted
 * one position further into the left-edge part of the tables. */
#define def_vert_right(size) \
static void vert_right_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
                                           const uint8_t *_left, const uint8_t *_top) \
{ \
    pixel *dst = (pixel *) _dst; \
    const pixel *top = (const pixel *) _top; \
    const pixel *left = (const pixel *) _left; \
    int i, j; \
    pixel ve[size + size/2 - 1], vo[size + size/2 - 1]; \
\
    stride /= sizeof(pixel); \
    for (i = 0; i < size/2 - 2; i++) { \
        vo[i] = (left[i*2 + 3] + left[i*2 + 2] * 2 + left[i*2 + 1] + 2) >> 2; \
        ve[i] = (left[i*2 + 4] + left[i*2 + 3] * 2 + left[i*2 + 2] + 2) >> 2; \
    } \
    vo[size/2 - 2] = (left[size - 1] + left[size - 2] * 2 + left[size - 3] + 2) >> 2; \
    ve[size/2 - 2] = (top[-1] + left[size - 1] * 2 + left[size - 2] + 2) >> 2; \
\
    ve[size/2 - 1] = (top[-1] + top[0] + 1) >> 1; \
    vo[size/2 - 1] = (left[size - 1] + top[-1] * 2 + top[0] + 2) >> 2; \
    for (i = 0; i < size - 1; i++) { \
        ve[size/2 + i] = (top[i] + top[i + 1] + 1) >> 1; \
        vo[size/2 + i] = (top[i - 1] + top[i] * 2 + top[i + 1] + 2) >> 2; \
    } \
\
    for (j = 0; j < size / 2; j++) { \
        memcpy(dst +  j*2     *stride, ve + size/2 - 1 - j, size * sizeof(pixel)); \
        memcpy(dst + (j*2 + 1)*stride, vo + size/2 - 1 - j, size * sizeof(pixel)); \
    } \
}

def_vert_right(8)
def_vert_right(16)
def_vert_right(32)
927
928
47033
static void hor_down_4x4_c(uint8_t *_dst, ptrdiff_t stride,
929
                           const uint8_t *_left, const uint8_t *_top)
930
{
931
47033
    pixel *dst = (pixel *) _dst;
932
47033
    const pixel *top = (const pixel *) _top;
933
47033
    const pixel *left = (const pixel *) _left;
934
47033
    int l0 = left[3], l1 = left[2], l2 = left[1], l3 = left[0],
935
47033
        tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2];
936
937
47033
    stride /= sizeof(pixel);
938
47033
    DST(2,0) = (tl + a0 * 2 + a1 + 2) >> 2;
939
47033
    DST(3,0) = (a0 + a1 * 2 + a2 + 2) >> 2;
940
47033
    DST(0,0) = DST(2,1) = (tl + l0 + 1) >> 1;
941
47033
    DST(1,0) = DST(3,1) = (a0 + tl * 2 + l0 + 2) >> 2;
942
47033
    DST(0,1) = DST(2,2) = (l0 + l1 + 1) >> 1;
943
47033
    DST(1,1) = DST(3,2) = (tl + l0 * 2 + l1 + 2) >> 2;
944
47033
    DST(0,2) = DST(2,3) = (l1 + l2 + 1) >> 1;
945
47033
    DST(1,2) = DST(3,3) = (l0 + l1 * 2 + l2 + 2) >> 2;
946
47033
    DST(0,3) = (l2 + l3 + 1) >> 1;
947
47033
    DST(1,3) = (l1 + l2 * 2 + l3 + 2) >> 2;
948
47033
}
949
950
/* Generic horizontal-down predictor for size x size blocks (8/16/32).
 * v[] interleaves 2-tap averages and 3-tap filters of the left edge,
 * followed by 3-tap filters of the top edge; row j copies a size-wide
 * window of v[] shifted two positions per row. */
#define def_hor_down(size) \
static void hor_down_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
                                         const uint8_t *_left, const uint8_t *_top) \
{ \
    pixel *dst = (pixel *) _dst; \
    const pixel *top = (const pixel *) _top; \
    const pixel *left = (const pixel *) _left; \
    int i, j; \
    pixel v[size * 3 - 2]; \
\
    stride /= sizeof(pixel); \
    for (i = 0; i < size - 2; i++) { \
        v[i*2       ] = (left[i + 1] + left[i + 0] + 1) >> 1; \
        v[i*2    + 1] = (left[i + 2] + left[i + 1] * 2 + left[i + 0] + 2) >> 2; \
        v[size*2 + i] = (top[i - 1] + top[i] * 2 + top[i + 1] + 2) >> 2; \
    } \
    v[size*2 - 2] = (top[-1] + left[size - 1] + 1) >> 1; \
    v[size*2 - 4] = (left[size - 1] + left[size - 2] + 1) >> 1; \
    v[size*2 - 1] = (top[0]  + top[-1] * 2 + left[size - 1] + 2) >> 2; \
    v[size*2 - 3] = (top[-1] + left[size - 1] * 2 + left[size - 2] + 2) >> 2; \
\
    for (j = 0; j < size; j++) \
        memcpy(dst + j*stride, v + size*2 - 2 - j*2, size * sizeof(pixel)); \
}

def_hor_down(8)
def_hor_down(16)
def_hor_down(32)
978
979
36270
static void vert_left_4x4_c(uint8_t *_dst, ptrdiff_t stride,
980
                            const uint8_t *left, const uint8_t *_top)
981
{
982
36270
    pixel *dst = (pixel *) _dst;
983
36270
    const pixel *top = (const pixel *) _top;
984
36270
    int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
985
36270
        a4 = top[4], a5 = top[5], a6 = top[6];
986
987
36270
    stride /= sizeof(pixel);
988
36270
    DST(0,0) = (a0 + a1 + 1) >> 1;
989
36270
    DST(0,1) = (a0 + a1 * 2 + a2 + 2) >> 2;
990
36270
    DST(1,0) = DST(0,2) = (a1 + a2 + 1) >> 1;
991
36270
    DST(1,1) = DST(0,3) = (a1 + a2 * 2 + a3 + 2) >> 2;
992
36270
    DST(2,0) = DST(1,2) = (a2 + a3 + 1) >> 1;
993
36270
    DST(2,1) = DST(1,3) = (a2 + a3 * 2 + a4 + 2) >> 2;
994
36270
    DST(3,0) = DST(2,2) = (a3 + a4 + 1) >> 1;
995
36270
    DST(3,1) = DST(2,3) = (a3 + a4 * 2 + a5 + 2) >> 2;
996
36270
    DST(3,2) = (a4 + a5 + 1) >> 1;
997
36270
    DST(3,3) = (a4 + a5 * 2 + a6 + 2) >> 2;
998
36270
}
999
1000
/* Generic vertical-left predictor for size x size blocks (8/16/32).
 * ve[]/vo[] hold 2-tap and 3-tap filtered top values for even/odd rows;
 * each row pair is shifted one position right, padding the right end with
 * the raw top-right pixel top[size - 1]. */
#define def_vert_left(size) \
static void vert_left_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
                                          const uint8_t *left, const uint8_t *_top) \
{ \
    pixel *dst = (pixel *) _dst; \
    const pixel *top = (const pixel *) _top; \
    int i, j; \
    pixel ve[size - 1], vo[size - 1]; \
\
    stride /= sizeof(pixel); \
    for (i = 0; i < size - 2; i++) { \
        ve[i] = (top[i] + top[i + 1] + 1) >> 1; \
        vo[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2; \
    } \
    ve[size - 2] = (top[size - 2] + top[size - 1] + 1) >> 1; \
    vo[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2; \
\
    for (j = 0; j < size / 2; j++) { \
        memcpy(dst +  j*2      * stride, ve + j, (size - j - 1) * sizeof(pixel)); \
        memset_bpc(dst +  j*2      * stride + size - j - 1, top[size - 1], j + 1); \
        memcpy(dst + (j*2 + 1) * stride, vo + j, (size - j - 1) * sizeof(pixel)); \
        memset_bpc(dst + (j*2 + 1) * stride + size - j - 1, top[size - 1], j + 1); \
    } \
}

def_vert_left(8)
def_vert_left(16)
def_vert_left(32)
1028
1029
50770
static void hor_up_4x4_c(uint8_t *_dst, ptrdiff_t stride,
1030
                         const uint8_t *_left, const uint8_t *top)
1031
{
1032
50770
    pixel *dst = (pixel *) _dst;
1033
50770
    const pixel *left = (const pixel *) _left;
1034
50770
    int l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3];
1035
1036
50770
    stride /= sizeof(pixel);
1037
50770
    DST(0,0) = (l0 + l1 + 1) >> 1;
1038
50770
    DST(1,0) = (l0 + l1 * 2 + l2 + 2) >> 2;
1039
50770
    DST(0,1) = DST(2,0) = (l1 + l2 + 1) >> 1;
1040
50770
    DST(1,1) = DST(3,0) = (l1 + l2 * 2 + l3 + 2) >> 2;
1041
50770
    DST(0,2) = DST(2,1) = (l2 + l3 + 1) >> 1;
1042
50770
    DST(1,2) = DST(3,1) = (l2 + l3 * 3 + 2) >> 2;
1043
50770
    DST(0,3) = DST(1,3) = DST(2,2) = DST(2,3) = DST(3,2) = DST(3,3) = l3;
1044
50770
}
1045
1046
/* Generic horizontal-up predictor for size x size blocks (8/16/32).
 * v[] interleaves 2-tap averages and 3-tap filters of the left edge; row j
 * copies a window starting at v[2j], and rows in the lower half pad the
 * right end with the raw bottom-left pixel left[size - 1]. */
#define def_hor_up(size) \
static void hor_up_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
                                       const uint8_t *_left, const uint8_t *top) \
{ \
    pixel *dst = (pixel *) _dst; \
    const pixel *left = (const pixel *) _left; \
    int i, j; \
    pixel v[size*2 - 2]; \
\
    stride /= sizeof(pixel); \
    for (i = 0; i < size - 2; i++) { \
        v[i*2    ] = (left[i] + left[i + 1] + 1) >> 1; \
        v[i*2 + 1] = (left[i] + left[i + 1] * 2 + left[i + 2] + 2) >> 2; \
    } \
    v[size*2 - 4] = (left[size - 2] + left[size - 1] + 1) >> 1; \
    v[size*2 - 3] = (left[size - 2] + left[size - 1] * 3 + 2) >> 2; \
\
    for (j = 0; j < size / 2; j++) \
        memcpy(dst + j*stride, v + j*2, size * sizeof(pixel)); \
    for (j = size / 2; j < size; j++) { \
        memcpy(dst + j*stride, v + j*2, (size*2 - 2 - j*2) * sizeof(pixel)); \
        memset_bpc(dst + j*stride + size*2 - 2 - j*2, left[size - 1], \
                   2 + j*2 - size); \
    } \
}

def_hor_up(8)
def_hor_up(16)
def_hor_up(32)
1075
1076
#undef DST
1077
1078
#endif /* BIT_DEPTH != 12 */
1079
1080
#if BIT_DEPTH != 8
1081
void ff_vp9dsp_intrapred_init_10(VP9DSPContext *dsp);
1082
#endif
1083
/* Populate dsp->intra_pred[][] for this template's bit depth.
 * In the 12-bit build, only the predictors whose output depends on the
 * bit depth (TM and the fixed-value DC_128/127/129 variants) are set
 * locally; everything else is inherited from the 10-bit build via
 * ff_vp9dsp_intrapred_init_10().  The function is static except in the
 * 10-bit build, where the 12-bit build needs to call it. */
#if BIT_DEPTH != 10
static
#endif
av_cold void FUNC(ff_vp9dsp_intrapred_init)(VP9DSPContext *dsp)
{
#define init_intra_pred_bd_aware(tx, sz) \
    dsp->intra_pred[tx][TM_VP8_PRED]          = tm_##sz##_c; \
    dsp->intra_pred[tx][DC_128_PRED]          = dc_128_##sz##_c; \
    dsp->intra_pred[tx][DC_127_PRED]          = dc_127_##sz##_c; \
    dsp->intra_pred[tx][DC_129_PRED]          = dc_129_##sz##_c

#if BIT_DEPTH == 12
    ff_vp9dsp_intrapred_init_10(dsp);
#define init_intra_pred(tx, sz) \
    init_intra_pred_bd_aware(tx, sz)
#else
    #define init_intra_pred(tx, sz) \
    dsp->intra_pred[tx][VERT_PRED]            = vert_##sz##_c; \
    dsp->intra_pred[tx][HOR_PRED]             = hor_##sz##_c; \
    dsp->intra_pred[tx][DC_PRED]              = dc_##sz##_c; \
    dsp->intra_pred[tx][DIAG_DOWN_LEFT_PRED]  = diag_downleft_##sz##_c; \
    dsp->intra_pred[tx][DIAG_DOWN_RIGHT_PRED] = diag_downright_##sz##_c; \
    dsp->intra_pred[tx][VERT_RIGHT_PRED]      = vert_right_##sz##_c; \
    dsp->intra_pred[tx][HOR_DOWN_PRED]        = hor_down_##sz##_c; \
    dsp->intra_pred[tx][VERT_LEFT_PRED]       = vert_left_##sz##_c; \
    dsp->intra_pred[tx][HOR_UP_PRED]          = hor_up_##sz##_c; \
    dsp->intra_pred[tx][LEFT_DC_PRED]         = dc_left_##sz##_c; \
    dsp->intra_pred[tx][TOP_DC_PRED]          = dc_top_##sz##_c; \
    init_intra_pred_bd_aware(tx, sz)
#endif

    init_intra_pred(TX_4X4,   4x4);
    init_intra_pred(TX_8X8,   8x8);
    init_intra_pred(TX_16X16, 16x16);
    init_intra_pred(TX_32X32, 32x32);

#undef init_intra_pred
#undef init_intra_pred_bd_aware
}
1122
1123
/* Build a 2-D inverse-transform-and-add function from two 1-D passes.
 * Pass 1 (type_a) transforms the columns of the coefficient block into
 * tmp[]; pass 2 (type_b) transforms the rows of tmp[] and adds the result,
 * rounded by `bits`, to the destination.  The coefficient block is zeroed
 * (the caller relies on that).  If has_dconly is set and eob == 1, only
 * the DC coefficient is present, so the whole transform collapses to
 * adding one constant t = dc * (11585/16384)^2 to every pixel. */
#define itxfm_wrapper(type_a, type_b, sz, bits, has_dconly) \
static void type_a##_##type_b##_##sz##x##sz##_add_c(uint8_t *_dst, \
                                                    ptrdiff_t stride, \
                                                    int16_t *_block, int eob) \
{ \
    int i, j; \
    pixel *dst = (pixel *) _dst; \
    dctcoef *block = (dctcoef *) _block, tmp[sz * sz], out[sz]; \
\
    stride /= sizeof(pixel); \
    if (has_dconly && eob == 1) { \
        const int t  = ((((dctint) block[0] * 11585 + (1 << 13)) >> 14) \
                                            * 11585 + (1 << 13)) >> 14; \
        block[0] = 0; \
        for (i = 0; i < sz; i++) { \
            for (j = 0; j < sz; j++) \
                dst[j * stride] = av_clip_pixel(dst[j * stride] + \
                                                (bits ? \
                                                 (t + (1 << (bits - 1))) >> bits : \
                                                 t)); \
            dst++; \
        } \
        return; \
    } \
\
    for (i = 0; i < sz; i++) \
        type_a##sz##_1d(block + i, sz, tmp + i * sz, 0); \
    memset(block, 0, sz * sz * sizeof(*block)); \
    for (i = 0; i < sz; i++) { \
        type_b##sz##_1d(tmp + i, sz, out, 1); \
        for (j = 0; j < sz; j++) \
            dst[j * stride] = av_clip_pixel(dst[j * stride] + \
                                            (bits ? \
                                             (out[j] + (1 << (bits - 1))) >> bits : \
                                             out[j])); \
        dst++; \
    } \
}
1161
1162
/* Instantiate all four column/row combinations (idct/iadst) for one
 * transform size; only the pure idct_idct variant has a DC-only fast path. */
#define itxfm_wrap(sz, bits) \
itxfm_wrapper(idct,  idct,  sz, bits, 1) \
itxfm_wrapper(iadst, idct,  sz, bits, 0) \
itxfm_wrapper(idct,  iadst, sz, bits, 0) \
itxfm_wrapper(iadst, iadst, sz, bits, 0)

/* Strided coefficient access for the 1-D transforms: during the column
 * pass `stride` is the transform size, during the row pass it is 1. */
#define IN(x) ((dctint) in[(x) * stride])
1169
1170
3485652
/* 1-D 4-point inverse DCT in Q14 fixed point (11585/16384 ~ 1/sqrt(2),
 * 15137/6270 ~ cos/sin of pi/8 scaled by 16384). */
static av_always_inline void idct4_1d(const dctcoef *in, ptrdiff_t stride,
                                      dctcoef *out, int pass)
{
    dctint e0, e1, o0, o1;

    /* even half: 2-point butterfly scaled by 1/sqrt(2) */
    e0 = ((IN(0) + IN(2)) * 11585 + (1 << 13)) >> 14;
    e1 = ((IN(0) - IN(2)) * 11585 + (1 << 13)) >> 14;
    /* odd half: rotation of the odd coefficients */
    o0 = (IN(1) *  6270 - IN(3) * 15137 + (1 << 13)) >> 14;
    o1 = (IN(1) * 15137 + IN(3) *  6270 + (1 << 13)) >> 14;

    /* final butterfly */
    out[0] = e0 + o1;
    out[1] = e1 + o0;
    out[2] = e1 - o0;
    out[3] = e0 - o1;
}
1185
1186
1493868
/* 1-D 4-point inverse ADST in Q14 fixed point.  The four sums s0..s3 are
 * combined and rounded down to 14 bits only at the output stage. */
static av_always_inline void iadst4_1d(const dctcoef *in, ptrdiff_t stride,
                                       dctcoef *out, int pass)
{
    dctint s0, s1, s2, s3;

    s0 =  5283 * IN(0) + 15212 * IN(2) +  9929 * IN(3);
    s1 =  9929 * IN(0) -  5283 * IN(2) - 15212 * IN(3);
    s2 = 13377 * (IN(0) - IN(2) + IN(3));
    s3 = 13377 * IN(1);

    out[0] = (s0 + s3      + (1 << 13)) >> 14;
    out[1] = (s1 + s3      + (1 << 13)) >> 14;
    out[2] = (s2           + (1 << 13)) >> 14;
    out[3] = (s0 + s1 - s3 + (1 << 13)) >> 14;
}

itxfm_wrap(4, 4)
1203
1204
2918504
/* 1-D 8-point inverse DCT in Q14 fixed point.  Three stages: input
 * rotations, a butterfly stage (with one extra 1/sqrt(2) rotation for
 * t5/t6), and the final output butterfly. */
static av_always_inline void idct8_1d(const dctcoef *in, ptrdiff_t stride,
                                      dctcoef *out, int pass)
{
    dctint t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;

    /* stage 1: paired rotations of the input coefficients */
    t0a = ((IN(0) + IN(4)) * 11585 + (1 << 13)) >> 14;
    t1a = ((IN(0) - IN(4)) * 11585 + (1 << 13)) >> 14;
    t2a = (IN(2) *  6270 - IN(6) * 15137 + (1 << 13)) >> 14;
    t3a = (IN(2) * 15137 + IN(6) *  6270 + (1 << 13)) >> 14;
    t4a = (IN(1) *  3196 - IN(7) * 16069 + (1 << 13)) >> 14;
    t5a = (IN(5) * 13623 - IN(3) *  9102 + (1 << 13)) >> 14;
    t6a = (IN(5) *  9102 + IN(3) * 13623 + (1 << 13)) >> 14;
    t7a = (IN(1) * 16069 + IN(7) *  3196 + (1 << 13)) >> 14;

    /* stage 2: butterflies */
    t0  = t0a + t3a;
    t1  = t1a + t2a;
    t2  = t1a - t2a;
    t3  = t0a - t3a;
    t4  = t4a + t5a;
    t5a = t4a - t5a;
    t7  = t7a + t6a;
    t6a = t7a - t6a;

    /* extra 1/sqrt(2) rotation for the middle odd terms */
    t5  = ((t6a - t5a) * 11585 + (1 << 13)) >> 14;
    t6  = ((t6a + t5a) * 11585 + (1 << 13)) >> 14;

    /* stage 3: output butterfly */
    out[0] = t0 + t7;
    out[1] = t1 + t6;
    out[2] = t2 + t5;
    out[3] = t3 + t4;
    out[4] = t3 - t4;
    out[5] = t2 - t5;
    out[6] = t1 - t6;
    out[7] = t0 - t7;
}
1239
1240
719416
/* 1-D 8-point inverse ADST in Q14 fixed point.  The second-stage products
 * use unsigned constants (15137U etc.) so the intermediate multiplies wrap
 * instead of triggering signed-overflow UB; the results are cast back to
 * dctint before the arithmetic right shift. */
static av_always_inline void iadst8_1d(const dctcoef *in, ptrdiff_t stride,
                                       dctcoef *out, int pass)
{
    dctint t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;

    /* stage 1: input rotations */
    t0a = 16305 * IN(7) +  1606 * IN(0);
    t1a =  1606 * IN(7) - 16305 * IN(0);
    t2a = 14449 * IN(5) +  7723 * IN(2);
    t3a =  7723 * IN(5) - 14449 * IN(2);
    t4a = 10394 * IN(3) + 12665 * IN(4);
    t5a = 12665 * IN(3) - 10394 * IN(4);
    t6a =  4756 * IN(1) + 15679 * IN(6);
    t7a = 15679 * IN(1) -  4756 * IN(6);

    /* stage 2: butterflies with rounding back to Q0 */
    t0 = (t0a + t4a + (1 << 13)) >> 14;
    t1 = (t1a + t5a + (1 << 13)) >> 14;
    t2 = (t2a + t6a + (1 << 13)) >> 14;
    t3 = (t3a + t7a + (1 << 13)) >> 14;
    t4 = (t0a - t4a + (1 << 13)) >> 14;
    t5 = (t1a - t5a + (1 << 13)) >> 14;
    t6 = (t2a - t6a + (1 << 13)) >> 14;
    t7 = (t3a - t7a + (1 << 13)) >> 14;

    /* stage 3: second rotation (unsigned to avoid overflow UB) */
    t4a = 15137U * t4 +  6270U * t5;
    t5a =  6270U * t4 - 15137U * t5;
    t6a = 15137U * t7 -  6270U * t6;
    t7a =  6270U * t7 + 15137U * t6;

    out[0] =   t0 + t2;
    out[7] = -(t1 + t3);
    t2     =   t0 - t2;
    t3     =   t1 - t3;

    out[1] = -((dctint)((1U << 13) + t4a + t6a) >> 14);
    out[6] =   (dctint)((1U << 13) + t5a + t7a) >> 14;
    t6     =   (dctint)((1U << 13) + t4a - t6a) >> 14;
    t7     =   (dctint)((1U << 13) + t5a - t7a) >> 14;

    /* final 1/sqrt(2) scaling of the remaining terms */
    out[3] = -(((t2 + t3) * 11585 + (1 << 13)) >> 14);
    out[4] =   ((t2 - t3) * 11585 + (1 << 13)) >> 14;
    out[2] =   ((t6 + t7) * 11585 + (1 << 13)) >> 14;
    out[5] = -(((t6 - t7) * 11585 + (1 << 13)) >> 14);
}

itxfm_wrap(8, 5)
1285
1286
1107952
/* 1-D 16-point inverse DCT in Q14 fixed point: an 8-point even half plus
 * an 8-point odd half, combined through successive butterfly stages with
 * rounding after every rotation. */
static av_always_inline void idct16_1d(const dctcoef *in, ptrdiff_t stride,
                                       dctcoef *out, int pass)
{
    dctint t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
    dctint t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
    dctint t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;

    /* stage 1: input rotations */
    t0a  = ((IN(0) + IN(8)) * 11585 + (1 << 13)) >> 14;
    t1a  = ((IN(0) - IN(8)) * 11585 + (1 << 13)) >> 14;
    t2a  = (IN(4)  *  6270 - IN(12) * 15137 + (1 << 13)) >> 14;
    t3a  = (IN(4)  * 15137 + IN(12) *  6270 + (1 << 13)) >> 14;
    t4a  = (IN(2)  *  3196 - IN(14) * 16069 + (1 << 13)) >> 14;
    t7a  = (IN(2)  * 16069 + IN(14) *  3196 + (1 << 13)) >> 14;
    t5a  = (IN(10) * 13623 - IN(6)  *  9102 + (1 << 13)) >> 14;
    t6a  = (IN(10) *  9102 + IN(6)  * 13623 + (1 << 13)) >> 14;
    t8a  = (IN(1)  *  1606 - IN(15) * 16305 + (1 << 13)) >> 14;
    t15a = (IN(1)  * 16305 + IN(15) *  1606 + (1 << 13)) >> 14;
    t9a  = (IN(9)  * 12665 - IN(7)  * 10394 + (1 << 13)) >> 14;
    t14a = (IN(9)  * 10394 + IN(7)  * 12665 + (1 << 13)) >> 14;
    t10a = (IN(5)  *  7723 - IN(11) * 14449 + (1 << 13)) >> 14;
    t13a = (IN(5)  * 14449 + IN(11) *  7723 + (1 << 13)) >> 14;
    t11a = (IN(13) * 15679 - IN(3)  *  4756 + (1 << 13)) >> 14;
    t12a = (IN(13) *  4756 + IN(3)  * 15679 + (1 << 13)) >> 14;

    /* stage 2: butterflies */
    t0  = t0a  + t3a;
    t1  = t1a  + t2a;
    t2  = t1a  - t2a;
    t3  = t0a  - t3a;
    t4  = t4a  + t5a;
    t5  = t4a  - t5a;
    t6  = t7a  - t6a;
    t7  = t7a  + t6a;
    t8  = t8a  + t9a;
    t9  = t8a  - t9a;
    t10 = t11a - t10a;
    t11 = t11a + t10a;
    t12 = t12a + t13a;
    t13 = t12a - t13a;
    t14 = t15a - t14a;
    t15 = t15a + t14a;

    /* stage 3: rotations of the middle terms */
    t5a  = ((t6 - t5) * 11585 + (1 << 13)) >> 14;
    t6a  = ((t6 + t5) * 11585 + (1 << 13)) >> 14;
    t9a  = (  t14 *  6270 - t9  * 15137  + (1 << 13)) >> 14;
    t14a = (  t14 * 15137 + t9  *  6270  + (1 << 13)) >> 14;
    t10a = (-(t13 * 15137 + t10 *  6270) + (1 << 13)) >> 14;
    t13a = (  t13 *  6270 - t10 * 15137  + (1 << 13)) >> 14;

    /* stage 4: butterflies */
    t0a  = t0   + t7;
    t1a  = t1   + t6a;
    t2a  = t2   + t5a;
    t3a  = t3   + t4;
    t4   = t3   - t4;
    t5   = t2   - t5a;
    t6   = t1   - t6a;
    t7   = t0   - t7;
    t8a  = t8   + t11;
    t9   = t9a  + t10a;
    t10  = t9a  - t10a;
    t11a = t8   - t11;
    t12a = t15  - t12;
    t13  = t14a - t13a;
    t14  = t14a + t13a;
    t15a = t15  + t12;

    /* stage 5: final 1/sqrt(2) rotations in the odd half */
    t10a = ((t13  - t10)  * 11585 + (1 << 13)) >> 14;
    t13a = ((t13  + t10)  * 11585 + (1 << 13)) >> 14;
    t11  = ((t12a - t11a) * 11585 + (1 << 13)) >> 14;
    t12  = ((t12a + t11a) * 11585 + (1 << 13)) >> 14;

    /* stage 6: output butterfly */
    out[ 0] = t0a + t15a;
    out[ 1] = t1a + t14;
    out[ 2] = t2a + t13a;
    out[ 3] = t3a + t12;
    out[ 4] = t4  + t11;
    out[ 5] = t5  + t10a;
    out[ 6] = t6  + t9;
    out[ 7] = t7  + t8a;
    out[ 8] = t7  - t8a;
    out[ 9] = t6  - t9;
    out[10] = t5  - t10a;
    out[11] = t4  - t11;
    out[12] = t3a - t12;
    out[13] = t2a - t13a;
    out[14] = t1a - t14;
    out[15] = t0a - t15a;
}
1373
1374
311504
/* 1-D 16-point inverse ADST in Q14 fixed point.  All products use unsigned
 * constants so intermediate overflow wraps instead of being signed-overflow
 * UB; each sum is cast back to dctint before the arithmetic right shift. */
static av_always_inline void iadst16_1d(const dctcoef *in, ptrdiff_t stride,
                                        dctcoef *out, int pass)
{
    dctint t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
    dctint t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
    dctint t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;

    /* stage 1: input rotations */
    t0  = IN(15) * 16364U + IN(0)  *   804U;
    t1  = IN(15) *   804U - IN(0)  * 16364U;
    t2  = IN(13) * 15893U + IN(2)  *  3981U;
    t3  = IN(13) *  3981U - IN(2)  * 15893U;
    t4  = IN(11) * 14811U + IN(4)  *  7005U;
    t5  = IN(11) *  7005U - IN(4)  * 14811U;
    t6  = IN(9)  * 13160U + IN(6)  *  9760U;
    t7  = IN(9)  *  9760U - IN(6)  * 13160U;
    t8  = IN(7)  * 11003U + IN(8)  * 12140U;
    t9  = IN(7)  * 12140U - IN(8)  * 11003U;
    t10 = IN(5)  *  8423U + IN(10) * 14053U;
    t11 = IN(5)  * 14053U - IN(10) *  8423U;
    t12 = IN(3)  *  5520U + IN(12) * 15426U;
    t13 = IN(3)  * 15426U - IN(12) *  5520U;
    t14 = IN(1)  *  2404U + IN(14) * 16207U;
    t15 = IN(1)  * 16207U - IN(14) *  2404U;

    /* stage 2: butterflies with rounding back to Q0 */
    t0a  = (dctint)((1U << 13) + t0 + t8 ) >> 14;
    t1a  = (dctint)((1U << 13) + t1 + t9 ) >> 14;
    t2a  = (dctint)((1U << 13) + t2 + t10) >> 14;
    t3a  = (dctint)((1U << 13) + t3 + t11) >> 14;
    t4a  = (dctint)((1U << 13) + t4 + t12) >> 14;
    t5a  = (dctint)((1U << 13) + t5 + t13) >> 14;
    t6a  = (dctint)((1U << 13) + t6 + t14) >> 14;
    t7a  = (dctint)((1U << 13) + t7 + t15) >> 14;
    t8a  = (dctint)((1U << 13) + t0 - t8 ) >> 14;
    t9a  = (dctint)((1U << 13) + t1 - t9 ) >> 14;
    t10a = (dctint)((1U << 13) + t2 - t10) >> 14;
    t11a = (dctint)((1U << 13) + t3 - t11) >> 14;
    t12a = (dctint)((1U << 13) + t4 - t12) >> 14;
    t13a = (dctint)((1U << 13) + t5 - t13) >> 14;
    t14a = (dctint)((1U << 13) + t6 - t14) >> 14;
    t15a = (dctint)((1U << 13) + t7 - t15) >> 14;

    /* stage 3: second set of rotations */
    t8   = t8a  * 16069U + t9a  *  3196U;
    t9   = t8a  *  3196U - t9a  * 16069U;
    t10  = t10a *  9102U + t11a * 13623U;
    t11  = t10a * 13623U - t11a *  9102U;
    t12  = t13a * 16069U - t12a *  3196U;
    t13  = t13a *  3196U + t12a * 16069U;
    t14  = t15a *  9102U - t14a * 13623U;
    t15  = t15a * 13623U + t14a *  9102U;

    /* stage 4: butterflies */
    t0   = t0a + t4a;
    t1   = t1a + t5a;
    t2   = t2a + t6a;
    t3   = t3a + t7a;
    t4   = t0a - t4a;
    t5   = t1a - t5a;
    t6   = t2a - t6a;
    t7   = t3a - t7a;
    t8a  = (dctint)((1U << 13) + t8  + t12) >> 14;
    t9a  = (dctint)((1U << 13) + t9  + t13) >> 14;
    t10a = (dctint)((1U << 13) + t10 + t14) >> 14;
    t11a = (dctint)((1U << 13) + t11 + t15) >> 14;
    t12a = (dctint)((1U << 13) + t8  - t12) >> 14;
    t13a = (dctint)((1U << 13) + t9  - t13) >> 14;
    t14a = (dctint)((1U << 13) + t10 - t14) >> 14;
    t15a = (dctint)((1U << 13) + t11 - t15) >> 14;

    /* stage 5: third set of rotations */
    t4a  = t4 * 15137U + t5 *  6270U;
    t5a  = t4 *  6270U - t5 * 15137U;
    t6a  = t7 * 15137U - t6 *  6270U;
    t7a  = t7 *  6270U + t6 * 15137U;
    t12  = t12a * 15137U + t13a *  6270U;
    t13  = t12a *  6270U - t13a * 15137U;
    t14  = t15a * 15137U - t14a *  6270U;
    t15  = t15a *  6270U + t14a * 15137U;

    /* stage 6: outputs and remaining butterflies (note sign flips) */
    out[ 0] =   t0 + t2;
    out[15] = -(t1 + t3);
    t2a     =   t0 - t2;
    t3a     =   t1 - t3;
    out[ 3] = -((dctint)((1U << 13) + t4a + t6a) >> 14);
    out[12] =   (dctint)((1U << 13) + t5a + t7a) >> 14;
    t6      =   (dctint)((1U << 13) + t4a - t6a) >> 14;
    t7      =   (dctint)((1U << 13) + t5a - t7a) >> 14;
    out[ 1] = -(t8a + t10a);
    out[14] =   t9a + t11a;
    t10     =   t8a - t10a;
    t11     =   t9a - t11a;
    out[ 2] =   (dctint)((1U << 13) + t12 + t14) >> 14;
    out[13] = -((dctint)((1U << 13) + t13 + t15) >> 14);
    t14a    =   (dctint)((1U << 13) + t12 - t14) >> 14;
    t15a    =   (dctint)((1U << 13) + t13 - t15) >> 14;

    /* stage 7: final 1/sqrt(2) scaling */
    out[ 7] = (dctint)(-(t2a  + t3a)  * 11585U  + (1 << 13)) >> 14;
    out[ 8] = (dctint)( (t2a  - t3a)  * 11585U  + (1 << 13)) >> 14;
    out[ 4] = (dctint)( (t7   + t6)   * 11585U  + (1 << 13)) >> 14;
    out[11] = (dctint)( (t7   - t6)   * 11585U  + (1 << 13)) >> 14;
    out[ 6] = (dctint)( (t11  + t10)  * 11585U  + (1 << 13)) >> 14;
    out[ 9] = (dctint)( (t11  - t10)  * 11585U  + (1 << 13)) >> 14;
    out[ 5] = (dctint)(-(t14a + t15a) * 11585U  + (1 << 13)) >> 14;
    out[10] = (dctint)( (t14a - t15a) * 11585U  + (1 << 13)) >> 14;
}

itxfm_wrap(16, 6)
1478
1479
/*
 * One 32-point inverse DCT pass over a single row or column.
 *
 * IN(x) is a macro defined earlier in this template (presumably reading
 * in[x * stride] — confirm there). Results go to out[0..31] contiguously.
 * All rotation constants are Q14 fixed-point cosines: each product sums
 * in a 1 << 13 rounding term and is shifted right by 14. The multiplies
 * are done with unsigned constants so intermediate overflow wraps instead
 * of being signed-overflow UB; the (dctint) cast restores the signed value.
 * 'pass' is unused in the body; it is kept for the shared wrapper signature.
 */
static av_always_inline void idct32_1d(const dctcoef *in, ptrdiff_t stride,
                                       dctcoef *out, int pass)
{
    /* stage 1: input butterflies/rotations */
    dctint t0a  = (dctint)((IN(0) + IN(16)) * 11585U         + (1 << 13)) >> 14;
    dctint t1a  = (dctint)((IN(0) - IN(16)) * 11585U         + (1 << 13)) >> 14;
    dctint t2a  = (dctint)(IN( 8) *  6270U - IN(24) * 15137U + (1 << 13)) >> 14;
    dctint t3a  = (dctint)(IN( 8) * 15137U + IN(24) *  6270U + (1 << 13)) >> 14;
    dctint t4a  = (dctint)(IN( 4) *  3196U - IN(28) * 16069U + (1 << 13)) >> 14;
    dctint t7a  = (dctint)(IN( 4) * 16069U + IN(28) *  3196U + (1 << 13)) >> 14;
    dctint t5a  = (dctint)(IN(20) * 13623U - IN(12) *  9102U + (1 << 13)) >> 14;
    dctint t6a  = (dctint)(IN(20) *  9102U + IN(12) * 13623U + (1 << 13)) >> 14;
    dctint t8a  = (dctint)(IN( 2) *  1606U - IN(30) * 16305U + (1 << 13)) >> 14;
    dctint t15a = (dctint)(IN( 2) * 16305U + IN(30) *  1606U + (1 << 13)) >> 14;
    dctint t9a  = (dctint)(IN(18) * 12665U - IN(14) * 10394U + (1 << 13)) >> 14;
    dctint t14a = (dctint)(IN(18) * 10394U + IN(14) * 12665U + (1 << 13)) >> 14;
    dctint t10a = (dctint)(IN(10) *  7723U - IN(22) * 14449U + (1 << 13)) >> 14;
    dctint t13a = (dctint)(IN(10) * 14449U + IN(22) *  7723U + (1 << 13)) >> 14;
    dctint t11a = (dctint)(IN(26) * 15679U - IN( 6) *  4756U + (1 << 13)) >> 14;
    dctint t12a = (dctint)(IN(26) *  4756U + IN( 6) * 15679U + (1 << 13)) >> 14;
    dctint t16a = (dctint)(IN( 1) *   804U - IN(31) * 16364U + (1 << 13)) >> 14;
    dctint t31a = (dctint)(IN( 1) * 16364U + IN(31) *   804U + (1 << 13)) >> 14;
    dctint t17a = (dctint)(IN(17) * 12140U - IN(15) * 11003U + (1 << 13)) >> 14;
    dctint t30a = (dctint)(IN(17) * 11003U + IN(15) * 12140U + (1 << 13)) >> 14;
    dctint t18a = (dctint)(IN( 9) *  7005U - IN(23) * 14811U + (1 << 13)) >> 14;
    dctint t29a = (dctint)(IN( 9) * 14811U + IN(23) *  7005U + (1 << 13)) >> 14;
    dctint t19a = (dctint)(IN(25) * 15426U - IN( 7) *  5520U + (1 << 13)) >> 14;
    dctint t28a = (dctint)(IN(25) *  5520U + IN( 7) * 15426U + (1 << 13)) >> 14;
    dctint t20a = (dctint)(IN( 5) *  3981U - IN(27) * 15893U + (1 << 13)) >> 14;
    dctint t27a = (dctint)(IN( 5) * 15893U + IN(27) *  3981U + (1 << 13)) >> 14;
    dctint t21a = (dctint)(IN(21) * 14053U - IN(11) *  8423U + (1 << 13)) >> 14;
    dctint t26a = (dctint)(IN(21) *  8423U + IN(11) * 14053U + (1 << 13)) >> 14;
    dctint t22a = (dctint)(IN(13) *  9760U - IN(19) * 13160U + (1 << 13)) >> 14;
    dctint t25a = (dctint)(IN(13) * 13160U + IN(19) *  9760U + (1 << 13)) >> 14;
    dctint t23a = (dctint)(IN(29) * 16207U - IN( 3) *  2404U + (1 << 13)) >> 14;
    dctint t24a = (dctint)(IN(29) *  2404U + IN( 3) * 16207U + (1 << 13)) >> 14;

    /* stage 2: sum/difference butterflies */
    dctint t0  = t0a  + t3a;
    dctint t1  = t1a  + t2a;
    dctint t2  = t1a  - t2a;
    dctint t3  = t0a  - t3a;
    dctint t4  = t4a  + t5a;
    dctint t5  = t4a  - t5a;
    dctint t6  = t7a  - t6a;
    dctint t7  = t7a  + t6a;
    dctint t8  = t8a  + t9a;
    dctint t9  = t8a  - t9a;
    dctint t10 = t11a - t10a;
    dctint t11 = t11a + t10a;
    dctint t12 = t12a + t13a;
    dctint t13 = t12a - t13a;
    dctint t14 = t15a - t14a;
    dctint t15 = t15a + t14a;
    dctint t16 = t16a + t17a;
    dctint t17 = t16a - t17a;
    dctint t18 = t19a - t18a;
    dctint t19 = t19a + t18a;
    dctint t20 = t20a + t21a;
    dctint t21 = t20a - t21a;
    dctint t22 = t23a - t22a;
    dctint t23 = t23a + t22a;
    dctint t24 = t24a + t25a;
    dctint t25 = t24a - t25a;
    dctint t26 = t27a - t26a;
    dctint t27 = t27a + t26a;
    dctint t28 = t28a + t29a;
    dctint t29 = t28a - t29a;
    dctint t30 = t31a - t30a;
    dctint t31 = t31a + t30a;

    /* stage 3: rotations on the middle terms */
    t5a  = (dctint)((t6 - t5) * 11585U             + (1 << 13)) >> 14;
    t6a  = (dctint)((t6 + t5) * 11585U             + (1 << 13)) >> 14;
    t9a  = (dctint)(  t14 *  6270U - t9  * 15137U  + (1 << 13)) >> 14;
    t14a = (dctint)(  t14 * 15137U + t9  *  6270U  + (1 << 13)) >> 14;
    t10a = (dctint)(-(t13 * 15137U + t10 *  6270U) + (1 << 13)) >> 14;
    t13a = (dctint)(  t13 *  6270U - t10 * 15137U  + (1 << 13)) >> 14;
    t17a = (dctint)(  t30 *  3196U - t17 * 16069U  + (1 << 13)) >> 14;
    t30a = (dctint)(  t30 * 16069U + t17 *  3196U  + (1 << 13)) >> 14;
    t18a = (dctint)(-(t29 * 16069U + t18 *  3196U) + (1 << 13)) >> 14;
    t29a = (dctint)(  t29 *  3196U - t18 * 16069U  + (1 << 13)) >> 14;
    t21a = (dctint)(  t26 * 13623U - t21 *  9102U  + (1 << 13)) >> 14;
    t26a = (dctint)(  t26 *  9102U + t21 * 13623U  + (1 << 13)) >> 14;
    t22a = (dctint)(-(t25 *  9102U + t22 * 13623U) + (1 << 13)) >> 14;
    t25a = (dctint)(  t25 * 13623U - t22 *  9102U  + (1 << 13)) >> 14;

    /* stage 4: butterflies */
    t0a  = t0   + t7;
    t1a  = t1   + t6a;
    t2a  = t2   + t5a;
    t3a  = t3   + t4;
    t4a  = t3   - t4;
    t5   = t2   - t5a;
    t6   = t1   - t6a;
    t7a  = t0   - t7;
    t8a  = t8   + t11;
    t9   = t9a  + t10a;
    t10  = t9a  - t10a;
    t11a = t8   - t11;
    t12a = t15  - t12;
    t13  = t14a - t13a;
    t14  = t14a + t13a;
    t15a = t15  + t12;
    t16a = t16  + t19;
    t17  = t17a + t18a;
    t18  = t17a - t18a;
    t19a = t16  - t19;
    t20a = t23  - t20;
    t21  = t22a - t21a;
    t22  = t22a + t21a;
    t23a = t23  + t20;
    t24a = t24  + t27;
    t25  = t25a + t26a;
    t26  = t25a - t26a;
    t27a = t24  - t27;
    t28a = t31  - t28;
    t29  = t30a - t29a;
    t30  = t30a + t29a;
    t31a = t31  + t28;

    /* stage 5: rotations */
    t10a = (dctint)((t13  - t10)  * 11585U           + (1 << 13)) >> 14;
    t13a = (dctint)((t13  + t10)  * 11585U           + (1 << 13)) >> 14;
    t11  = (dctint)((t12a - t11a) * 11585U           + (1 << 13)) >> 14;
    t12  = (dctint)((t12a + t11a) * 11585U           + (1 << 13)) >> 14;
    t18a = (dctint)(  t29  *  6270U - t18  * 15137U  + (1 << 13)) >> 14;
    t29a = (dctint)(  t29  * 15137U + t18  *  6270U  + (1 << 13)) >> 14;
    t19  = (dctint)(  t28a *  6270U - t19a * 15137U  + (1 << 13)) >> 14;
    t28  = (dctint)(  t28a * 15137U + t19a *  6270U  + (1 << 13)) >> 14;
    t20  = (dctint)(-(t27a * 15137U + t20a *  6270U) + (1 << 13)) >> 14;
    t27  = (dctint)(  t27a *  6270U - t20a * 15137U  + (1 << 13)) >> 14;
    t21a = (dctint)(-(t26  * 15137U + t21  *  6270U) + (1 << 13)) >> 14;
    t26a = (dctint)(  t26  *  6270U - t21  * 15137U  + (1 << 13)) >> 14;

    /* stage 6: butterflies */
    t0   = t0a + t15a;
    t1   = t1a + t14;
    t2   = t2a + t13a;
    t3   = t3a + t12;
    t4   = t4a + t11;
    t5a  = t5  + t10a;
    t6a  = t6  + t9;
    t7   = t7a + t8a;
    t8   = t7a - t8a;
    t9a  = t6  - t9;
    t10  = t5  - t10a;
    t11a = t4a - t11;
    t12a = t3a - t12;
    t13  = t2a - t13a;
    t14a = t1a - t14;
    t15  = t0a - t15a;
    t16  = t16a + t23a;
    t17a = t17  + t22;
    t18  = t18a + t21a;
    t19a = t19  + t20;
    t20a = t19  - t20;
    t21  = t18a - t21a;
    t22a = t17  - t22;
    t23  = t16a - t23a;
    t24  = t31a - t24a;
    t25a = t30  - t25;
    t26  = t29a - t26a;
    t27a = t28  - t27;
    t28a = t28  + t27;
    t29  = t29a + t26a;
    t30a = t30  + t25;
    t31  = t31a + t24a;

    /* stage 7: final sqrt(1/2) rotations on the odd half */
    t20  = (dctint)((t27a - t20a) * 11585U + (1 << 13)) >> 14;
    t27  = (dctint)((t27a + t20a) * 11585U + (1 << 13)) >> 14;
    t21a = (dctint)((t26  - t21 ) * 11585U + (1 << 13)) >> 14;
    t26a = (dctint)((t26  + t21 ) * 11585U + (1 << 13)) >> 14;
    t22  = (dctint)((t25a - t22a) * 11585U + (1 << 13)) >> 14;
    t25  = (dctint)((t25a + t22a) * 11585U + (1 << 13)) >> 14;
    t23a = (dctint)((t24  - t23 ) * 11585U + (1 << 13)) >> 14;
    t24a = (dctint)((t24  + t23 ) * 11585U + (1 << 13)) >> 14;

    /* output butterflies: out[i] and out[31 - i] share each pair */
    out[ 0] = t0   + t31;
    out[ 1] = t1   + t30a;
    out[ 2] = t2   + t29;
    out[ 3] = t3   + t28a;
    out[ 4] = t4   + t27;
    out[ 5] = t5a  + t26a;
    out[ 6] = t6a  + t25;
    out[ 7] = t7   + t24a;
    out[ 8] = t8   + t23a;
    out[ 9] = t9a  + t22;
    out[10] = t10  + t21a;
    out[11] = t11a + t20;
    out[12] = t12a + t19a;
    out[13] = t13  + t18;
    out[14] = t14a + t17a;
    out[15] = t15  + t16;
    out[16] = t15  - t16;
    out[17] = t14a - t17a;
    out[18] = t13  - t18;
    out[19] = t12a - t19a;
    out[20] = t11a - t20;
    out[21] = t10  - t21a;
    out[22] = t9a  - t22;
    out[23] = t8   - t23a;
    out[24] = t7   - t24a;
    out[25] = t6a  - t25;
    out[26] = t5a  - t26a;
    out[27] = t4   - t27;
    out[28] = t3   - t28a;
    out[29] = t2   - t29;
    out[30] = t1   - t30a;
    out[31] = t0   - t31;
}
1684
1685



/* 32x32 supports only the DCT/DCT combination, so a single wrapper is
 * instantiated directly (itxfm_wrapper is defined earlier in this file). */
itxfm_wrapper(idct, idct, 32, 6, 1)
1686
1687
229432
static av_always_inline void iwht4_1d(const dctcoef *in, ptrdiff_t stride,
1688
                                      dctcoef *out, int pass)
1689
{
1690
    int t0, t1, t2, t3, t4;
1691
1692
229432
    if (pass == 0) {
1693
114716
        t0 = IN(0) >> 2;
1694
114716
        t1 = IN(3) >> 2;
1695
114716
        t2 = IN(1) >> 2;
1696
114716
        t3 = IN(2) >> 2;
1697
    } else {
1698
114716
        t0 = IN(0);
1699
114716
        t1 = IN(3);
1700
114716
        t2 = IN(1);
1701
114716
        t3 = IN(2);
1702
    }
1703
1704
229432
    t0 += t2;
1705
229432
    t3 -= t1;
1706
229432
    t4 = (t0 - t3) >> 1;
1707
229432
    t1 = t4 - t1;
1708
229432
    t2 = t4 - t2;
1709
229432
    t0 -= t1;
1710
229432
    t3 += t2;
1711
1712
229432
    out[0] = t0;
1713
229432
    out[1] = t1;
1714
229432
    out[2] = t2;
1715
229432
    out[3] = t3;
1716
229432
}
1717
1718

/* Lossless 4x4 WHT wrapper; the trailing 0 arguments select a different
 * code path in itxfm_wrapper than the DCT sizes use — see the macro
 * definition earlier in this template for their exact meaning. */
itxfm_wrapper(iwht, iwht, 4, 0, 0)

#undef IN
#undef itxfm_wrapper
#undef itxfm_wrap
1724
/*
 * Install the inverse-transform+add function pointers for this bit depth.
 * Index 4 of itxfm_add is the lossless (WHT) slot.
 */
static av_cold void vp9dsp_itxfm_init(VP9DSPContext *dsp)
{
/* One entry per (row, column) transform combination for a given size. */
#define init_itxfm(tx, sz) \
    dsp->itxfm_add[tx][DCT_DCT]   = idct_idct_##sz##_add_c; \
    dsp->itxfm_add[tx][DCT_ADST]  = iadst_idct_##sz##_add_c; \
    dsp->itxfm_add[tx][ADST_DCT]  = idct_iadst_##sz##_add_c; \
    dsp->itxfm_add[tx][ADST_ADST] = iadst_iadst_##sz##_add_c

/* Sizes with a single transform type: alias all four slots to it. */
#define init_idct(tx, nm) \
    dsp->itxfm_add[tx][DCT_DCT]   = \
    dsp->itxfm_add[tx][ADST_DCT]  = \
    dsp->itxfm_add[tx][DCT_ADST]  = \
    dsp->itxfm_add[tx][ADST_ADST] = nm##_add_c

    init_itxfm(TX_4X4,   4x4);
    init_itxfm(TX_8X8,   8x8);
    init_itxfm(TX_16X16, 16x16);
    init_idct(TX_32X32,  idct_idct_32x32);
    init_idct(4 /* lossless */, iwht_iwht_4x4);

#undef init_itxfm
#undef init_idct
}
1747
1748
/*
 * In-place VP9 loop filter over one 8-pixel edge segment.
 *
 * dst points at the first pixel on the "q" side of the edge; stridea
 * steps along the edge (8 iterations) and strideb steps across it, so
 * the same code serves horizontal and vertical edges. E, I and H are
 * the edge / interior / high-edge-variance thresholds, passed in 8-bit
 * units and scaled to BIT_DEPTH below. wd is the filter width (4, 8 or
 * 16 taps); wider filters are only applied where the signal is flat.
 */
static av_always_inline void loop_filter(pixel *dst, int E, int I, int H,
                                         ptrdiff_t stridea, ptrdiff_t strideb,
                                         int wd)
{
    int i, F = 1 << (BIT_DEPTH - 8); /* flatness threshold: 1 in 8-bit units */

    E <<= (BIT_DEPTH - 8);
    I <<= (BIT_DEPTH - 8);
    H <<= (BIT_DEPTH - 8);
    for (i = 0; i < 8; i++, dst += stridea) {
        int p7, p6, p5, p4;
        /* p* lie before the edge, q* on/after it */
        int p3 = dst[strideb * -4], p2 = dst[strideb * -3];
        int p1 = dst[strideb * -2], p0 = dst[strideb * -1];
        int q0 = dst[strideb * +0], q1 = dst[strideb * +1];
        int q2 = dst[strideb * +2], q3 = dst[strideb * +3];
        int q4, q5, q6, q7;
        /* filter mask: every interior step within I, edge step within E */
        int fm = FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
                 FFABS(p1 - p0) <= I && FFABS(q1 - q0) <= I &&
                 FFABS(q2 - q1) <= I && FFABS(q3 - q2) <= I &&
                 FFABS(p0 - q0) * 2 + (FFABS(p1 - q1) >> 1) <= E;
        int flat8out, flat8in;

        if (!fm)
            continue; /* real edge content: leave this pixel line alone */

        if (wd >= 16) {
            p7 = dst[strideb * -8];
            p6 = dst[strideb * -7];
            p5 = dst[strideb * -6];
            p4 = dst[strideb * -5];
            q4 = dst[strideb * +4];
            q5 = dst[strideb * +5];
            q6 = dst[strideb * +6];
            q7 = dst[strideb * +7];

            /* outer samples (4..7 away) all within F of the edge pixels */
            flat8out = FFABS(p7 - p0) <= F && FFABS(p6 - p0) <= F &&
                       FFABS(p5 - p0) <= F && FFABS(p4 - p0) <= F &&
                       FFABS(q4 - q0) <= F && FFABS(q5 - q0) <= F &&
                       FFABS(q6 - q0) <= F && FFABS(q7 - q0) <= F;
        }

        /* inner samples (1..3 away) all within F of the edge pixels */
        if (wd >= 8)
            flat8in = FFABS(p3 - p0) <= F && FFABS(p2 - p0) <= F &&
                      FFABS(p1 - p0) <= F && FFABS(q1 - q0) <= F &&
                      FFABS(q2 - q0) <= F && FFABS(q3 - q0) <= F;

        if (wd >= 16 && flat8out && flat8in) {
            /* very flat: 15-tap smoothing rewriting 14 pixels (7 a side);
             * the edge-most tap is replicated to pad the window */
            dst[strideb * -7] = (p7 + p7 + p7 + p7 + p7 + p7 + p7 + p6 * 2 +
                                 p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
            dst[strideb * -6] = (p7 + p7 + p7 + p7 + p7 + p7 + p6 + p5 * 2 +
                                 p4 + p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
            dst[strideb * -5] = (p7 + p7 + p7 + p7 + p7 + p6 + p5 + p4 * 2 +
                                 p3 + p2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
            dst[strideb * -4] = (p7 + p7 + p7 + p7 + p6 + p5 + p4 + p3 * 2 +
                                 p2 + p1 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
            dst[strideb * -3] = (p7 + p7 + p7 + p6 + p5 + p4 + p3 + p2 * 2 +
                                 p1 + p0 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
            dst[strideb * -2] = (p7 + p7 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
                                 p0 + q0 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
            dst[strideb * -1] = (p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
                                 q0 + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
            dst[strideb * +0] = (p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
                                 q1 + q2 + q3 + q4 + q5 + q6 + q7 + 8) >> 4;
            dst[strideb * +1] = (p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
                                 q2 + q3 + q4 + q5 + q6 + q7 + q7 + 8) >> 4;
            dst[strideb * +2] = (p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
                                 q3 + q4 + q5 + q6 + q7 + q7 + q7 + 8) >> 4;
            dst[strideb * +3] = (p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 +
                                 q4 + q5 + q6 + q7 + q7 + q7 + q7 + 8) >> 4;
            dst[strideb * +4] = (p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 +
                                 q5 + q6 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
            dst[strideb * +5] = (p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 +
                                 q6 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
            dst[strideb * +6] = (p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 +
                                 q7 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
        } else if (wd >= 8 && flat8in) {
            /* moderately flat: 7-tap smoothing rewriting 6 pixels */
            dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
            dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
            dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
            dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
            dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
            dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
        } else {
            /* narrow filter; hev = strong gradient right next to the edge */
            int hev = FFABS(p1 - p0) > H || FFABS(q1 - q0) > H;

            if (hev) {
                /* hev: include p1 - q1 in the delta, adjust only p0/q0 */
                int f = av_clip_intp2(p1 - q1, BIT_DEPTH - 1), f1, f2;
                f = av_clip_intp2(3 * (q0 - p0) + f, BIT_DEPTH - 1);

                /* asymmetric rounding: +4 for q side, +3 for p side */
                f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
                f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3;

                dst[strideb * -1] = av_clip_pixel(p0 + f2);
                dst[strideb * +0] = av_clip_pixel(q0 - f1);
            } else {
                int f = av_clip_intp2(3 * (q0 - p0), BIT_DEPTH - 1), f1, f2;

                f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
                f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3;

                dst[strideb * -1] = av_clip_pixel(p0 + f2);
                dst[strideb * +0] = av_clip_pixel(q0 - f1);

                /* no hev: also nudge p1/q1 by half the correction */
                f = (f1 + 1) >> 1;
                dst[strideb * -2] = av_clip_pixel(p1 + f);
                dst[strideb * +1] = av_clip_pixel(q1 - f);
            }
        }
    }
}
1858
1859
/* Entry points for a single 8-pixel edge: one function per
 * (direction, filter width) pair. 'h' edges step along rows and across
 * columns; 'v' edges the reverse — expressed by swapping stridea/strideb. */
#define lf_8_fn(dir, wd, stridea, strideb) \
static void loop_filter_##dir##_##wd##_8_c(uint8_t *_dst, \
                                           ptrdiff_t stride, \
                                           int E, int I, int H) \
{ \
    pixel *dst = (pixel *) _dst; \
    stride /= sizeof(pixel); \
    loop_filter(dst, E, I, H, stridea, strideb, wd); \
}

/* Instantiate both directions for one filter width. */
#define lf_8_fns(wd) \
lf_8_fn(h, wd, stride, 1) \
lf_8_fn(v, wd, 1, stride)

lf_8_fns(4)
lf_8_fns(8)
lf_8_fns(16)

#undef lf_8_fn
#undef lf_8_fns
1879
1880
/* 16-pixel-long edges: run the 8-pixel wd=16 filter twice, the second
 * call offset 8 pixels along the edge (bytes for h, pixels for v). */
#define lf_16_fn(dir, stridea) \
static void loop_filter_##dir##_16_16_c(uint8_t *dst, \
                                        ptrdiff_t stride, \
                                        int E, int I, int H) \
{ \
    loop_filter_##dir##_16_8_c(dst, stride, E, I, H); \
    loop_filter_##dir##_16_8_c(dst + 8 * stridea, stride, E, I, H); \
}

lf_16_fn(h, stride)
lf_16_fn(v, sizeof(pixel))

#undef lf_16_fn
1893
1894
/* Mixed-width 16-pixel edges: the two 8-pixel halves may use different
 * filter widths, with their E/I/H thresholds packed into the low and
 * high bytes of each parameter. */
#define lf_mix_fn(dir, wd1, wd2, stridea) \
static void loop_filter_##dir##_##wd1##wd2##_16_c(uint8_t *dst, \
                                                  ptrdiff_t stride, \
                                                  int E, int I, int H) \
{ \
    loop_filter_##dir##_##wd1##_8_c(dst, stride, E & 0xff, I & 0xff, H & 0xff); \
    loop_filter_##dir##_##wd2##_8_c(dst + 8 * stridea, stride, E >> 8, I >> 8, H >> 8); \
}

#define lf_mix_fns(wd1, wd2) \
lf_mix_fn(h, wd1, wd2, stride) \
lf_mix_fn(v, wd1, wd2, sizeof(pixel))

lf_mix_fns(4, 4)
lf_mix_fns(4, 8)
lf_mix_fns(8, 4)
lf_mix_fns(8, 8)

#undef lf_mix_fn
#undef lf_mix_fns
1914
1915
/* Install the loop-filter function pointers for this bit depth. */
static av_cold void vp9dsp_loopfilter_init(VP9DSPContext *dsp)
{
    /* loop_filter_8[width idx (4/8/16)][0 = h, 1 = v] */
    dsp->loop_filter_8[0][0] = loop_filter_h_4_8_c;
    dsp->loop_filter_8[0][1] = loop_filter_v_4_8_c;
    dsp->loop_filter_8[1][0] = loop_filter_h_8_8_c;
    dsp->loop_filter_8[1][1] = loop_filter_v_8_8_c;
    dsp->loop_filter_8[2][0] = loop_filter_h_16_8_c;
    dsp->loop_filter_8[2][1] = loop_filter_v_16_8_c;

    /* full-width 16-pixel edges with wd=16 on both halves */
    dsp->loop_filter_16[0] = loop_filter_h_16_16_c;
    dsp->loop_filter_16[1] = loop_filter_v_16_16_c;

    /* loop_filter_mix2[wd1 idx][wd2 idx][0 = h, 1 = v] */
    dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_c;
    dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_c;
    dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_c;
    dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_c;
    dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_c;
    dsp->loop_filter_mix2[1][0][1] = loop_filter_v_84_16_c;
    dsp->loop_filter_mix2[1][1][0] = loop_filter_h_88_16_c;
    dsp->loop_filter_mix2[1][1][1] = loop_filter_v_88_16_c;
}
1936
1937
#if BIT_DEPTH != 12
1938
1939
/*
 * Plain block copy: w pixels by h rows; strides are in bytes.
 * h must be >= 1 (do/while runs the body at least once).
 */
static av_always_inline void copy_c(uint8_t *dst, ptrdiff_t dst_stride,
                                    const uint8_t *src, ptrdiff_t src_stride,
                                    int w, int h)
{
    do {
        memcpy(dst, src, w * sizeof(pixel));

        dst += dst_stride;
        src += src_stride;
    } while (--h);
}
1950
1951
/*
 * Rounded average of src into dst (bi-prediction second pass),
 * processed four pixels per iteration — w must be a multiple of 4.
 * Incoming strides are in bytes and converted to pixel units here.
 * h must be >= 1.
 */
static av_always_inline void avg_c(uint8_t *_dst, ptrdiff_t dst_stride,
                                   const uint8_t *_src, ptrdiff_t src_stride,
                                   int w, int h)
{
    pixel *dst = (pixel *) _dst;
    const pixel *src = (const pixel *) _src;

    dst_stride /= sizeof(pixel);
    src_stride /= sizeof(pixel);
    do {
        int x;

        for (x = 0; x < w; x += 4)
            AV_WN4PA(&dst[x], rnd_avg_pixel4(AV_RN4PA(&dst[x]), AV_RN4P(&src[x])));

        dst += dst_stride;
        src += src_stride;
    } while (--h);
}
1970
1971
/* Full-pel entry points: adapt copy_c/avg_c to the common MC signature
 * (mx/my are accepted but unused at full-pel positions). */
#define fpel_fn(type, sz) \
static void type##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
                         const uint8_t *src, ptrdiff_t src_stride, \
                         int h, int mx, int my) \
{ \
    type##_c(dst, dst_stride, src, src_stride, sz, h); \
}

/* Instantiate both copy and avg for one block width. */
#define copy_avg_fn(sz) \
fpel_fn(copy, sz) \
fpel_fn(avg,  sz)

copy_avg_fn(64)
copy_avg_fn(32)
copy_avg_fn(16)
copy_avg_fn(8)
copy_avg_fn(4)

#undef fpel_fn
#undef copy_avg_fn
1991
1992
#endif /* BIT_DEPTH != 12 */
1993
1994
/* One 8-tap filter output at src[x], taps spaced 'stride' apart
 * (3 before, 4 after). Coefficients F[] are Q7: +64 rounds, >> 7
 * rescales, and the result is clipped to the pixel range. */
#define FILTER_8TAP(src, x, F, stride) \
    av_clip_pixel((F[0] * src[x + -3 * stride] + \
                   F[1] * src[x + -2 * stride] + \
                   F[2] * src[x + -1 * stride] + \
                   F[3] * src[x + +0 * stride] + \
                   F[4] * src[x + +1 * stride] + \
                   F[5] * src[x + +2 * stride] + \
                   F[6] * src[x + +3 * stride] + \
                   F[7] * src[x + +4 * stride] + 64) >> 7)
2003
2004
/*
 * 1-D 8-tap subpel filter over a w x h block.
 * ds is the tap spacing: 1 for horizontal filtering, the pixel stride
 * for vertical (see the wrapper instantiations below). With avg set the
 * filtered value is rounded-averaged into dst instead of stored; avg is
 * a compile-time constant in every instantiation, so the branch folds.
 * h must be >= 1. Strides arrive in bytes and are converted here.
 */
static av_always_inline void do_8tap_1d_c(uint8_t *_dst, ptrdiff_t dst_stride,
                                          const uint8_t *_src, ptrdiff_t src_stride,
                                          int w, int h, ptrdiff_t ds,
                                          const int16_t *filter, int avg)
{
    pixel *dst = (pixel *) _dst;
    const pixel *src = (const pixel *) _src;

    dst_stride /= sizeof(pixel);
    src_stride /= sizeof(pixel);
    do {
        int x;

        for (x = 0; x < w; x++)
            if (avg) {
                dst[x] = (dst[x] + FILTER_8TAP(src, x, filter, ds) + 1) >> 1;
            } else {
                dst[x] = FILTER_8TAP(src, x, filter, ds);
            }

        dst += dst_stride;
        src += src_stride;
    } while (--h);
}
2028
2029
/* Instantiate put/avg x h/v variants of the 1-D 8-tap filter; av_noinline
 * keeps one out-of-line copy per variant instead of inlining everywhere. */
#define filter_8tap_1d_fn(opn, opa, dir, ds) \
static av_noinline void opn##_8tap_1d_##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
                                                const uint8_t *src, ptrdiff_t src_stride, \
                                                int w, int h, const int16_t *filter) \
{ \
    do_8tap_1d_c(dst, dst_stride, src, src_stride, w, h, ds, filter, opa); \
}

filter_8tap_1d_fn(put, 0, v, src_stride / sizeof(pixel))
filter_8tap_1d_fn(put, 0, h, 1)
filter_8tap_1d_fn(avg, 1, v, src_stride / sizeof(pixel))
filter_8tap_1d_fn(avg, 1, h, 1)

#undef filter_8tap_1d_fn
2043
2044
/*
 * Separable 2-D 8-tap subpel filter: horizontal pass into a fixed-pitch
 * temporary, then vertical pass into dst.
 * tmp is 64 pixels wide (max block width) by h + 7 rows: 3 extra rows
 * above and 4 below for the vertical taps, hence src -= src_stride * 3.
 * With avg set the final value is rounded-averaged into dst. h >= 1.
 */
static av_always_inline void do_8tap_2d_c(uint8_t *_dst, ptrdiff_t dst_stride,
                                          const uint8_t *_src, ptrdiff_t src_stride,
                                          int w, int h, const int16_t *filterx,
                                          const int16_t *filtery, int avg)
{
    int tmp_h = h + 7;
    pixel tmp[64 * 71], *tmp_ptr = tmp;
    pixel *dst = (pixel *) _dst;
    const pixel *src = (const pixel *) _src;

    dst_stride /= sizeof(pixel);
    src_stride /= sizeof(pixel);
    src -= src_stride * 3; /* start 3 rows up to feed the vertical taps */
    do {
        int x;

        for (x = 0; x < w; x++)
            tmp_ptr[x] = FILTER_8TAP(src, x, filterx, 1);

        tmp_ptr += 64;
        src += src_stride;
    } while (--tmp_h);

    /* vertical pass starts at the row aligned with the output block */
    tmp_ptr = tmp + 64 * 3;
    do {
        int x;

        for (x = 0; x < w; x++)
            if (avg) {
                dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filtery, 64) + 1) >> 1;
            } else {
                dst[x] = FILTER_8TAP(tmp_ptr, x, filtery, 64);
            }

        tmp_ptr += 64;
        dst += dst_stride;
    } while (--h);
}
2082
2083
/* Instantiate the put/avg variants of the 2-D (hv) 8-tap filter. */
#define filter_8tap_2d_fn(opn, opa) \
static av_noinline void opn##_8tap_2d_hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
                                           const uint8_t *src, ptrdiff_t src_stride, \
                                           int w, int h, const int16_t *filterx, \
                                           const int16_t *filtery) \
{ \
    do_8tap_2d_c(dst, dst_stride, src, src_stride, w, h, filterx, filtery, opa); \
}

filter_8tap_2d_fn(put, 0)
filter_8tap_2d_fn(avg, 1)

#undef filter_8tap_2d_fn
2096
2097
/* Per-(size, filter type, direction) entry points matching the common MC
 * signature. 1-D variants use a fixed subpel position (dir_m picks mx or
 * my at instantiation); 2-D variants index the filter tables by mx/my. */
#define filter_fn_1d(sz, dir, dir_m, type, type_idx, avg) \
static void avg##_8tap_##type##_##sz##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
                                              const uint8_t *src, ptrdiff_t src_stride, \
                                              int h, int mx, int my) \
{ \
    avg##_8tap_1d_##dir##_c(dst, dst_stride, src, src_stride, sz, h, \
                            ff_vp9_subpel_filters[type_idx][dir_m]); \
}

#define filter_fn_2d(sz, type, type_idx, avg) \
static void avg##_8tap_##type##_##sz##hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
                                           const uint8_t *src, ptrdiff_t src_stride, \
                                           int h, int mx, int my) \
{ \
    avg##_8tap_2d_hv_c(dst, dst_stride, src, src_stride, sz, h, \
                       ff_vp9_subpel_filters[type_idx][mx], \
                       ff_vp9_subpel_filters[type_idx][my]); \
}
2115
2116
#if BIT_DEPTH != 12
2117
2118
/* Linear interpolation between src[x] and src[x + stride]; mxy is the
 * subpel fraction in 1/16ths (+8 rounds before the >> 4). */
#define FILTER_BILIN(src, x, mxy, stride) \
    (src[x] + ((mxy * (src[x + stride] - src[x]) + 8) >> 4))
2120
2121
/*
 * 1-D bilinear subpel filter over a w x h block.
 * ds is the second-tap offset (1 horizontal, pixel stride vertical) and
 * mxy the subpel fraction in 1/16ths. With avg set the result is
 * rounded-averaged into dst. h must be >= 1; strides arrive in bytes.
 */
static av_always_inline void do_bilin_1d_c(uint8_t *_dst, ptrdiff_t dst_stride,
                                           const uint8_t *_src, ptrdiff_t src_stride,
                                           int w, int h, ptrdiff_t ds, int mxy, int avg)
{
    pixel *dst = (pixel *) _dst;
    const pixel *src = (const pixel *) _src;

    dst_stride /= sizeof(pixel);
    src_stride /= sizeof(pixel);
    do {
        int x;

        for (x = 0; x < w; x++)
            if (avg) {
                dst[x] = (dst[x] + FILTER_BILIN(src, x, mxy, ds) + 1) >> 1;
            } else {
                dst[x] = FILTER_BILIN(src, x, mxy, ds);
            }

        dst += dst_stride;
        src += src_stride;
    } while (--h);
}
2144
2145
/* Instantiate put/avg x h/v variants of the 1-D bilinear filter. */
#define bilin_1d_fn(opn, opa, dir, ds) \
static av_noinline void opn##_bilin_1d_##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
                                                 const uint8_t *src, ptrdiff_t src_stride, \
                                                 int w, int h, int mxy) \
{ \
    do_bilin_1d_c(dst, dst_stride, src, src_stride, w, h, ds, mxy, opa); \
}

bilin_1d_fn(put, 0, v, src_stride / sizeof(pixel))
bilin_1d_fn(put, 0, h, 1)
bilin_1d_fn(avg, 1, v, src_stride / sizeof(pixel))
bilin_1d_fn(avg, 1, h, 1)

#undef bilin_1d_fn
2159
2160
/*
 * Separable 2-D bilinear subpel filter: horizontal pass into a 64-wide
 * temporary of h + 1 rows (one extra row for the vertical second tap),
 * then vertical pass into dst. mx/my are subpel fractions in 1/16ths.
 * With avg set the result is rounded-averaged into dst. h must be >= 1.
 */
static av_always_inline void do_bilin_2d_c(uint8_t *_dst, ptrdiff_t dst_stride,
                                           const uint8_t *_src, ptrdiff_t src_stride,
                                           int w, int h, int mx, int my, int avg)
{
    pixel tmp[64 * 65], *tmp_ptr = tmp;
    int tmp_h = h + 1;
    pixel *dst = (pixel *) _dst;
    const pixel *src = (const pixel *) _src;

    dst_stride /= sizeof(pixel);
    src_stride /= sizeof(pixel);
    do {
        int x;

        for (x = 0; x < w; x++)
            tmp_ptr[x] = FILTER_BILIN(src, x, mx, 1);

        tmp_ptr += 64;
        src += src_stride;
    } while (--tmp_h);

    tmp_ptr = tmp;
    do {
        int x;

        for (x = 0; x < w; x++)
            if (avg) {
                dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1;
            } else {
                dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64);
            }

        tmp_ptr += 64;
        dst += dst_stride;
    } while (--h);
}
2196
2197
// Non-inlined put/avg wrappers around the 2-D bilinear kernel.
#define bilin_2d_fn(opn, opa) \
2198
static av_noinline void opn##_bilin_2d_hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
2199
                                            const uint8_t *src, ptrdiff_t src_stride, \
2200
                                            int w, int h, int mx, int my) \
2201
{ \
2202
    do_bilin_2d_c(dst, dst_stride, src, src_stride, w, h, mx, my, opa); \
2203
}
2204
2205
8271
bilin_2d_fn(put, 0)
2206
30
bilin_2d_fn(avg, 1)
2207
2208
#undef bilin_2d_fn
2210
// Per-size bilinear entry points with the common MC signature
// (dst, dst_stride, src, src_stride, h, mx, my); sz and the filter
// direction are baked into the generated function name.
#define bilinf_fn_1d(sz, dir, dir_m, avg) \
2211
static void avg##_bilin_##sz##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
2212
                                      const uint8_t *src, ptrdiff_t src_stride, \
2213
                                      int h, int mx, int my) \
2214
{ \
2215
    avg##_bilin_1d_##dir##_c(dst, dst_stride, src, src_stride, sz, h, dir_m); \
2216
}
2217
2218
#define bilinf_fn_2d(sz, avg) \
2219
static void avg##_bilin_##sz##hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
2220
                                   const uint8_t *src, ptrdiff_t src_stride, \
2221
                                   int h, int mx, int my) \
2222
{ \
2223
    avg##_bilin_2d_hv_c(dst, dst_stride, src, src_stride, sz, h, mx, my); \
2224
}
2225
2226
#else
2227
2228
// 12-bit build: bilinear C functions are not generated here (the 10-bit
// versions are reused at init time), so these expand to nothing.
#define bilinf_fn_1d(a, b, c, d)
2229
#define bilinf_fn_2d(a, b)
2230
2231
#endif
2232
2233
// For one block size, generate every subpel variant: h/v/hv for each of the
// three 8-tap filter types, plus the bilinear h/v/hv entries (no-ops on the
// 12-bit build, see bilinf_fn_* above).
#define filter_fn(sz, avg) \
2234
filter_fn_1d(sz, h, mx, regular, FILTER_8TAP_REGULAR, avg) \
2235
filter_fn_1d(sz, v, my, regular, FILTER_8TAP_REGULAR, avg) \
2236
filter_fn_2d(sz,        regular, FILTER_8TAP_REGULAR, avg) \
2237
filter_fn_1d(sz, h, mx, smooth,  FILTER_8TAP_SMOOTH,  avg) \
2238
filter_fn_1d(sz, v, my, smooth,  FILTER_8TAP_SMOOTH,  avg) \
2239
filter_fn_2d(sz,        smooth,  FILTER_8TAP_SMOOTH,  avg) \
2240
filter_fn_1d(sz, h, mx, sharp,   FILTER_8TAP_SHARP,   avg) \
2241
filter_fn_1d(sz, v, my, sharp,   FILTER_8TAP_SHARP,   avg) \
2242
filter_fn_2d(sz,        sharp,   FILTER_8TAP_SHARP,   avg) \
2243
bilinf_fn_1d(sz, h, mx,                               avg) \
2244
bilinf_fn_1d(sz, v, my,                               avg) \
2245
bilinf_fn_2d(sz,                                      avg)
2246
2247
// ... and for all five VP9 block sizes (64 down to 4).
#define filter_fn_set(avg) \
2248
filter_fn(64, avg) \
2249
filter_fn(32, avg) \
2250
filter_fn(16, avg) \
2251
filter_fn(8,  avg) \
2252
filter_fn(4,  avg)
2253
2254
3027492
filter_fn_set(put)
2255
77449
filter_fn_set(avg)
2256
2257
#undef filter_fn
2258
#undef filter_fn_set
2259
#undef filter_fn_1d
2260
#undef filter_fn_2d
2261
#undef bilinf_fn_1d
2262
#undef bilinf_fn_2d
2263
2264
#if BIT_DEPTH != 8
2265
void ff_vp9dsp_mc_init_10(VP9DSPContext *dsp);
2266
#endif
2267
#if BIT_DEPTH != 10
2268
// Only the 10-bit instantiation is exported (the 12-bit init calls it);
// the 8- and 12-bit versions stay file-local.
static
2269
#endif
2270
735
// Fill dsp->mc[size_idx][filter][put/avg][subpel_h][subpel_v] with the C
// motion-compensation functions generated above.  size_idx runs 0..4 for
// 64..4; index [0][0] is the full-pel (copy/avg) case.
av_cold void FUNC(ff_vp9dsp_mc_init)(VP9DSPContext *dsp)
2271
{
2272
#if BIT_DEPTH == 12
2273
74
    // 12 bit reuses the 10-bit full-pel copy/avg and bilinear entries,
    // then only overrides the bit-depth-aware 8-tap pointers below.
    ff_vp9dsp_mc_init_10(dsp);
2274
#else /* BIT_DEPTH == 12 */
2275
2276
// Full-pel is filter-independent: the same copy/avg goes into all four
// filter slots.
#define init_fpel(idx1, idx2, sz, type) \
2277
    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = type##sz##_c; \
2278
    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = type##sz##_c; \
2279
    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = type##sz##_c; \
2280
    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = type##sz##_c
2281
2282
#define init_copy_avg(idx, sz) \
2283
    init_fpel(idx, 0, sz, copy); \
2284
    init_fpel(idx, 1, sz, avg)
2285
2286
661
    init_copy_avg(0, 64);
2287
661
    init_copy_avg(1, 32);
2288
661
    init_copy_avg(2, 16);
2289
661
    init_copy_avg(3,  8);
2290
661
    init_copy_avg(4,  4);
2291
2292
#undef init_copy_avg
2293
#undef init_fpel
2294
2295
#endif /* BIT_DEPTH == 12 */
2296
2297
// The three 8-tap filters exist for every bit depth ...
#define init_subpel1_bd_aware(idx1, idx2, idxh, idxv, sz, dir, type) \
2298
    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_c; \
2299
    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_c; \
2300
    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] = type##_8tap_sharp_##sz##dir##_c
2301
2302
#if BIT_DEPTH == 12
2303
#define init_subpel1 init_subpel1_bd_aware
2304
#else
2305
// ... while the bilinear subpel functions are only generated for 8/10 bit.
#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type) \
2306
    init_subpel1_bd_aware(idx1, idx2, idxh, idxv, sz, dir, type); \
2307
    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][idxh][idxv] = type##_bilin_##sz##dir##_c
2308
#endif
2309
2310
#define init_subpel2(idx, idxh, idxv, dir, type) \
2311
    init_subpel1(0, idx, idxh, idxv, 64, dir, type); \
2312
    init_subpel1(1, idx, idxh, idxv, 32, dir, type); \
2313
    init_subpel1(2, idx, idxh, idxv, 16, dir, type); \
2314
    init_subpel1(3, idx, idxh, idxv,  8, dir, type); \
2315
    init_subpel1(4, idx, idxh, idxv,  4, dir, type)
2316
2317
#define init_subpel3(idx, type) \
2318
    init_subpel2(idx, 1, 1, hv, type); \
2319
    init_subpel2(idx, 0, 1, v, type); \
2320
    init_subpel2(idx, 1, 0, h, type)
2321
2322
735
    init_subpel3(0, put);
2323
735
    init_subpel3(1, avg);
2324
2325
#undef init_subpel1
2326
#undef init_subpel2
2327
#undef init_subpel3
2328
#undef init_subpel1_bd_aware
2329
735
}
2330
2331
2067
// 8-tap motion compensation with scaling (reference frame resampling).
// mx/my are the starting 4-bit subpel phases; dx/dy are the per-pixel
// phase steps.  A horizontal pass resamples into tmp, then a vertical
// pass resamples tmp into dst.  avg (compile-time constant) selects
// round-averaging into dst.
static av_always_inline void do_scaled_8tap_c(uint8_t *_dst, ptrdiff_t dst_stride,
2332
                                              const uint8_t *_src, ptrdiff_t src_stride,
2333
                                              int w, int h, int mx, int my,
2334
                                              int dx, int dy, int avg,
2335
                                              const int16_t (*filters)[8])
2336
{
2337
2067
    // Rows needed by the vertical pass: span of the scaled walk plus the
    // 8-tap footprint (3 above + 4 below + 1).
    int tmp_h = (((h - 1) * dy + my) >> 4) + 8;
2338
2067
    // 64-pixel-wide rows; 135 rows covers the worst-case tmp_h for the
    // scaling ratios this is used with.
    pixel tmp[64 * 135], *tmp_ptr = tmp;
2339
2067
    pixel *dst = (pixel *) _dst;
2340
2067
    const pixel *src = (const pixel *) _src;
2341
2342
2067
    // Byte strides -> pixel strides.
    dst_stride /= sizeof(pixel);
2343
2067
    src_stride /= sizeof(pixel);
2344
2067
    // Back up 3 rows so the vertical 8-tap has its top taps available.
    src -= src_stride * 3;
2345
    // Horizontal pass: per output column, step the phase imx by dx and
    // carry the integer overflow into the source offset ioff.
    do {
2346
        int x;
2347
30118
        int imx = mx, ioff = 0;
2348
2349
365566
        for (x = 0; x < w; x++) {
2350
335448
            tmp_ptr[x] = FILTER_8TAP(src, ioff, filters[imx], 1);
2351
335448
            imx += dx;
2352
335448
            ioff += imx >> 4;
2353
335448
            imx &= 0xf;
2354
        }
2355
2356
30118
        tmp_ptr += 64;
2357
30118
        src += src_stride;
2358
30118
    } while (--tmp_h);
2359
2360
2067
    // Vertical pass: start past the 3 rows of top margin; advance my by dy
    // per output row, carrying whole rows into the tmp pointer.
    tmp_ptr = tmp + 64 * 3;
2361
    do {
2362
        int x;
2363
17088
        const int16_t *filter = filters[my];
2364
2365
241152
        for (x = 0; x < w; x++)
2366
224064
            if (avg) {
2367
                dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filter, 64) + 1) >> 1;
2368
            } else {
2369
224064
                dst[x] = FILTER_8TAP(tmp_ptr, x, filter, 64);
2370
            }
2371
2372
17088
        my += dy;
2373
17088
        tmp_ptr += (my >> 4) * 64;
2374
17088
        my &= 0xf;
2375
17088
        dst += dst_stride;
2376
17088
    } while (--h);
2377
2067
}
2378
2379
// Non-inlined put/avg wrappers around the scaled 8-tap kernel.
#define scaled_filter_8tap_fn(opn, opa) \
2380
static av_noinline void opn##_scaled_8tap_c(uint8_t *dst, ptrdiff_t dst_stride, \
2381
                                            const uint8_t *src, ptrdiff_t src_stride, \
2382
                                            int w, int h, int mx, int my, int dx, int dy, \
2383
                                            const int16_t (*filters)[8]) \
2384
{ \
2385
    do_scaled_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, \
2386
                     opa, filters); \
2387
}
2388
2389
2067
scaled_filter_8tap_fn(put, 0)
2390
scaled_filter_8tap_fn(avg, 1)
2391
2392
#undef scaled_filter_8tap_fn
2393
2394
#undef FILTER_8TAP
2395
2396
// Per-size, per-filter-type scaled MC entry points: bind the block width
// sz and the subpel filter bank selected by type_idx.
#define scaled_filter_fn(sz, type, type_idx, avg) \
2397
static void avg##_scaled_##type##_##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
2398
                                           const uint8_t *src, ptrdiff_t src_stride, \
2399
                                           int h, int mx, int my, int dx, int dy) \
2400
{ \
2401
    avg##_scaled_8tap_c(dst, dst_stride, src, src_stride, sz, h, mx, my, dx, dy, \
2402
                        ff_vp9_subpel_filters[type_idx]); \
2403
}
2404
2405
#if BIT_DEPTH != 12
2406
2407
// Bilinear motion compensation with scaling: same two-pass structure as
// do_scaled_8tap_c, but with the 2-tap FILTER_BILIN, so only 1 extra row
// of vertical footprint is needed instead of 7.  Only built for 8/10 bit.
// NOTE(review): this path shows no coverage counts in this report —
// apparently unexercised by the test suite that produced it.
static av_always_inline void do_scaled_bilin_c(uint8_t *_dst, ptrdiff_t dst_stride,
2408
                                               const uint8_t *_src, ptrdiff_t src_stride,
2409
                                               int w, int h, int mx, int my,
2410
                                               int dx, int dy, int avg)
2411
{
2412
    // 64-pixel-wide rows; 129 rows covers the worst-case tmp_h below.
    pixel tmp[64 * 129], *tmp_ptr = tmp;
2413
    // Rows needed: span of the scaled walk plus the 2-tap footprint.
    int tmp_h = (((h - 1) * dy + my) >> 4) + 2;
2414
    pixel *dst = (pixel *) _dst;
2415
    const pixel *src = (const pixel *) _src;
2416
2417
    // Byte strides -> pixel strides.
    dst_stride /= sizeof(pixel);
2418
    src_stride /= sizeof(pixel);
2419
    // Horizontal pass with per-column phase stepping (imx += dx, integer
    // carry into ioff).
    do {
2420
        int x;
2421
        int imx = mx, ioff = 0;
2422
2423
        for (x = 0; x < w; x++) {
2424
            tmp_ptr[x] = FILTER_BILIN(src, ioff, imx, 1);
2425
            imx += dx;
2426
            ioff += imx >> 4;
2427
            imx &= 0xf;
2428
        }
2429
2430
        tmp_ptr += 64;
2431
        src += src_stride;
2432
    } while (--tmp_h);
2433
2434
    // Vertical pass: my steps by dy per output row, whole-row carries
    // advance tmp_ptr.
    tmp_ptr = tmp;
2435
    do {
2436
        int x;
2437
2438
        for (x = 0; x < w; x++)
2439
            if (avg) {
2440
                dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1;
2441
            } else {
2442
                dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64);
2443
            }
2444
2445
        my += dy;
2446
        tmp_ptr += (my >> 4) * 64;
2447
        my &= 0xf;
2448
        dst += dst_stride;
2449
    } while (--h);
2450
}
2451
2452
// Non-inlined put/avg wrappers around the scaled bilinear kernel.
#define scaled_bilin_fn(opn, opa) \
2453
static av_noinline void opn##_scaled_bilin_c(uint8_t *dst, ptrdiff_t dst_stride, \
2454
                                             const uint8_t *src, ptrdiff_t src_stride, \
2455
                                             int w, int h, int mx, int my, int dx, int dy) \
2456
{ \
2457
    do_scaled_bilin_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, opa); \
2458
}
2459
2460
scaled_bilin_fn(put, 0)
2461
scaled_bilin_fn(avg, 1)
2462
2463
#undef scaled_bilin_fn
2464
2465
#undef FILTER_BILIN
2467
// Per-size scaled bilinear entry points (8/10 bit only).
#define scaled_bilinf_fn(sz, avg) \
2468
static void avg##_scaled_bilin_##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
2469
                                        const uint8_t *src, ptrdiff_t src_stride, \
2470
                                        int h, int mx, int my, int dx, int dy) \
2471
{ \
2472
    avg##_scaled_bilin_c(dst, dst_stride, src, src_stride, sz, h, mx, my, dx, dy); \
2473
}
2474
2475
#else
2476
2477
// 12-bit build: no bilinear functions generated here; the 10-bit ones are
// reused at init time.
#define scaled_bilinf_fn(a, b)
2478
2479
#endif
2480
2481
// For one block size: the three 8-tap filter types plus bilinear
// (a no-op expansion on the 12-bit build).
#define scaled_filter_fns(sz, avg) \
2482
scaled_filter_fn(sz,        regular, FILTER_8TAP_REGULAR, avg) \
2483
scaled_filter_fn(sz,        smooth,  FILTER_8TAP_SMOOTH,  avg) \
2484
scaled_filter_fn(sz,        sharp,   FILTER_8TAP_SHARP,   avg) \
2485
scaled_bilinf_fn(sz,                                      avg)
2486
2487
// ... for all five block sizes.
#define scaled_filter_fn_set(avg) \
2488
scaled_filter_fns(64, avg) \
2489
scaled_filter_fns(32, avg) \
2490
scaled_filter_fns(16, avg) \
2491
scaled_filter_fns(8,  avg) \
2492
scaled_filter_fns(4,  avg)
2493
2494
4134
scaled_filter_fn_set(put)
2495
scaled_filter_fn_set(avg)
2496
2497
#undef scaled_filter_fns
2498
#undef scaled_filter_fn_set
2499
#undef scaled_filter_fn
2500
#undef scaled_bilinf_fn
2502
#if BIT_DEPTH != 8
2503
void ff_vp9dsp_scaled_mc_init_10(VP9DSPContext *dsp);
2504
#endif
2505
#if BIT_DEPTH != 10
2506
// Only the 10-bit instantiation is exported (the 12-bit init calls it).
static
2507
#endif
2508
735
// Fill dsp->smc[size_idx][filter][put/avg] with the scaled-MC C functions.
av_cold void FUNC(ff_vp9dsp_scaled_mc_init)(VP9DSPContext *dsp)
2509
{
2510
#define init_scaled_bd_aware(idx1, idx2, sz, type) \
2511
    dsp->smc[idx1][FILTER_8TAP_SMOOTH ][idx2] = type##_scaled_smooth_##sz##_c; \
2512
    dsp->smc[idx1][FILTER_8TAP_REGULAR][idx2] = type##_scaled_regular_##sz##_c; \
2513
    dsp->smc[idx1][FILTER_8TAP_SHARP  ][idx2] = type##_scaled_sharp_##sz##_c
2514
2515
#if BIT_DEPTH == 12
2516
74
    // 12 bit reuses the 10-bit bilinear entries, then overrides the
    // bit-depth-aware 8-tap pointers below.
    ff_vp9dsp_scaled_mc_init_10(dsp);
2517
#define init_scaled(a,b,c,d) init_scaled_bd_aware(a,b,c,d)
2518
#else
2519
#define init_scaled(idx1, idx2, sz, type) \
2520
    init_scaled_bd_aware(idx1, idx2, sz, type); \
2521
    dsp->smc[idx1][FILTER_BILINEAR    ][idx2] = type##_scaled_bilin_##sz##_c
2522
#endif
2523
2524
#define init_scaled_put_avg(idx, sz) \
2525
    init_scaled(idx, 0, sz, put); \
2526
    init_scaled(idx, 1, sz, avg)
2527
2528
735
    init_scaled_put_avg(0, 64);
2529
735
    init_scaled_put_avg(1, 32);
2530
735
    init_scaled_put_avg(2, 16);
2531
735
    init_scaled_put_avg(3,  8);
2532
735
    init_scaled_put_avg(4,  4);
2533
2534
#undef init_scaled_put_avg
2535
#undef init_scaled
2536
#undef init_scaled_bd_aware
2537
735
}
2538
2539
661
// Bit-depth-specific DSP init entry point: installs the intra prediction,
// inverse transform, loop filter, motion compensation and scaled motion
// compensation function tables for this template's BIT_DEPTH.
av_cold void FUNC(ff_vp9dsp_init)(VP9DSPContext *dsp)
2540
{
2541
661
    FUNC(ff_vp9dsp_intrapred_init)(dsp);
2542
661
    vp9dsp_itxfm_init(dsp);
2543
661
    vp9dsp_loopfilter_init(dsp);
2544
661
    FUNC(ff_vp9dsp_mc_init)(dsp);
2545
661
    FUNC(ff_vp9dsp_scaled_mc_init)(dsp);
2546
661
}