GCC Code Coverage Report
Directory: ../../../ffmpeg/ Exec Total Coverage
File: src/libavcodec/h264pred_template.c Lines: 760 822 92.5 %
Date: 2020-07-13 04:22:34 Branches: 162 184 88.0 %

Line Branch Exec Source
1
/*
2
 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
3
 * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
4
 *
5
 * This file is part of FFmpeg.
6
 *
7
 * FFmpeg is free software; you can redistribute it and/or
8
 * modify it under the terms of the GNU Lesser General Public
9
 * License as published by the Free Software Foundation; either
10
 * version 2.1 of the License, or (at your option) any later version.
11
 *
12
 * FFmpeg is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
 * Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public
18
 * License along with FFmpeg; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
 */
21
22
/**
23
 * @file
24
 * H.264 / AVC / MPEG-4 part10 prediction functions.
25
 * @author Michael Niedermayer <michaelni@gmx.at>
26
 */
27
28
#include "libavutil/intreadwrite.h"
29
30
#include "mathops.h"
31
32
#include "bit_depth_template.c"
33
34
8307618
static void FUNCC(pred4x4_vertical)(uint8_t *_src, const uint8_t *topright,
35
                                    ptrdiff_t _stride)
36
{
37
8307618
    pixel *src = (pixel*)_src;
38
8307618
    int stride = _stride>>(sizeof(pixel)-1);
39
8307618
    const pixel4 a= AV_RN4PA(src-stride);
40
41
8307618
    AV_WN4PA(src+0*stride, a);
42
8307618
    AV_WN4PA(src+1*stride, a);
43
8307618
    AV_WN4PA(src+2*stride, a);
44
8307618
    AV_WN4PA(src+3*stride, a);
45
}
46
47
13576512
static void FUNCC(pred4x4_horizontal)(uint8_t *_src, const uint8_t *topright,
48
                                      ptrdiff_t _stride)
49
{
50
13576512
    pixel *src = (pixel*)_src;
51
13576512
    int stride = _stride>>(sizeof(pixel)-1);
52
13576512
    AV_WN4PA(src+0*stride, PIXEL_SPLAT_X4(src[-1+0*stride]));
53
13576512
    AV_WN4PA(src+1*stride, PIXEL_SPLAT_X4(src[-1+1*stride]));
54
13576512
    AV_WN4PA(src+2*stride, PIXEL_SPLAT_X4(src[-1+2*stride]));
55
13576512
    AV_WN4PA(src+3*stride, PIXEL_SPLAT_X4(src[-1+3*stride]));
56
}
57
58
5900690
static void FUNCC(pred4x4_dc)(uint8_t *_src, const uint8_t *topright,
59
                              ptrdiff_t _stride)
60
{
61
5900690
    pixel *src = (pixel*)_src;
62
5900690
    int stride = _stride>>(sizeof(pixel)-1);
63
5900690
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
64
5900690
                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
65
5900690
    const pixel4 a = PIXEL_SPLAT_X4(dc);
66
67
5900690
    AV_WN4PA(src+0*stride, a);
68
5900690
    AV_WN4PA(src+1*stride, a);
69
5900690
    AV_WN4PA(src+2*stride, a);
70
5900690
    AV_WN4PA(src+3*stride, a);
71
}
72
73
582650
static void FUNCC(pred4x4_left_dc)(uint8_t *_src, const uint8_t *topright,
74
                                   ptrdiff_t _stride)
75
{
76
582650
    pixel *src = (pixel*)_src;
77
582650
    int stride = _stride>>(sizeof(pixel)-1);
78
582650
    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
79
582650
    const pixel4 a = PIXEL_SPLAT_X4(dc);
80
81
582650
    AV_WN4PA(src+0*stride, a);
82
582650
    AV_WN4PA(src+1*stride, a);
83
582650
    AV_WN4PA(src+2*stride, a);
84
582650
    AV_WN4PA(src+3*stride, a);
85
}
86
87
116342
static void FUNCC(pred4x4_top_dc)(uint8_t *_src, const uint8_t *topright,
88
                                  ptrdiff_t _stride)
89
{
90
116342
    pixel *src = (pixel*)_src;
91
116342
    int stride = _stride>>(sizeof(pixel)-1);
92
116342
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
93
116342
    const pixel4 a = PIXEL_SPLAT_X4(dc);
94
95
116342
    AV_WN4PA(src+0*stride, a);
96
116342
    AV_WN4PA(src+1*stride, a);
97
116342
    AV_WN4PA(src+2*stride, a);
98
116342
    AV_WN4PA(src+3*stride, a);
99
}
100
101
14710
static void FUNCC(pred4x4_128_dc)(uint8_t *_src, const uint8_t *topright,
102
                                  ptrdiff_t _stride)
103
{
104
14710
    pixel *src = (pixel*)_src;
105
14710
    int stride = _stride>>(sizeof(pixel)-1);
106
14710
    const pixel4 a = PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1));
107
108
14710
    AV_WN4PA(src+0*stride, a);
109
14710
    AV_WN4PA(src+1*stride, a);
110
14710
    AV_WN4PA(src+2*stride, a);
111
14710
    AV_WN4PA(src+3*stride, a);
112
}
113
114
52
static void FUNCC(pred4x4_127_dc)(uint8_t *_src, const uint8_t *topright,
115
                                  ptrdiff_t _stride)
116
{
117
52
    pixel *src = (pixel*)_src;
118
52
    int stride = _stride>>(sizeof(pixel)-1);
119
52
    const pixel4 a = PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1);
120
121
52
    AV_WN4PA(src+0*stride, a);
122
52
    AV_WN4PA(src+1*stride, a);
123
52
    AV_WN4PA(src+2*stride, a);
124
52
    AV_WN4PA(src+3*stride, a);
125
}
126
127
94
static void FUNCC(pred4x4_129_dc)(uint8_t *_src, const uint8_t *topright,
128
                                  ptrdiff_t _stride)
129
{
130
94
    pixel *src = (pixel*)_src;
131
94
    int stride = _stride>>(sizeof(pixel)-1);
132
94
    const pixel4 a = PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1);
133
134
94
    AV_WN4PA(src+0*stride, a);
135
94
    AV_WN4PA(src+1*stride, a);
136
94
    AV_WN4PA(src+2*stride, a);
137
94
    AV_WN4PA(src+3*stride, a);
138
}
139
140
141
#define LOAD_TOP_RIGHT_EDGE\
142
    const unsigned av_unused t4 = topright[0];\
143
    const unsigned av_unused t5 = topright[1];\
144
    const unsigned av_unused t6 = topright[2];\
145
    const unsigned av_unused t7 = topright[3];\
146
147
#define LOAD_DOWN_LEFT_EDGE\
148
    const unsigned av_unused l4 = src[-1+4*stride];\
149
    const unsigned av_unused l5 = src[-1+5*stride];\
150
    const unsigned av_unused l6 = src[-1+6*stride];\
151
    const unsigned av_unused l7 = src[-1+7*stride];\
152
153
#define LOAD_LEFT_EDGE\
154
    const unsigned av_unused l0 = src[-1+0*stride];\
155
    const unsigned av_unused l1 = src[-1+1*stride];\
156
    const unsigned av_unused l2 = src[-1+2*stride];\
157
    const unsigned av_unused l3 = src[-1+3*stride];\
158
159
#define LOAD_TOP_EDGE\
160
    const unsigned av_unused t0 = src[ 0-1*stride];\
161
    const unsigned av_unused t1 = src[ 1-1*stride];\
162
    const unsigned av_unused t2 = src[ 2-1*stride];\
163
    const unsigned av_unused t3 = src[ 3-1*stride];\
164
165
3771408
static void FUNCC(pred4x4_down_right)(uint8_t *_src, const uint8_t *topright,
166
                                      ptrdiff_t _stride)
167
{
168
3771408
    pixel *src = (pixel*)_src;
169
3771408
    int stride = _stride>>(sizeof(pixel)-1);
170
3771408
    const int lt= src[-1-1*stride];
171
3771408
    LOAD_TOP_EDGE
172
3771408
    LOAD_LEFT_EDGE
173
174
3771408
    src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
175
3771408
    src[0+2*stride]=
176
3771408
    src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
177
3771408
    src[0+1*stride]=
178
3771408
    src[1+2*stride]=
179
3771408
    src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
180
3771408
    src[0+0*stride]=
181
3771408
    src[1+1*stride]=
182
3771408
    src[2+2*stride]=
183
3771408
    src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
184
3771408
    src[1+0*stride]=
185
3771408
    src[2+1*stride]=
186
3771408
    src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
187
3771408
    src[2+0*stride]=
188
3771408
    src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
189
3771408
    src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
190
}
191
192
2247578
static void FUNCC(pred4x4_down_left)(uint8_t *_src, const uint8_t *_topright,
193
                                     ptrdiff_t _stride)
194
{
195
2247578
    pixel *src = (pixel*)_src;
196
2247578
    const pixel *topright = (const pixel*)_topright;
197
2247578
    int stride = _stride>>(sizeof(pixel)-1);
198
2247578
    LOAD_TOP_EDGE
199
2247578
    LOAD_TOP_RIGHT_EDGE
200
//    LOAD_LEFT_EDGE
201
202
2247578
    src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
203
2247578
    src[1+0*stride]=
204
2247578
    src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
205
2247578
    src[2+0*stride]=
206
2247578
    src[1+1*stride]=
207
2247578
    src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
208
2247578
    src[3+0*stride]=
209
2247578
    src[2+1*stride]=
210
2247578
    src[1+2*stride]=
211
2247578
    src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
212
2247578
    src[3+1*stride]=
213
2247578
    src[2+2*stride]=
214
2247578
    src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
215
2247578
    src[3+2*stride]=
216
2247578
    src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
217
2247578
    src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
218
}
219
220
2589450
static void FUNCC(pred4x4_vertical_right)(uint8_t *_src,
221
                                          const uint8_t *topright,
222
                                          ptrdiff_t _stride)
223
{
224
2589450
    pixel *src = (pixel*)_src;
225
2589450
    int stride = _stride>>(sizeof(pixel)-1);
226
2589450
    const int lt= src[-1-1*stride];
227
2589450
    LOAD_TOP_EDGE
228
2589450
    LOAD_LEFT_EDGE
229
230
2589450
    src[0+0*stride]=
231
2589450
    src[1+2*stride]=(lt + t0 + 1)>>1;
232
2589450
    src[1+0*stride]=
233
2589450
    src[2+2*stride]=(t0 + t1 + 1)>>1;
234
2589450
    src[2+0*stride]=
235
2589450
    src[3+2*stride]=(t1 + t2 + 1)>>1;
236
2589450
    src[3+0*stride]=(t2 + t3 + 1)>>1;
237
2589450
    src[0+1*stride]=
238
2589450
    src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
239
2589450
    src[1+1*stride]=
240
2589450
    src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
241
2589450
    src[2+1*stride]=
242
2589450
    src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
243
2589450
    src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
244
2589450
    src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
245
2589450
    src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
246
}
247
248
2129380
static void FUNCC(pred4x4_vertical_left)(uint8_t *_src,
249
                                         const uint8_t *_topright,
250
                                         ptrdiff_t _stride)
251
{
252
2129380
    pixel *src = (pixel*)_src;
253
2129380
    const pixel *topright = (const pixel*)_topright;
254
2129380
    int stride = _stride>>(sizeof(pixel)-1);
255
2129380
    LOAD_TOP_EDGE
256
2129380
    LOAD_TOP_RIGHT_EDGE
257
258
2129380
    src[0+0*stride]=(t0 + t1 + 1)>>1;
259
2129380
    src[1+0*stride]=
260
2129380
    src[0+2*stride]=(t1 + t2 + 1)>>1;
261
2129380
    src[2+0*stride]=
262
2129380
    src[1+2*stride]=(t2 + t3 + 1)>>1;
263
2129380
    src[3+0*stride]=
264
2129380
    src[2+2*stride]=(t3 + t4+ 1)>>1;
265
2129380
    src[3+2*stride]=(t4 + t5+ 1)>>1;
266
2129380
    src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
267
2129380
    src[1+1*stride]=
268
2129380
    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
269
2129380
    src[2+1*stride]=
270
2129380
    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
271
2129380
    src[3+1*stride]=
272
2129380
    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
273
2129380
    src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
274
}
275
276
3970960
static void FUNCC(pred4x4_horizontal_up)(uint8_t *_src, const uint8_t *topright,
277
                                         ptrdiff_t _stride)
278
{
279
3970960
    pixel *src = (pixel*)_src;
280
3970960
    int stride = _stride>>(sizeof(pixel)-1);
281
3970960
    LOAD_LEFT_EDGE
282
283
3970960
    src[0+0*stride]=(l0 + l1 + 1)>>1;
284
3970960
    src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
285
3970960
    src[2+0*stride]=
286
3970960
    src[0+1*stride]=(l1 + l2 + 1)>>1;
287
3970960
    src[3+0*stride]=
288
3970960
    src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
289
3970960
    src[2+1*stride]=
290
3970960
    src[0+2*stride]=(l2 + l3 + 1)>>1;
291
3970960
    src[3+1*stride]=
292
3970960
    src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
293
3970960
    src[3+2*stride]=
294
3970960
    src[1+3*stride]=
295
3970960
    src[0+3*stride]=
296
3970960
    src[2+2*stride]=
297
3970960
    src[2+3*stride]=
298
3970960
    src[3+3*stride]=l3;
299
}
300
301
4284614
static void FUNCC(pred4x4_horizontal_down)(uint8_t *_src,
302
                                           const uint8_t *topright,
303
                                           ptrdiff_t _stride)
304
{
305
4284614
    pixel *src = (pixel*)_src;
306
4284614
    int stride = _stride>>(sizeof(pixel)-1);
307
4284614
    const int lt= src[-1-1*stride];
308
4284614
    LOAD_TOP_EDGE
309
4284614
    LOAD_LEFT_EDGE
310
311
4284614
    src[0+0*stride]=
312
4284614
    src[2+1*stride]=(lt + l0 + 1)>>1;
313
4284614
    src[1+0*stride]=
314
4284614
    src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
315
4284614
    src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
316
4284614
    src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
317
4284614
    src[0+1*stride]=
318
4284614
    src[2+2*stride]=(l0 + l1 + 1)>>1;
319
4284614
    src[1+1*stride]=
320
4284614
    src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
321
4284614
    src[0+2*stride]=
322
4284614
    src[2+3*stride]=(l1 + l2+ 1)>>1;
323
4284614
    src[1+2*stride]=
324
4284614
    src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
325
4284614
    src[0+3*stride]=(l2 + l3 + 1)>>1;
326
4284614
    src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
327
}
328
329
439730
static void FUNCC(pred16x16_vertical)(uint8_t *_src, ptrdiff_t _stride)
330
{
331
    int i;
332
439730
    pixel *src = (pixel*)_src;
333
439730
    int stride = _stride>>(sizeof(pixel)-1);
334
439730
    const pixel4 a = AV_RN4PA(((pixel4*)(src-stride))+0);
335
439730
    const pixel4 b = AV_RN4PA(((pixel4*)(src-stride))+1);
336
439730
    const pixel4 c = AV_RN4PA(((pixel4*)(src-stride))+2);
337
439730
    const pixel4 d = AV_RN4PA(((pixel4*)(src-stride))+3);
338
339
7475410
    for(i=0; i<16; i++){
340
7035680
        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
341
7035680
        AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
342
7035680
        AV_WN4PA(((pixel4*)(src+i*stride))+2, c);
343
7035680
        AV_WN4PA(((pixel4*)(src+i*stride))+3, d);
344
    }
345
}
346
347
485648
static void FUNCC(pred16x16_horizontal)(uint8_t *_src, ptrdiff_t stride)
348
{
349
    int i;
350
485648
    pixel *src = (pixel*)_src;
351
485648
    stride >>= sizeof(pixel)-1;
352
353
8256016
    for(i=0; i<16; i++){
354
7770368
        const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
355
356
7770368
        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
357
7770368
        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
358
7770368
        AV_WN4PA(((pixel4*)(src+i*stride))+2, a);
359
7770368
        AV_WN4PA(((pixel4*)(src+i*stride))+3, a);
360
    }
361
}
362
363
#define PREDICT_16x16_DC(v)\
364
    for(i=0; i<16; i++){\
365
        AV_WN4PA(src+ 0, v);\
366
        AV_WN4PA(src+ 4, v);\
367
        AV_WN4PA(src+ 8, v);\
368
        AV_WN4PA(src+12, v);\
369
        src += stride;\
370
    }
371
372
495376
static void FUNCC(pred16x16_dc)(uint8_t *_src, ptrdiff_t stride)
373
{
374
495376
    int i, dc=0;
375
495376
    pixel *src = (pixel*)_src;
376
    pixel4 dcsplat;
377
495376
    stride >>= sizeof(pixel)-1;
378
379
8421392
    for(i=0;i<16; i++){
380
7926016
        dc+= src[-1+i*stride];
381
    }
382
383
8421392
    for(i=0;i<16; i++){
384
7926016
        dc+= src[i-stride];
385
    }
386
387
495376
    dcsplat = PIXEL_SPLAT_X4((dc+16)>>5);
388
8421392
    PREDICT_16x16_DC(dcsplat);
389
}
390
391
66876
static void FUNCC(pred16x16_left_dc)(uint8_t *_src, ptrdiff_t stride)
392
{
393
66876
    int i, dc=0;
394
66876
    pixel *src = (pixel*)_src;
395
    pixel4 dcsplat;
396
66876
    stride >>= sizeof(pixel)-1;
397
398
1136892
    for(i=0;i<16; i++){
399
1070016
        dc+= src[-1+i*stride];
400
    }
401
402
66876
    dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
403
1136892
    PREDICT_16x16_DC(dcsplat);
404
}
405
406
11188
static void FUNCC(pred16x16_top_dc)(uint8_t *_src, ptrdiff_t stride)
407
{
408
11188
    int i, dc=0;
409
11188
    pixel *src = (pixel*)_src;
410
    pixel4 dcsplat;
411
11188
    stride >>= sizeof(pixel)-1;
412
413
190196
    for(i=0;i<16; i++){
414
179008
        dc+= src[i-stride];
415
    }
416
417
11188
    dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
418
190196
    PREDICT_16x16_DC(dcsplat);
419
}
420
421
#define PRED16x16_X(n, v) \
422
static void FUNCC(pred16x16_##n##_dc)(uint8_t *_src, ptrdiff_t stride)\
423
{\
424
    int i;\
425
    pixel *src = (pixel*)_src;\
426
    stride >>= sizeof(pixel)-1;\
427
    PREDICT_16x16_DC(PIXEL_SPLAT_X4(v));\
428
}
429
430
408
PRED16x16_X(127, (1<<(BIT_DEPTH-1))-1)
431
34425
PRED16x16_X(128, (1<<(BIT_DEPTH-1))+0)
432
170
PRED16x16_X(129, (1<<(BIT_DEPTH-1))+1)
433
434
300788
static inline void FUNCC(pred16x16_plane_compat)(uint8_t *_src,
435
                                                 ptrdiff_t _stride,
436
                                                 const int svq3,
437
                                                 const int rv40)
438
{
439
  int i, j, k;
440
  int a;
441
  INIT_CLIP
442
300788
  pixel *src = (pixel*)_src;
443
300788
  int stride = _stride>>(sizeof(pixel)-1);
444
300788
  const pixel * const src0 = src +7-stride;
445
300788
  const pixel *       src1 = src +8*stride-1;
446
300788
  const pixel *       src2 = src1-2*stride;    // == src+6*stride-1;
447
300788
  int H = src0[1] - src0[-1];
448
300788
  int V = src1[0] - src2[ 0];
449
2406304
  for(k=2; k<=8; ++k) {
450
2105516
    src1 += stride; src2 -= stride;
451
2105516
    H += k*(src0[k] - src0[-k]);
452
2105516
    V += k*(src1[0] - src2[ 0]);
453
  }
454
300788
  if(svq3){
455
112
    H = ( 5*(H/4) ) / 16;
456
112
    V = ( 5*(V/4) ) / 16;
457
458
    /* required for 100% accuracy */
459
112
    i = H; H = V; V = i;
460
300676
  }else if(rv40){
461
2432
    H = ( H + (H>>2) ) >> 4;
462
2432
    V = ( V + (V>>2) ) >> 4;
463
  }else{
464
298244
    H = ( 5*H+32 ) >> 6;
465
298244
    V = ( 5*V+32 ) >> 6;
466
  }
467
468
300788
  a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
469
5113396
  for(j=16; j>0; --j) {
470
4812608
    int b = a;
471
4812608
    a += V;
472
24063040
    for(i=-16; i<0; i+=4) {
473
19250432
      src[16+i] = CLIP((b    ) >> 5);
474
19250432
      src[17+i] = CLIP((b+  H) >> 5);
475
19250432
      src[18+i] = CLIP((b+2*H) >> 5);
476
19250432
      src[19+i] = CLIP((b+3*H) >> 5);
477
19250432
      b += 4*H;
478
    }
479
4812608
    src += stride;
480
  }
481
}
482
483
298244
static void FUNCC(pred16x16_plane)(uint8_t *src, ptrdiff_t stride)
484
{
485
298244
    FUNCC(pred16x16_plane_compat)(src, stride, 0, 0);
486
}
487
488
1417544
static void FUNCC(pred8x8_vertical)(uint8_t *_src, ptrdiff_t _stride)
489
{
490
    int i;
491
1417544
    pixel *src = (pixel*)_src;
492
1417544
    int stride = _stride>>(sizeof(pixel)-1);
493
1417544
    const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0);
494
1417544
    const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1);
495
496
12757896
    for(i=0; i<8; i++){
497
11340352
        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
498
11340352
        AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
499
    }
500
}
501
502
322924
static void FUNCC(pred8x16_vertical)(uint8_t *_src, ptrdiff_t _stride)
503
{
504
    int i;
505
322924
    pixel *src = (pixel*)_src;
506
322924
    int stride = _stride>>(sizeof(pixel)-1);
507
322924
    const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0);
508
322924
    const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1);
509
510
5489708
    for(i=0; i<16; i++){
511
5166784
        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
512
5166784
        AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
513
    }
514
}
515
516
2923820
static void FUNCC(pred8x8_horizontal)(uint8_t *_src, ptrdiff_t stride)
517
{
518
    int i;
519
2923820
    pixel *src = (pixel*)_src;
520
2923820
    stride >>= sizeof(pixel)-1;
521
522
26314380
    for(i=0; i<8; i++){
523
23390560
        const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
524
23390560
        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
525
23390560
        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
526
    }
527
}
528
529
595976
static void FUNCC(pred8x16_horizontal)(uint8_t *_src, ptrdiff_t stride)
530
{
531
    int i;
532
595976
    pixel *src = (pixel*)_src;
533
595976
    stride >>= sizeof(pixel)-1;
534
10131592
    for(i=0; i<16; i++){
535
9535616
        const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
536
9535616
        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
537
9535616
        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
538
    }
539
}
540
541
#define PRED8x8_X(n, v)\
542
static void FUNCC(pred8x8_##n##_dc)(uint8_t *_src, ptrdiff_t stride)\
543
{\
544
    int i;\
545
    const pixel4 a = PIXEL_SPLAT_X4(v);\
546
    pixel *src = (pixel*)_src;\
547
    stride >>= sizeof(pixel)-1;\
548
    for(i=0; i<8; i++){\
549
        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);\
550
        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);\
551
    }\
552
}
553
554
2052
PRED8x8_X(127, (1<<(BIT_DEPTH-1))-1)
555
1160910
PRED8x8_X(128, (1<<(BIT_DEPTH-1))+0)
556
2412
PRED8x8_X(129, (1<<(BIT_DEPTH-1))+1)
557
558
3076
static void FUNCC(pred8x16_128_dc)(uint8_t *_src, ptrdiff_t stride)
559
{
560
3076
    FUNCC(pred8x8_128_dc)(_src, stride);
561
3076
    FUNCC(pred8x8_128_dc)(_src+8*stride, stride);
562
}
563
564
1078636
static void FUNCC(pred8x8_left_dc)(uint8_t *_src, ptrdiff_t stride)
565
{
566
    int i;
567
    int dc0, dc2;
568
    pixel4 dc0splat, dc2splat;
569
1078636
    pixel *src = (pixel*)_src;
570
1078636
    stride >>= sizeof(pixel)-1;
571
572
1078636
    dc0=dc2=0;
573
5393180
    for(i=0;i<4; i++){
574
4314544
        dc0+= src[-1+i*stride];
575
4314544
        dc2+= src[-1+(i+4)*stride];
576
    }
577
1078636
    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
578
1078636
    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
579
580
5393180
    for(i=0; i<4; i++){
581
4314544
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
582
4314544
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc0splat);
583
    }
584
5393180
    for(i=4; i<8; i++){
585
4314544
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
586
4314544
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc2splat);
587
    }
588
}
589
590
195540
static void FUNCC(pred8x16_left_dc)(uint8_t *_src, ptrdiff_t stride)
591
{
592
195540
    FUNCC(pred8x8_left_dc)(_src, stride);
593
195540
    FUNCC(pred8x8_left_dc)(_src+8*stride, stride);
594
}
595
596
164552
static void FUNCC(pred8x8_top_dc)(uint8_t *_src, ptrdiff_t stride)
597
{
598
    int i;
599
    int dc0, dc1;
600
    pixel4 dc0splat, dc1splat;
601
164552
    pixel *src = (pixel*)_src;
602
164552
    stride >>= sizeof(pixel)-1;
603
604
164552
    dc0=dc1=0;
605
822760
    for(i=0;i<4; i++){
606
658208
        dc0+= src[i-stride];
607
658208
        dc1+= src[4+i-stride];
608
    }
609
164552
    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
610
164552
    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
611
612
822760
    for(i=0; i<4; i++){
613
658208
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
614
658208
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
615
    }
616
822760
    for(i=4; i<8; i++){
617
658208
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
618
658208
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
619
    }
620
}
621
622
26616
static void FUNCC(pred8x16_top_dc)(uint8_t *_src, ptrdiff_t stride)
623
{
624
    int i;
625
    int dc0, dc1;
626
    pixel4 dc0splat, dc1splat;
627
26616
    pixel *src = (pixel*)_src;
628
26616
    stride >>= sizeof(pixel)-1;
629
630
26616
    dc0=dc1=0;
631
133080
    for(i=0;i<4; i++){
632
106464
        dc0+= src[i-stride];
633
106464
        dc1+= src[4+i-stride];
634
    }
635
26616
    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
636
26616
    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
637
638
452472
    for(i=0; i<16; i++){
639
425856
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
640
425856
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
641
    }
642
}
643
644
5456784
static void FUNCC(pred8x8_dc)(uint8_t *_src, ptrdiff_t stride)
645
{
646
    int i;
647
    int dc0, dc1, dc2;
648
    pixel4 dc0splat, dc1splat, dc2splat, dc3splat;
649
5456784
    pixel *src = (pixel*)_src;
650
5456784
    stride >>= sizeof(pixel)-1;
651
652
5456784
    dc0=dc1=dc2=0;
653
27283920
    for(i=0;i<4; i++){
654
21827136
        dc0+= src[-1+i*stride] + src[i-stride];
655
21827136
        dc1+= src[4+i-stride];
656
21827136
        dc2+= src[-1+(i+4)*stride];
657
    }
658
5456784
    dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
659
5456784
    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
660
5456784
    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
661
5456784
    dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);
662
663
27283920
    for(i=0; i<4; i++){
664
21827136
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
665
21827136
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
666
    }
667
27283920
    for(i=4; i<8; i++){
668
21827136
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
669
21827136
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc3splat);
670
    }
671
}
672
673
1594572
static void FUNCC(pred8x16_dc)(uint8_t *_src, ptrdiff_t stride)
674
{
675
    int i;
676
    int dc0, dc1, dc2, dc3, dc4;
677
    pixel4 dc0splat, dc1splat, dc2splat, dc3splat, dc4splat, dc5splat, dc6splat, dc7splat;
678
1594572
    pixel *src = (pixel*)_src;
679
1594572
    stride >>= sizeof(pixel)-1;
680
681
1594572
    dc0=dc1=dc2=dc3=dc4=0;
682
7972860
    for(i=0;i<4; i++){
683
6378288
        dc0+= src[-1+i*stride] + src[i-stride];
684
6378288
        dc1+= src[4+i-stride];
685
6378288
        dc2+= src[-1+(i+4)*stride];
686
6378288
        dc3+= src[-1+(i+8)*stride];
687
6378288
        dc4+= src[-1+(i+12)*stride];
688
    }
689
1594572
    dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
690
1594572
    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
691
1594572
    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
692
1594572
    dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);
693
1594572
    dc4splat = PIXEL_SPLAT_X4((dc3 + 2)>>2);
694
1594572
    dc5splat = PIXEL_SPLAT_X4((dc1 + dc3 + 4)>>3);
695
1594572
    dc6splat = PIXEL_SPLAT_X4((dc4 + 2)>>2);
696
1594572
    dc7splat = PIXEL_SPLAT_X4((dc1 + dc4 + 4)>>3);
697
698
7972860
    for(i=0; i<4; i++){
699
6378288
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
700
6378288
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
701
    }
702
7972860
    for(i=4; i<8; i++){
703
6378288
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
704
6378288
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc3splat);
705
    }
706
7972860
    for(i=8; i<12; i++){
707
6378288
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc4splat);
708
6378288
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc5splat);
709
    }
710
7972860
    for(i=12; i<16; i++){
711
6378288
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc6splat);
712
6378288
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc7splat);
713
    }
714
}
715
716
//the following 4 function should not be optimized!
717
68
static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, ptrdiff_t stride)
718
{
719
68
    FUNCC(pred8x8_top_dc)(src, stride);
720
68
    FUNCC(pred4x4_dc)(src, NULL, stride);
721
}
722
723
12
static void FUNC(pred8x16_mad_cow_dc_l0t)(uint8_t *src, ptrdiff_t stride)
724
{
725
12
    FUNCC(pred8x16_top_dc)(src, stride);
726
12
    FUNCC(pred4x4_dc)(src, NULL, stride);
727
}
728
729
36
static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, ptrdiff_t stride)
730
{
731
36
    FUNCC(pred8x8_dc)(src, stride);
732
36
    FUNCC(pred4x4_top_dc)(src, NULL, stride);
733
}
734
735
12
static void FUNC(pred8x16_mad_cow_dc_0lt)(uint8_t *src, ptrdiff_t stride)
736
{
737
12
    FUNCC(pred8x16_dc)(src, stride);
738
12
    FUNCC(pred4x4_top_dc)(src, NULL, stride);
739
}
740
741
20
static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, ptrdiff_t stride)
742
{
743
20
    FUNCC(pred8x8_left_dc)(src, stride);
744
20
    FUNCC(pred4x4_128_dc)(src + 4*stride                  , NULL, stride);
745
20
    FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
746
}
747
748
12
static void FUNC(pred8x16_mad_cow_dc_l00)(uint8_t *src, ptrdiff_t stride)
749
{
750
12
    FUNCC(pred8x16_left_dc)(src, stride);
751
12
    FUNCC(pred4x4_128_dc)(src + 4*stride                  , NULL, stride);
752
12
    FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
753
}
754
755
32
static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, ptrdiff_t stride)
756
{
757
32
    FUNCC(pred8x8_left_dc)(src, stride);
758
32
    FUNCC(pred4x4_128_dc)(src                  , NULL, stride);
759
32
    FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
760
}
761
762
12
static void FUNC(pred8x16_mad_cow_dc_0l0)(uint8_t *src, ptrdiff_t stride)
763
{
764
12
    FUNCC(pred8x16_left_dc)(src, stride);
765
12
    FUNCC(pred4x4_128_dc)(src                  , NULL, stride);
766
12
    FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
767
}
768
769
923988
static void FUNCC(pred8x8_plane)(uint8_t *_src, ptrdiff_t _stride)
770
{
771
  int j, k;
772
  int a;
773
  INIT_CLIP
774
923988
  pixel *src = (pixel*)_src;
775
923988
  int stride = _stride>>(sizeof(pixel)-1);
776
923988
  const pixel * const src0 = src +3-stride;
777
923988
  const pixel *       src1 = src +4*stride-1;
778
923988
  const pixel *       src2 = src1-2*stride;    // == src+2*stride-1;
779
923988
  int H = src0[1] - src0[-1];
780
923988
  int V = src1[0] - src2[ 0];
781
3695952
  for(k=2; k<=4; ++k) {
782
2771964
    src1 += stride; src2 -= stride;
783
2771964
    H += k*(src0[k] - src0[-k]);
784
2771964
    V += k*(src1[0] - src2[ 0]);
785
  }
786
923988
  H = ( 17*H+16 ) >> 5;
787
923988
  V = ( 17*V+16 ) >> 5;
788
789
923988
  a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
790
8315892
  for(j=8; j>0; --j) {
791
7391904
    int b = a;
792
7391904
    a += V;
793
7391904
    src[0] = CLIP((b    ) >> 5);
794
7391904
    src[1] = CLIP((b+  H) >> 5);
795
7391904
    src[2] = CLIP((b+2*H) >> 5);
796
7391904
    src[3] = CLIP((b+3*H) >> 5);
797
7391904
    src[4] = CLIP((b+4*H) >> 5);
798
7391904
    src[5] = CLIP((b+5*H) >> 5);
799
7391904
    src[6] = CLIP((b+6*H) >> 5);
800
7391904
    src[7] = CLIP((b+7*H) >> 5);
801
7391904
    src += stride;
802
  }
803
}
804
805
328576
static void FUNCC(pred8x16_plane)(uint8_t *_src, ptrdiff_t _stride)
806
{
807
  int j, k;
808
  int a;
809
  INIT_CLIP
810
328576
  pixel *src = (pixel*)_src;
811
328576
  int stride = _stride>>(sizeof(pixel)-1);
812
328576
  const pixel * const src0 = src +3-stride;
813
328576
  const pixel *       src1 = src +8*stride-1;
814
328576
  const pixel *       src2 = src1-2*stride;    // == src+6*stride-1;
815
328576
  int H = src0[1] - src0[-1];
816
328576
  int V = src1[0] - src2[ 0];
817
818
1314304
  for (k = 2; k <= 4; ++k) {
819
985728
      src1 += stride; src2 -= stride;
820
985728
      H += k*(src0[k] - src0[-k]);
821
985728
      V += k*(src1[0] - src2[ 0]);
822
  }
823
1642880
  for (; k <= 8; ++k) {
824
1314304
      src1 += stride; src2 -= stride;
825
1314304
      V += k*(src1[0] - src2[0]);
826
  }
827
828
328576
  H = (17*H+16) >> 5;
829
328576
  V = (5*V+32) >> 6;
830
831
328576
  a = 16*(src1[0] + src2[8] + 1) - 7*V - 3*H;
832
5585792
  for(j=16; j>0; --j) {
833
5257216
    int b = a;
834
5257216
    a += V;
835
5257216
    src[0] = CLIP((b    ) >> 5);
836
5257216
    src[1] = CLIP((b+  H) >> 5);
837
5257216
    src[2] = CLIP((b+2*H) >> 5);
838
5257216
    src[3] = CLIP((b+3*H) >> 5);
839
5257216
    src[4] = CLIP((b+4*H) >> 5);
840
5257216
    src[5] = CLIP((b+5*H) >> 5);
841
5257216
    src[6] = CLIP((b+6*H) >> 5);
842
5257216
    src[7] = CLIP((b+7*H) >> 5);
843
5257216
    src += stride;
844
  }
845
}
846
847
/* Sample at column x, row y relative to the block origin; relies on the
 * locals `src` and `stride` of the expanding function. */
#define SRC(x,y) src[(x)+(y)*stride]
848
/* Declares l<y>: (1,2,1)/4 filtered left-column sample at row y. */
#define PL(y) \
849
    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
850
/* Declares l0..l7, the filtered left column. l0 uses SRC(-1,-1) only when
 * has_topleft is set and SRC(-1,0) otherwise; l7 weights SRC(-1,7) by 3
 * because there is no row 8 tap. */
#define PREDICT_8x8_LOAD_LEFT \
851
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
852
                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
853
    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
854
    const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2
855
856
/* Declares t<x>: (1,2,1)/4 filtered sample of the row above at column x. */
#define PT(x) \
857
    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
858
/* Declares t0..t7, the filtered top row. t0 falls back to SRC(0,-1) when
 * has_topleft is 0; t7 falls back to SRC(7,-1) when has_topright is 0. */
#define PREDICT_8x8_LOAD_TOP \
859
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
860
                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
861
    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
862
    const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
863
                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2
864
865
/* Assignment form of PT(x) for the already declared t8..t14. */
#define PTR(x) \
866
    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
867
/* Declares t8..t15. With has_topright they are filtered from columns 7..15
 * of the row above; otherwise all eight are set to the unfiltered SRC(7,-1). */
#define PREDICT_8x8_LOAD_TOPRIGHT \
868
    int t8, t9, t10, t11, t12, t13, t14, t15; \
869
    if(has_topright) { \
870
        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
871
        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
872
    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);
873
874
/* Declares lt: the top-left corner filtered with its two direct neighbours. */
#define PREDICT_8x8_LOAD_TOPLEFT \
875
    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2
876
877
/* Fills 8 rows with the pixel4 value v using two pixel4 stores per row.
 * Declares `y` and advances the local `src` by 8 rows as a side effect. */
#define PREDICT_8x8_DC(v) \
878
    int y; \
879
    for( y = 0; y < 8; y++ ) { \
880
        AV_WN4PA(((pixel4*)src)+0, v); \
881
        AV_WN4PA(((pixel4*)src)+1, v); \
882
        src += stride; \
883
    }
884
885
2226
/**
 * Fills the 8x8 block with the constant 1<<(BIT_DEPTH-1).
 * No neighbouring samples are read; has_topleft and has_topright are not
 * referenced by this body.
 *
 * @param _src    pointer to the top-left sample of the block; used as pixel*
 * @param _stride line stride, shifted right by sizeof(pixel)-1 before use
 */
static void FUNCC(pred8x8l_128_dc)(uint8_t *_src, int has_topleft,
886
                                   int has_topright, ptrdiff_t _stride)
887
{
888
2226
    pixel *src = (pixel*)_src;
889
2226
    int stride = _stride>>(sizeof(pixel)-1);
890
891
20034
    PREDICT_8x8_DC(PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1)));
892
}
893
128394
/**
 * Fills the 8x8 block with the rounded mean of the eight filtered
 * left-column samples l0..l7.
 *
 * @param _src        pointer to the top-left sample of the block
 * @param has_topleft selects whether SRC(-1,-1) is used when filtering l0
 * @param _stride     line stride, shifted right by sizeof(pixel)-1 before use
 */
static void FUNCC(pred8x8l_left_dc)(uint8_t *_src, int has_topleft,
894
                                    int has_topright, ptrdiff_t _stride)
895
{
896
256788
    pixel *src = (pixel*)_src;
897
256788
    int stride = _stride>>(sizeof(pixel)-1);
898
899
256788
    PREDICT_8x8_LOAD_LEFT;
900
256788
    /* +4 rounds the division by 8 */
    const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3);
901
2311092
    PREDICT_8x8_DC(dc);
902
}
903
15422
/**
 * Fills the 8x8 block with the rounded mean of the eight filtered
 * top-row samples t0..t7.
 *
 * @param _src         pointer to the top-left sample of the block
 * @param has_topleft  selects whether SRC(-1,-1) is used when filtering t0
 * @param has_topright selects whether SRC(8,-1) is used when filtering t7
 * @param _stride      line stride, shifted right by sizeof(pixel)-1 before use
 */
static void FUNCC(pred8x8l_top_dc)(uint8_t *_src, int has_topleft,
904
                                   int has_topright, ptrdiff_t _stride)
905
{
906
30844
    pixel *src = (pixel*)_src;
907
30844
    int stride = _stride>>(sizeof(pixel)-1);
908
909

30844
    PREDICT_8x8_LOAD_TOP;
910
30844
    /* +4 rounds the division by 8 */
    const pixel4 dc = PIXEL_SPLAT_X4((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3);
911
277596
    PREDICT_8x8_DC(dc);
912
}
913
1634866
/**
 * Fills the 8x8 block with the rounded mean of the sixteen filtered
 * neighbours l0..l7 (left column) and t0..t7 (top row).
 *
 * @param _src         pointer to the top-left sample of the block
 * @param has_topleft  selects whether SRC(-1,-1) is used when filtering l0/t0
 * @param has_topright selects whether SRC(8,-1) is used when filtering t7
 * @param _stride      line stride, shifted right by sizeof(pixel)-1 before use
 */
static void FUNCC(pred8x8l_dc)(uint8_t *_src, int has_topleft,
914
                               int has_topright, ptrdiff_t _stride)
915
{
916
3269732
    pixel *src = (pixel*)_src;
917
3269732
    int stride = _stride>>(sizeof(pixel)-1);
918
919
3269732
    PREDICT_8x8_LOAD_LEFT;
920

3269732
    PREDICT_8x8_LOAD_TOP;
921
3269732
    /* +8 rounds the division by 16 */
    const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7
922
                                     +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4);
923
29427588
    PREDICT_8x8_DC(dc);
924
}
925
1622678
/**
 * Horizontal 8x8 prediction: every row y is filled with its filtered
 * left neighbour l<y>.
 *
 * @param _src        pointer to the top-left sample of the block
 * @param has_topleft selects whether SRC(-1,-1) is used when filtering l0
 * @param _stride     line stride, shifted right by sizeof(pixel)-1 before use
 */
static void FUNCC(pred8x8l_horizontal)(uint8_t *_src, int has_topleft,
926
                                       int has_topright, ptrdiff_t _stride)
927
{
928
3245356
    pixel *src = (pixel*)_src;
929
3245356
    int stride = _stride>>(sizeof(pixel)-1);
930
    pixel4 a;
931
932
3245356
    PREDICT_8x8_LOAD_LEFT;
933
/* ROW(y): splat l<y> into a pixel4 and store it to both halves of row y */
#define ROW(y) a = PIXEL_SPLAT_X4(l##y); \
934
               AV_WN4PA(src+y*stride, a); \
935
               AV_WN4PA(src+y*stride+4, a);
936
3245356
    ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
937
#undef ROW
938
}
939
437199
/**
 * Vertical 8x8 prediction: every column x is filled with its filtered
 * top neighbour t<x>.
 *
 * Row 0 is written sample by sample, read back as two pixel4 words and
 * those words are then stored to rows 1..7.
 *
 * @param _src         pointer to the top-left sample of the block
 * @param has_topleft  selects whether SRC(-1,-1) is used when filtering t0
 * @param has_topright selects whether SRC(8,-1) is used when filtering t7
 * @param _stride      line stride, shifted right by sizeof(pixel)-1 before use
 */
static void FUNCC(pred8x8l_vertical)(uint8_t *_src, int has_topleft,
940
                                     int has_topright, ptrdiff_t _stride)
941
{
942
    int y;
943
874398
    pixel *src = (pixel*)_src;
944
874398
    int stride = _stride>>(sizeof(pixel)-1);
945
    pixel4 a, b;
946
947

874398
    PREDICT_8x8_LOAD_TOP;
948
874398
    src[0] = t0;
949
874398
    src[1] = t1;
950
874398
    src[2] = t2;
951
874398
    src[3] = t3;
952
874398
    src[4] = t4;
953
874398
    src[5] = t5;
954
874398
    src[6] = t6;
955
874398
    src[7] = t7;
956
874398
    /* reload row 0 as two packed words so the remaining rows need 2 stores each */
    a = AV_RN4PA(((pixel4*)src)+0);
957
874398
    b = AV_RN4PA(((pixel4*)src)+1);
958
6995184
    for( y = 1; y < 8; y++ ) {
959
6120786
        AV_WN4PA(((pixel4*)(src+y*stride))+0, a);
960
6120786
        AV_WN4PA(((pixel4*)(src+y*stride))+1, b);
961
    }
962
}
963
194050
/**
 * Diagonal down-left 8x8 prediction from the filtered top row t0..t7 and
 * top-right row t8..t15.
 *
 * All samples with the same x+y share one value: the (1,2,1)/4 filter of
 * t(x+y), t(x+y+1), t(x+y+2). The last sample SRC(7,7) uses
 * (t14 + 3*t15 + 2) >> 2 instead because there is no t16.
 *
 * @param _src         pointer to the top-left sample of the block
 * @param has_topleft  selects whether SRC(-1,-1) is used when filtering t0
 * @param has_topright selects whether the top-right row is read at all
 * @param _stride      line stride, shifted right by sizeof(pixel)-1 before use
 */
static void FUNCC(pred8x8l_down_left)(uint8_t *_src, int has_topleft,
964
                                      int has_topright, ptrdiff_t _stride)
965
{
966
388100
    pixel *src = (pixel*)_src;
967
388100
    int stride = _stride>>(sizeof(pixel)-1);
968

388100
    PREDICT_8x8_LOAD_TOP;
969
388100
    PREDICT_8x8_LOAD_TOPRIGHT;
970
388100
    SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
971
388100
    SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
972
388100
    SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
973
388100
    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
974
388100
    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
975
388100
    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
976
388100
    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
977
388100
    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
978
388100
    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
979
388100
    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
980
388100
    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
981
388100
    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
982
388100
    SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
983
388100
    SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
984
388100
    SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
985
}
986
293665
/**
 * Diagonal down-right 8x8 prediction from the filtered left column
 * l0..l7, the filtered corner lt and the filtered top row t0..t7.
 *
 * All samples with the same x-y share one value, a (1,2,1)/4 filter of
 * three consecutive entries of the sequence l7..l0, lt, t0..t7.
 *
 * @param _src         pointer to the top-left sample of the block
 * @param has_topleft  used by the LOAD_TOP/LOAD_LEFT filters for t0 and l0
 * @param has_topright used by the LOAD_TOP filter for t7
 * @param _stride      line stride, shifted right by sizeof(pixel)-1 before use
 */
static void FUNCC(pred8x8l_down_right)(uint8_t *_src, int has_topleft,
987
                                       int has_topright, ptrdiff_t _stride)
988
{
989
587330
    pixel *src = (pixel*)_src;
990
587330
    int stride = _stride>>(sizeof(pixel)-1);
991

587330
    PREDICT_8x8_LOAD_TOP;
992
587330
    PREDICT_8x8_LOAD_LEFT;
993
587330
    PREDICT_8x8_LOAD_TOPLEFT;
994
587330
    SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
995
587330
    SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
996
587330
    SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
997
587330
    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
998
587330
    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
999
587330
    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
1000
587330
    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
1001
587330
    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
1002
587330
    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
1003
587330
    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
1004
587330
    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
1005
587330
    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
1006
587330
    SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
1007
587330
    SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
1008
587330
    SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
1009
}
1010
197599
/**
 * Vertical-right 8x8 prediction from the filtered top row, left column and
 * corner sample.
 *
 * Each assignment writes one value to every sample on a line that steps one
 * column right per two rows down. The values are either two-tap rounded
 * averages or (1,2,1)/4 filters of lt, t0..t7 and l0..l6.
 *
 * @param _src         pointer to the top-left sample of the block
 * @param has_topleft  used by the LOAD_TOP/LOAD_LEFT filters for t0 and l0
 * @param has_topright used by the LOAD_TOP filter for t7
 * @param _stride      line stride, shifted right by sizeof(pixel)-1 before use
 */
static void FUNCC(pred8x8l_vertical_right)(uint8_t *_src, int has_topleft,
1011
                                           int has_topright, ptrdiff_t _stride)
1012
{
1013
395198
    pixel *src = (pixel*)_src;
1014
395198
    int stride = _stride>>(sizeof(pixel)-1);
1015

395198
    PREDICT_8x8_LOAD_TOP;
1016
395198
    PREDICT_8x8_LOAD_LEFT;
1017
395198
    PREDICT_8x8_LOAD_TOPLEFT;
1018
395198
    SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
1019
395198
    SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
1020
395198
    SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
1021
395198
    SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
1022
395198
    SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
1023
395198
    SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
1024
395198
    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
1025
395198
    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
1026
395198
    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
1027
395198
    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
1028
395198
    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
1029
395198
    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
1030
395198
    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
1031
395198
    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
1032
395198
    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
1033
395198
    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
1034
395198
    SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
1035
395198
    SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
1036
395198
    SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
1037
395198
    SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
1038
395198
    SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
1039
395198
    SRC(7,0)= (t6 + t7 + 1) >> 1;
1040
}
1041
447851
/**
 * Horizontal-down 8x8 prediction from the filtered left column, corner
 * sample and top row.
 *
 * Each assignment writes one value to every sample on a line that steps two
 * columns right per row down. The values are either two-tap rounded
 * averages or (1,2,1)/4 filters of l0..l7, lt and t0..t6.
 *
 * @param _src         pointer to the top-left sample of the block
 * @param has_topleft  used by the LOAD_TOP/LOAD_LEFT filters for t0 and l0
 * @param has_topright used by the LOAD_TOP filter for t7
 * @param _stride      line stride, shifted right by sizeof(pixel)-1 before use
 */
static void FUNCC(pred8x8l_horizontal_down)(uint8_t *_src, int has_topleft,
1042
                                            int has_topright, ptrdiff_t _stride)
1043
{
1044
895702
    pixel *src = (pixel*)_src;
1045
895702
    int stride = _stride>>(sizeof(pixel)-1);
1046

895702
    PREDICT_8x8_LOAD_TOP;
1047
895702
    PREDICT_8x8_LOAD_LEFT;
1048
895702
    PREDICT_8x8_LOAD_TOPLEFT;
1049
895702
    SRC(0,7)= (l6 + l7 + 1) >> 1;
1050
895702
    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
1051
895702
    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
1052
895702
    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
1053
895702
    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
1054
895702
    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
1055
895702
    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
1056
895702
    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
1057
895702
    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
1058
895702
    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
1059
895702
    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
1060
895702
    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
1061
895702
    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
1062
895702
    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
1063
895702
    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
1064
895702
    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
1065
895702
    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
1066
895702
    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
1067
895702
    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
1068
895702
    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
1069
895702
    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
1070
895702
    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
1071
}
1072
198165
/**
 * Vertical-left 8x8 prediction from the filtered top row t0..t7 and
 * top-right samples (t8..t12 are the ones actually used).
 *
 * Samples in even rows receive two-tap rounded averages of adjacent t
 * values; samples in odd rows receive (1,2,1)/4 filters. Each value is
 * shared along a line that steps one column left per two rows down.
 *
 * @param _src         pointer to the top-left sample of the block
 * @param has_topleft  used by the LOAD_TOP filter for t0
 * @param has_topright selects whether the top-right row is read at all
 * @param _stride      line stride, shifted right by sizeof(pixel)-1 before use
 */
static void FUNCC(pred8x8l_vertical_left)(uint8_t *_src, int has_topleft,
1073
                                          int has_topright, ptrdiff_t _stride)
1074
{
1075
396330
    pixel *src = (pixel*)_src;
1076
396330
    int stride = _stride>>(sizeof(pixel)-1);
1077

396330
    PREDICT_8x8_LOAD_TOP;
1078
396330
    PREDICT_8x8_LOAD_TOPRIGHT;
1079
396330
    SRC(0,0)= (t0 + t1 + 1) >> 1;
1080
396330
    SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
1081
396330
    SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
1082
396330
    SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
1083
396330
    SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
1084
396330
    SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
1085
396330
    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
1086
396330
    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
1087
396330
    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
1088
396330
    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
1089
396330
    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
1090
396330
    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
1091
396330
    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
1092
396330
    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
1093
396330
    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
1094
396330
    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
1095
396330
    SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
1096
396330
    SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
1097
396330
    SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
1098
396330
    SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
1099
396330
    SRC(7,6)= (t10 + t11 + 1) >> 1;
1100
396330
    SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
1101
}
1102
548546
/**
 * Horizontal-up 8x8 prediction from the filtered left column l0..l7 only.
 *
 * Values alternate between two-tap rounded averages and (1,2,1)/4 filters
 * of adjacent l samples and are shared along lines that step two columns
 * right per row up. The samples listed in the final chained assignment
 * (the lower-right part of the block) are all set to l7.
 *
 * @param _src        pointer to the top-left sample of the block
 * @param has_topleft selects whether SRC(-1,-1) is used when filtering l0
 * @param _stride     line stride, shifted right by sizeof(pixel)-1 before use
 */
static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft,
1103
                                          int has_topright, ptrdiff_t _stride)
1104
{
1105
1097092
    pixel *src = (pixel*)_src;
1106
1097092
    int stride = _stride>>(sizeof(pixel)-1);
1107
1097092
    PREDICT_8x8_LOAD_LEFT;
1108
1097092
    SRC(0,0)= (l0 + l1 + 1) >> 1;
1109
1097092
    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
1110
1097092
    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
1111
1097092
    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
1112
1097092
    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
1113
1097092
    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
1114
1097092
    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
1115
1097092
    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
1116
1097092
    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
1117
1097092
    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
1118
1097092
    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
1119
1097092
    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
1120
1097092
    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
1121
1097092
    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
1122
1097092
    /* everything past the last usable left sample is a flat copy of l7 */
    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
1123
1097092
    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
1124
1097092
    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
1125
1097092
    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
1126
}
1127
1128
/**
 * 8x8 vertical prediction seeded by the filtered top row, with the
 * coefficients of _block accumulated down each column.
 *
 * For column i the running value starts at t<i>; row r receives the running
 * value after adding block[8*r + i]. The sums are stored without an
 * explicit clip. Afterwards sizeof(dctcoef)*64 bytes at _block are zeroed.
 *
 * @param _src         pointer to the top-left sample of the block
 * @param _block       coefficient buffer, read as dctcoef and then cleared
 * @param has_topleft  used by the LOAD_TOP filter for t0
 * @param has_topright used by the LOAD_TOP filter for t7
 * @param _stride      line stride, shifted right by sizeof(pixel)-1 before use
 */
static void FUNCC(pred8x8l_vertical_filter_add)(uint8_t *_src, int16_t *_block, int has_topleft,
1129
                                     int has_topright, ptrdiff_t _stride)
1130
{
1131
    int i;
1132
    pixel *src = (pixel*)_src;
1133
    const dctcoef *block = (const dctcoef*)_block;
1134
    pixel pix[8];
1135
    int stride = _stride>>(sizeof(pixel)-1);
1136
    PREDICT_8x8_LOAD_TOP;
1137
1138
    pix[0] = t0;
1139
    pix[1] = t1;
1140
    pix[2] = t2;
1141
    pix[3] = t3;
1142
    pix[4] = t4;
1143
    pix[5] = t5;
1144
    pix[6] = t6;
1145
    pix[7] = t7;
1146
1147
    /* one column per iteration; v is of type pixel, so each partial sum is
     * narrowed to pixel before the next coefficient is added */
    for(i=0; i<8; i++){
1148
        pixel v = pix[i];
1149
        src[0*stride]= v += block[0];
1150
        src[1*stride]= v += block[8];
1151
        src[2*stride]= v += block[16];
1152
        src[3*stride]= v += block[24];
1153
        src[4*stride]= v += block[32];
1154
        src[5*stride]= v += block[40];
1155
        src[6*stride]= v += block[48];
1156
        /* last row: v is not needed afterwards, so it is not updated */
        src[7*stride]= v +  block[56];
1157
        src++;
1158
        block++;
1159
    }
1160
1161
    memset(_block, 0, sizeof(dctcoef) * 64);
1162
}
1163
1164
/**
 * 8x8 horizontal prediction seeded by the filtered left column, with the
 * coefficients of _block accumulated along each row.
 *
 * For row i the running value starts at l<i>; column c receives the running
 * value after adding block[8*i + c]. The sums are stored without an
 * explicit clip. Afterwards sizeof(dctcoef)*64 bytes at _block are zeroed.
 *
 * @param _src        pointer to the top-left sample of the block
 * @param _block      coefficient buffer, read as dctcoef and then cleared
 * @param has_topleft selects whether SRC(-1,-1) is used when filtering l0
 * @param _stride     line stride, shifted right by sizeof(pixel)-1 before use
 */
static void FUNCC(pred8x8l_horizontal_filter_add)(uint8_t *_src, int16_t *_block, int has_topleft,
1165
                               int has_topright, ptrdiff_t _stride)
1166
{
1167
    int i;
1168
    pixel *src = (pixel*)_src;
1169
    const dctcoef *block = (const dctcoef*)_block;
1170
    pixel pix[8];
1171
    int stride = _stride>>(sizeof(pixel)-1);
1172
    PREDICT_8x8_LOAD_LEFT;
1173
1174
    pix[0] = l0;
1175
    pix[1] = l1;
1176
    pix[2] = l2;
1177
    pix[3] = l3;
1178
    pix[4] = l4;
1179
    pix[5] = l5;
1180
    pix[6] = l6;
1181
    pix[7] = l7;
1182
1183
    /* one row per iteration; v is of type pixel, so each partial sum is
     * narrowed to pixel before the next coefficient is added */
    for(i=0; i<8; i++){
1184
        pixel v = pix[i];
1185
        src[0]= v += block[0];
1186
        src[1]= v += block[1];
1187
        src[2]= v += block[2];
1188
        src[3]= v += block[3];
1189
        src[4]= v += block[4];
1190
        src[5]= v += block[5];
1191
        src[6]= v += block[6];
1192
        /* last column: v is not needed afterwards, so it is not updated */
        src[7]= v +  block[7];
1193
        src+= stride;
1194
        block+= 8;
1195
    }
1196
1197
    memset(_block, 0, sizeof(dctcoef) * 64);
1198
}
1199
1200
/* The 8x8 helper macros depend on locals named src/stride/has_topleft/
 * has_topright, so they are removed once the last user has been defined. */
#undef PREDICT_8x8_LOAD_LEFT
1201
#undef PREDICT_8x8_LOAD_TOP
1202
#undef PREDICT_8x8_LOAD_TOPLEFT
1203
#undef PREDICT_8x8_LOAD_TOPRIGHT
1204
#undef PREDICT_8x8_DC
1205
#undef PTR
1206
#undef PT
1207
#undef PL
1208
#undef SRC
1209
1210
160640
/**
 * 4x4 vertical prediction with the coefficients of _block accumulated down
 * each column.
 *
 * For each of the 4 columns the running value starts at the sample directly
 * above the block; row r receives the running value after adding
 * block[4*r + column]. No explicit clip is applied. Afterwards
 * sizeof(dctcoef)*16 bytes at _block are zeroed.
 *
 * @param _pix   pointer to the top-left sample of the block; used as pixel*
 * @param _block coefficient buffer, read as dctcoef and then cleared
 * @param stride line stride, shifted right by sizeof(pixel)-1 before use
 */
static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, int16_t *_block,
1211
                                        ptrdiff_t stride)
1212
{
1213
    int i;
1214
160640
    pixel *pix = (pixel*)_pix;
1215
160640
    const dctcoef *block = (const dctcoef*)_block;
1216
160640
    stride >>= sizeof(pixel)-1;
1217
160640
    /* step up one row: pix[0] is now the neighbour above, pix[k*stride] row k-1 */
    pix -= stride;
1218
803200
    for(i=0; i<4; i++){
1219
642560
        pixel v = pix[0];
1220
642560
        pix[1*stride]= v += block[0];
1221
642560
        pix[2*stride]= v += block[4];
1222
642560
        pix[3*stride]= v += block[8];
1223
642560
        pix[4*stride]= v +  block[12];
1224
642560
        pix++;
1225
642560
        block++;
1226
    }
1227
1228
160640
    memset(_block, 0, sizeof(dctcoef) * 16);
1229
}
1230
1231
208454
/**
 * 4x4 horizontal prediction with the coefficients of _block accumulated
 * along each row.
 *
 * For each of the 4 rows the running value starts at the sample directly
 * left of the block (pix[-1]); column c receives the running value after
 * adding block[4*row + c]. No explicit clip is applied. Afterwards
 * sizeof(dctcoef)*16 bytes at _block are zeroed.
 *
 * @param _pix   pointer to the top-left sample of the block; used as pixel*
 * @param _block coefficient buffer, read as dctcoef and then cleared
 * @param stride line stride, shifted right by sizeof(pixel)-1 before use
 */
static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, int16_t *_block,
1232
                                          ptrdiff_t stride)
1233
{
1234
    int i;
1235
208454
    pixel *pix = (pixel*)_pix;
1236
208454
    const dctcoef *block = (const dctcoef*)_block;
1237
208454
    stride >>= sizeof(pixel)-1;
1238
1042270
    for(i=0; i<4; i++){
1239
833816
        pixel v = pix[-1];
1240
833816
        pix[0]= v += block[0];
1241
833816
        pix[1]= v += block[1];
1242
833816
        pix[2]= v += block[2];
1243
833816
        pix[3]= v +  block[3];
1244
833816
        pix+= stride;
1245
833816
        block+= 4;
1246
    }
1247
1248
208454
    memset(_block, 0, sizeof(dctcoef) * 16);
1249
}
1250
1251
2148
/**
 * 8x8 vertical prediction with the coefficients of _block accumulated down
 * each column.
 *
 * For each of the 8 columns the running value starts at the unfiltered
 * sample directly above the block; row r receives the running value after
 * adding block[8*r + column]. No explicit clip is applied. Afterwards
 * sizeof(dctcoef)*64 bytes at _block are zeroed.
 *
 * @param _pix   pointer to the top-left sample of the block; used as pixel*
 * @param _block coefficient buffer, read as dctcoef and then cleared
 * @param stride line stride, shifted right by sizeof(pixel)-1 before use
 */
static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, int16_t *_block,
1252
                                         ptrdiff_t stride)
1253
{
1254
    int i;
1255
2148
    pixel *pix = (pixel*)_pix;
1256
2148
    const dctcoef *block = (const dctcoef*)_block;
1257
2148
    stride >>= sizeof(pixel)-1;
1258
2148
    /* step up one row: pix[0] is now the neighbour above, pix[k*stride] row k-1 */
    pix -= stride;
1259
19332
    for(i=0; i<8; i++){
1260
17184
        pixel v = pix[0];
1261
17184
        pix[1*stride]= v += block[0];
1262
17184
        pix[2*stride]= v += block[8];
1263
17184
        pix[3*stride]= v += block[16];
1264
17184
        pix[4*stride]= v += block[24];
1265
17184
        pix[5*stride]= v += block[32];
1266
17184
        pix[6*stride]= v += block[40];
1267
17184
        pix[7*stride]= v += block[48];
1268
17184
        pix[8*stride]= v +  block[56];
1269
17184
        pix++;
1270
17184
        block++;
1271
    }
1272
1273
2148
    memset(_block, 0, sizeof(dctcoef) * 64);
1274
}
1275
1276
2828
/**
 * 8x8 horizontal prediction with the coefficients of _block accumulated
 * along each row.
 *
 * For each of the 8 rows the running value starts at the unfiltered sample
 * directly left of the block (pix[-1]); column c receives the running value
 * after adding block[8*row + c]. No explicit clip is applied. Afterwards
 * sizeof(dctcoef)*64 bytes at _block are zeroed.
 *
 * @param _pix   pointer to the top-left sample of the block; used as pixel*
 * @param _block coefficient buffer, read as dctcoef and then cleared
 * @param stride line stride, shifted right by sizeof(pixel)-1 before use
 */
static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, int16_t *_block,
1277
                                           ptrdiff_t stride)
1278
{
1279
    int i;
1280
2828
    pixel *pix = (pixel*)_pix;
1281
2828
    const dctcoef *block = (const dctcoef*)_block;
1282
2828
    stride >>= sizeof(pixel)-1;
1283
25452
    for(i=0; i<8; i++){
1284
22624
        pixel v = pix[-1];
1285
22624
        pix[0]= v += block[0];
1286
22624
        pix[1]= v += block[1];
1287
22624
        pix[2]= v += block[2];
1288
22624
        pix[3]= v += block[3];
1289
22624
        pix[4]= v += block[4];
1290
22624
        pix[5]= v += block[5];
1291
22624
        pix[6]= v += block[6];
1292
22624
        pix[7]= v +  block[7];
1293
22624
        pix+= stride;
1294
22624
        block+= 8;
1295
    }
1296
1297
2828
    memset(_block, 0, sizeof(dctcoef) * 64);
1298
}
1299
1300
706
/**
 * Applies pred4x4_vertical_add to 16 sub-blocks.
 *
 * Sub-block i is located at pix + block_offset[i] and takes its
 * coefficients from block + i*16*sizeof(pixel).
 *
 * @param pix          base pointer that the block_offset[] entries are added to
 * @param block_offset per-sub-block offsets; entries 0..15 are read
 * @param block        coefficient buffer; cleared piecewise by the callee
 * @param stride       line stride, forwarded unchanged
 */
static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset,
1301
                                          int16_t *block,
1302
                                          ptrdiff_t stride)
1303
{
1304
    int i;
1305
12002
    for(i=0; i<16; i++)
1306
11296
        /* NOTE(review): the int16_t offset is scaled by sizeof(pixel),
         * presumably because dctcoef widens with the pixel type - confirm */
        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1307
}
1308
1309
544
/**
 * Applies pred4x4_horizontal_add to 16 sub-blocks.
 *
 * Sub-block i is located at pix + block_offset[i] and takes its
 * coefficients from block + i*16*sizeof(pixel).
 *
 * @param pix          base pointer that the block_offset[] entries are added to
 * @param block_offset per-sub-block offsets; entries 0..15 are read
 * @param block        coefficient buffer; cleared piecewise by the callee
 * @param stride       line stride, forwarded unchanged
 */
static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix,
1310
                                            const int *block_offset,
1311
                                            int16_t *block,
1312
                                            ptrdiff_t stride)
1313
{
1314
    int i;
1315
9248
    for(i=0; i<16; i++)
1316
8704
        /* NOTE(review): the int16_t offset is scaled by sizeof(pixel),
         * presumably because dctcoef widens with the pixel type - confirm */
        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1317
}
1318
1319
4516
/**
 * Applies pred4x4_vertical_add to 4 sub-blocks.
 *
 * Sub-block i is located at pix + block_offset[i] and takes its
 * coefficients from block + i*16*sizeof(pixel).
 *
 * @param pix          base pointer that the block_offset[] entries are added to
 * @param block_offset per-sub-block offsets; entries 0..3 are read
 * @param block        coefficient buffer; cleared piecewise by the callee
 * @param stride       line stride, forwarded unchanged
 */
static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset,
1320
                                        int16_t *block, ptrdiff_t stride)
1321
{
1322
    int i;
1323
22580
    for(i=0; i<4; i++)
1324
18064
        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1325
}
1326
1327
/**
 * Applies pred4x4_vertical_add to 8 sub-blocks.
 *
 * Coefficients for sub-block i always come from block + i*16*sizeof(pixel).
 * The destination offset is block_offset[i] for i = 0..3 and
 * block_offset[i+4] (entries 8..11) for i = 4..7.
 *
 * @param pix          base pointer that the block_offset[] entries are added to
 * @param block_offset per-sub-block offsets; entries 0..3 and 8..11 are read
 * @param block        coefficient buffer; cleared piecewise by the callee
 * @param stride       line stride, forwarded unchanged
 */
static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset,
1328
                                         int16_t *block, ptrdiff_t stride)
1329
{
1330
    int i;
1331
    for(i=0; i<4; i++)
1332
        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1333
    /* second half: offset table index skips 4 entries, coefficient index does not */
    for(i=4; i<8; i++)
1334
        FUNCC(pred4x4_vertical_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);
1335
}
1336
1337
5776
/**
 * Applies pred4x4_horizontal_add to 4 sub-blocks.
 *
 * Sub-block i is located at pix + block_offset[i] and takes its
 * coefficients from block + i*16*sizeof(pixel).
 *
 * @param pix          base pointer that the block_offset[] entries are added to
 * @param block_offset per-sub-block offsets; entries 0..3 are read
 * @param block        coefficient buffer; cleared piecewise by the callee
 * @param stride       line stride, forwarded unchanged
 */
static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset,
1338
                                          int16_t *block,
1339
                                          ptrdiff_t stride)
1340
{
1341
    int i;
1342
28880
    for(i=0; i<4; i++)
1343
23104
        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1344
}
1345
1346
/**
 * Applies pred4x4_horizontal_add to 8 sub-blocks.
 *
 * Coefficients for sub-block i always come from block + i*16*sizeof(pixel).
 * The destination offset is block_offset[i] for i = 0..3 and
 * block_offset[i+4] (entries 8..11) for i = 4..7.
 *
 * @param pix          base pointer that the block_offset[] entries are added to
 * @param block_offset per-sub-block offsets; entries 0..3 and 8..11 are read
 * @param block        coefficient buffer; cleared piecewise by the callee
 * @param stride       line stride, forwarded unchanged
 */
static void FUNCC(pred8x16_horizontal_add)(uint8_t *pix,
1347
                                           const int *block_offset,
1348
                                           int16_t *block, ptrdiff_t stride)
1349
{
1350
    int i;
1351
    for(i=0; i<4; i++)
1352
        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1353
    /* second half: offset table index skips 4 entries, coefficient index does not */
    for(i=4; i<8; i++)
1354
        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);
1355
}