GCC Code Coverage Report
Directory: ../../../ffmpeg/ Exec Total Coverage
File: src/libavcodec/h264pred_template.c Lines: 822 884 93.0 %
Date: 2019-11-18 18:00:01 Branches: 162 184 88.0 %

Line Branch Exec Source
1
/*
2
 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
3
 * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
4
 *
5
 * This file is part of FFmpeg.
6
 *
7
 * FFmpeg is free software; you can redistribute it and/or
8
 * modify it under the terms of the GNU Lesser General Public
9
 * License as published by the Free Software Foundation; either
10
 * version 2.1 of the License, or (at your option) any later version.
11
 *
12
 * FFmpeg is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
 * Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public
18
 * License along with FFmpeg; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
 */
21
22
/**
23
 * @file
24
 * H.264 / AVC / MPEG-4 part10 prediction functions.
25
 * @author Michael Niedermayer <michaelni@gmx.at>
26
 */
27
28
#include "libavutil/intreadwrite.h"
29
30
#include "mathops.h"
31
32
#include "bit_depth_template.c"
33
34
3277615
/**
 * 4x4 prediction that repeats the row above the block in every row.
 *
 * One pixel4 is loaded from src - stride with AV_RN4PA and stored with
 * AV_WN4PA at rows 0..3 of the block.
 *
 * @param _src     block origin; reinterpreted as pixel*
 * @param topright not used by this function
 * @param _stride  row stride; right-shifted by sizeof(pixel)-1 before use
 *                 (presumably a byte stride turned into pixel units -- pixel
 *                 is defined outside this view, confirm)
 */
static void FUNCC(pred4x4_vertical)(uint8_t *_src, const uint8_t *topright,
35
                                    ptrdiff_t _stride)
36
{
37
3277615
    pixel *src = (pixel*)_src;
38
3277615
    int stride = _stride>>(sizeof(pixel)-1);
39
3277615
    const pixel4 a= AV_RN4PA(src-stride); // the four samples directly above the block
40
41
3277615
    AV_WN4PA(src+0*stride, a);
42
3277615
    AV_WN4PA(src+1*stride, a);
43
3277615
    AV_WN4PA(src+2*stride, a);
44
3277615
    AV_WN4PA(src+3*stride, a);
45
3277615
}
46
47
4560114
/**
 * 4x4 prediction that fills each row with the sample to its left.
 *
 * For rows 0..3 the value src[-1 + row*stride] is expanded with
 * PIXEL_SPLAT_X4 and stored over the row with AV_WN4PA.
 *
 * @param _src     block origin; reinterpreted as pixel*
 * @param topright not used by this function
 * @param _stride  row stride; right-shifted by sizeof(pixel)-1 before use
 *                 (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred4x4_horizontal)(uint8_t *_src, const uint8_t *topright,
48
                                      ptrdiff_t _stride)
49
{
50
4560114
    pixel *src = (pixel*)_src;
51
4560114
    int stride = _stride>>(sizeof(pixel)-1);
52
4560114
    AV_WN4PA(src+0*stride, PIXEL_SPLAT_X4(src[-1+0*stride]));
53
4560114
    AV_WN4PA(src+1*stride, PIXEL_SPLAT_X4(src[-1+1*stride]));
54
4560114
    AV_WN4PA(src+2*stride, PIXEL_SPLAT_X4(src[-1+2*stride]));
55
4560114
    AV_WN4PA(src+3*stride, PIXEL_SPLAT_X4(src[-1+3*stride]));
56
4560114
}
57
58
2182455
/**
 * 4x4 DC prediction from both the top and the left edge.
 *
 * dc is the rounded mean of the four samples above the block and the four
 * samples left of it: (sum of 8 samples + 4) >> 3. The value is expanded with
 * PIXEL_SPLAT_X4 and written to all four rows.
 *
 * @param _src     block origin; reinterpreted as pixel*
 * @param topright not used by this function
 * @param _stride  row stride; right-shifted by sizeof(pixel)-1 before use
 *                 (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred4x4_dc)(uint8_t *_src, const uint8_t *topright,
59
                              ptrdiff_t _stride)
60
{
61
2182455
    pixel *src = (pixel*)_src;
62
2182455
    int stride = _stride>>(sizeof(pixel)-1);
63
2182455
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
64
2182455
                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
65
2182455
    const pixel4 a = PIXEL_SPLAT_X4(dc);
66
67
2182455
    AV_WN4PA(src+0*stride, a);
68
2182455
    AV_WN4PA(src+1*stride, a);
69
2182455
    AV_WN4PA(src+2*stride, a);
70
2182455
    AV_WN4PA(src+3*stride, a);
71
2182455
}
72
73
219641
/**
 * 4x4 DC prediction from the left edge only.
 *
 * dc is the rounded mean of the four samples in the column left of the
 * block: (sum + 2) >> 2. The splatted value is written to all four rows.
 *
 * @param _src     block origin; reinterpreted as pixel*
 * @param topright not used by this function
 * @param _stride  row stride; right-shifted by sizeof(pixel)-1 before use
 *                 (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred4x4_left_dc)(uint8_t *_src, const uint8_t *topright,
74
                                   ptrdiff_t _stride)
75
{
76
219641
    pixel *src = (pixel*)_src;
77
219641
    int stride = _stride>>(sizeof(pixel)-1);
78
219641
    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
79
219641
    const pixel4 a = PIXEL_SPLAT_X4(dc);
80
81
219641
    AV_WN4PA(src+0*stride, a);
82
219641
    AV_WN4PA(src+1*stride, a);
83
219641
    AV_WN4PA(src+2*stride, a);
84
219641
    AV_WN4PA(src+3*stride, a);
85
219641
}
86
87
40579
/**
 * 4x4 DC prediction from the top edge only.
 *
 * dc is the rounded mean of the four samples in the row above the block:
 * (sum + 2) >> 2. The splatted value is written to all four rows.
 *
 * @param _src     block origin; reinterpreted as pixel*
 * @param topright not used by this function
 * @param _stride  row stride; right-shifted by sizeof(pixel)-1 before use
 *                 (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred4x4_top_dc)(uint8_t *_src, const uint8_t *topright,
88
                                  ptrdiff_t _stride)
89
{
90
40579
    pixel *src = (pixel*)_src;
91
40579
    int stride = _stride>>(sizeof(pixel)-1);
92
40579
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
93
40579
    const pixel4 a = PIXEL_SPLAT_X4(dc);
94
95
40579
    AV_WN4PA(src+0*stride, a);
96
40579
    AV_WN4PA(src+1*stride, a);
97
40579
    AV_WN4PA(src+2*stride, a);
98
40579
    AV_WN4PA(src+3*stride, a);
99
40579
}
100
101
6788
/**
 * Fill a 4x4 block with the constant 1 << (BIT_DEPTH-1).
 *
 * No neighbouring samples are read; the constant is expanded with
 * PIXEL_SPLAT_X4 and written to rows 0..3.
 *
 * @param _src     block origin; reinterpreted as pixel*
 * @param topright not used by this function
 * @param _stride  row stride; right-shifted by sizeof(pixel)-1 before use
 *                 (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred4x4_128_dc)(uint8_t *_src, const uint8_t *topright,
102
                                  ptrdiff_t _stride)
103
{
104
6788
    pixel *src = (pixel*)_src;
105
6788
    int stride = _stride>>(sizeof(pixel)-1);
106
6788
    const pixel4 a = PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1));
107
108
6788
    AV_WN4PA(src+0*stride, a);
109
6788
    AV_WN4PA(src+1*stride, a);
110
6788
    AV_WN4PA(src+2*stride, a);
111
6788
    AV_WN4PA(src+3*stride, a);
112
6788
}
113
114
26
/**
 * Fill a 4x4 block with the constant (1 << (BIT_DEPTH-1)) - 1.
 *
 * No neighbouring samples are read; the constant is expanded with
 * PIXEL_SPLAT_X4 and written to rows 0..3.
 *
 * @param _src     block origin; reinterpreted as pixel*
 * @param topright not used by this function
 * @param _stride  row stride; right-shifted by sizeof(pixel)-1 before use
 *                 (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred4x4_127_dc)(uint8_t *_src, const uint8_t *topright,
115
                                  ptrdiff_t _stride)
116
{
117
26
    pixel *src = (pixel*)_src;
118
26
    int stride = _stride>>(sizeof(pixel)-1);
119
26
    const pixel4 a = PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1);
120
121
26
    AV_WN4PA(src+0*stride, a);
122
26
    AV_WN4PA(src+1*stride, a);
123
26
    AV_WN4PA(src+2*stride, a);
124
26
    AV_WN4PA(src+3*stride, a);
125
26
}
126
127
47
/**
 * Fill a 4x4 block with the constant (1 << (BIT_DEPTH-1)) + 1.
 *
 * No neighbouring samples are read; the constant is expanded with
 * PIXEL_SPLAT_X4 and written to rows 0..3.
 *
 * @param _src     block origin; reinterpreted as pixel*
 * @param topright not used by this function
 * @param _stride  row stride; right-shifted by sizeof(pixel)-1 before use
 *                 (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred4x4_129_dc)(uint8_t *_src, const uint8_t *topright,
128
                                  ptrdiff_t _stride)
129
{
130
47
    pixel *src = (pixel*)_src;
131
47
    int stride = _stride>>(sizeof(pixel)-1);
132
47
    const pixel4 a = PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1);
133
134
47
    AV_WN4PA(src+0*stride, a);
135
47
    AV_WN4PA(src+1*stride, a);
136
47
    AV_WN4PA(src+2*stride, a);
137
47
    AV_WN4PA(src+3*stride, a);
138
47
}
139
140
141
/* Declares t4..t7 from topright[0..3]. Expects a variable named `topright`
 * in the expanding scope; av_unused silences warnings for the values a
 * given caller does not read. */
#define LOAD_TOP_RIGHT_EDGE\
142
    const unsigned av_unused t4 = topright[0];\
143
    const unsigned av_unused t5 = topright[1];\
144
    const unsigned av_unused t6 = topright[2];\
145
    const unsigned av_unused t7 = topright[3];\
146
147
/* Declares l4..l7 from the column left of the block at rows 4..7
 * (src[-1 + y*stride]). Expects `src` and `stride` in the expanding scope. */
#define LOAD_DOWN_LEFT_EDGE\
148
    const unsigned av_unused l4 = src[-1+4*stride];\
149
    const unsigned av_unused l5 = src[-1+5*stride];\
150
    const unsigned av_unused l6 = src[-1+6*stride];\
151
    const unsigned av_unused l7 = src[-1+7*stride];\
152
153
/* Declares l0..l3 from the column left of the block at rows 0..3
 * (src[-1 + y*stride]). Expects `src` and `stride` in the expanding scope. */
#define LOAD_LEFT_EDGE\
154
    const unsigned av_unused l0 = src[-1+0*stride];\
155
    const unsigned av_unused l1 = src[-1+1*stride];\
156
    const unsigned av_unused l2 = src[-1+2*stride];\
157
    const unsigned av_unused l3 = src[-1+3*stride];\
158
159
/* Declares t0..t3 from the row above the block at columns 0..3
 * (src[x - stride]). Expects `src` and `stride` in the expanding scope. */
#define LOAD_TOP_EDGE\
160
    const unsigned av_unused t0 = src[ 0-1*stride];\
161
    const unsigned av_unused t1 = src[ 1-1*stride];\
162
    const unsigned av_unused t2 = src[ 2-1*stride];\
163
    const unsigned av_unused t3 = src[ 3-1*stride];\
164
165
1318801
/**
 * 4x4 diagonal prediction running from the top-left towards the bottom-right.
 *
 * Uses the left column (l0..l3), the top-left corner sample (lt) and the top
 * row (t0..t3). Every output is a rounded 3-tap value (a + 2*b + c + 2) >> 2
 * of three consecutive edge samples; the chained assignments give all pixels
 * with the same (column - row) difference the same value.
 *
 * @param _src     block origin; reinterpreted as pixel*
 * @param topright not used by this function
 * @param _stride  row stride; right-shifted by sizeof(pixel)-1 before use
 *                 (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred4x4_down_right)(uint8_t *_src, const uint8_t *topright,
166
                                      ptrdiff_t _stride)
167
{
168
1318801
    pixel *src = (pixel*)_src;
169
1318801
    int stride = _stride>>(sizeof(pixel)-1);
170
1318801
    const int lt= src[-1-1*stride]; // corner sample above-left of the block
171
1318801
    LOAD_TOP_EDGE
172
1318801
    LOAD_LEFT_EDGE
173
174
1318801
    src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
175
1318801
    src[0+2*stride]=
176
1318801
    src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
177
1318801
    src[0+1*stride]=
178
1318801
    src[1+2*stride]=
179
1318801
    src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
180
1318801
    src[0+0*stride]=
181
1318801
    src[1+1*stride]=
182
1318801
    src[2+2*stride]=
183
1318801
    src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
184
1318801
    src[1+0*stride]=
185
1318801
    src[2+1*stride]=
186
1318801
    src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
187
1318801
    src[2+0*stride]=
188
1318801
    src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
189
1318801
    src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
190
1318801
}
191
192
730676
/**
 * 4x4 diagonal prediction running from the top-right towards the bottom-left.
 *
 * Uses the top row (t0..t3) and four samples read through _topright
 * (t4..t7). Every output is a rounded 3-tap value (a + 2*b + c + 2) >> 2;
 * pixels with the same (column + row) sum share one value. The last pixel
 * (3,3) uses (t6 + 3*t7 + 2) >> 2 because no sample beyond t7 is loaded.
 *
 * @param _src      block origin; reinterpreted as pixel*
 * @param _topright four samples continuing the top row; reinterpreted as
 *                  const pixel*
 * @param _stride   row stride; right-shifted by sizeof(pixel)-1 before use
 *                  (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred4x4_down_left)(uint8_t *_src, const uint8_t *_topright,
193
                                     ptrdiff_t _stride)
194
{
195
730676
    pixel *src = (pixel*)_src;
196
730676
    const pixel *topright = (const pixel*)_topright;
197
730676
    int stride = _stride>>(sizeof(pixel)-1);
198
730676
    LOAD_TOP_EDGE
199
730676
    LOAD_TOP_RIGHT_EDGE
200
//    LOAD_LEFT_EDGE
201
202
730676
    src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
203
730676
    src[1+0*stride]=
204
730676
    src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
205
730676
    src[2+0*stride]=
206
730676
    src[1+1*stride]=
207
730676
    src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
208
730676
    src[3+0*stride]=
209
730676
    src[2+1*stride]=
210
730676
    src[1+2*stride]=
211
730676
    src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
212
730676
    src[3+1*stride]=
213
730676
    src[2+2*stride]=
214
730676
    src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
215
730676
    src[3+2*stride]=
216
730676
    src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
217
730676
    src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
218
730676
}
219
220
900489
/**
 * 4x4 "vertical right" directional prediction.
 *
 * Uses the top-left corner (lt), the top row (t0..t3) and the left column
 * (l0..l2; l3 is loaded but not read). Row 0 holds rounded 2-tap averages
 * (a + b + 1) >> 1 and row 1 rounded 3-tap values (a + 2*b + c + 2) >> 2;
 * rows 2 and 3 repeat rows 0 and 1 shifted right by one column, with their
 * column-0 pixels computed from lt, l0, l1 and l2.
 *
 * @param _src     block origin; reinterpreted as pixel*
 * @param topright not used by this function
 * @param _stride  row stride; right-shifted by sizeof(pixel)-1 before use
 *                 (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred4x4_vertical_right)(uint8_t *_src,
221
                                          const uint8_t *topright,
222
                                          ptrdiff_t _stride)
223
{
224
900489
    pixel *src = (pixel*)_src;
225
900489
    int stride = _stride>>(sizeof(pixel)-1);
226
900489
    const int lt= src[-1-1*stride]; // corner sample above-left of the block
227
900489
    LOAD_TOP_EDGE
228
900489
    LOAD_LEFT_EDGE
229
230
900489
    src[0+0*stride]=
231
900489
    src[1+2*stride]=(lt + t0 + 1)>>1;
232
900489
    src[1+0*stride]=
233
900489
    src[2+2*stride]=(t0 + t1 + 1)>>1;
234
900489
    src[2+0*stride]=
235
900489
    src[3+2*stride]=(t1 + t2 + 1)>>1;
236
900489
    src[3+0*stride]=(t2 + t3 + 1)>>1;
237
900489
    src[0+1*stride]=
238
900489
    src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
239
900489
    src[1+1*stride]=
240
900489
    src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
241
900489
    src[2+1*stride]=
242
900489
    src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
243
900489
    src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
244
900489
    src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
245
900489
    src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
246
900489
}
247
248
663343
/**
 * 4x4 "vertical left" directional prediction.
 *
 * Uses the top row (t0..t3) and the samples read through _topright (t4..t6;
 * t7 is loaded but not read). Row 0 holds rounded 2-tap averages
 * (a + b + 1) >> 1 and row 1 rounded 3-tap values (a + 2*b + c + 2) >> 2;
 * rows 2 and 3 repeat rows 0 and 1 shifted left by one column, with their
 * column-3 pixels computed from t4, t5 and t6.
 *
 * @param _src      block origin; reinterpreted as pixel*
 * @param _topright samples continuing the top row; reinterpreted as
 *                  const pixel*
 * @param _stride   row stride; right-shifted by sizeof(pixel)-1 before use
 *                  (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred4x4_vertical_left)(uint8_t *_src,
249
                                         const uint8_t *_topright,
250
                                         ptrdiff_t _stride)
251
{
252
663343
    pixel *src = (pixel*)_src;
253
663343
    const pixel *topright = (const pixel*)_topright;
254
663343
    int stride = _stride>>(sizeof(pixel)-1);
255
663343
    LOAD_TOP_EDGE
256
663343
    LOAD_TOP_RIGHT_EDGE
257
258
663343
    src[0+0*stride]=(t0 + t1 + 1)>>1;
259
663343
    src[1+0*stride]=
260
663343
    src[0+2*stride]=(t1 + t2 + 1)>>1;
261
663343
    src[2+0*stride]=
262
663343
    src[1+2*stride]=(t2 + t3 + 1)>>1;
263
663343
    src[3+0*stride]=
264
663343
    src[2+2*stride]=(t3 + t4+ 1)>>1;
265
663343
    src[3+2*stride]=(t4 + t5+ 1)>>1;
266
663343
    src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
267
663343
    src[1+1*stride]=
268
663343
    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
269
663343
    src[2+1*stride]=
270
663343
    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
271
663343
    src[3+1*stride]=
272
663343
    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
273
663343
    src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
274
663343
}
275
276
1169661
/**
 * 4x4 "horizontal up" directional prediction.
 *
 * Only the left column (l0..l3) is read. Outputs alternate between rounded
 * 2-tap averages (a + b + 1) >> 1 and rounded 3-tap values
 * (a + 2*b + c + 2) >> 2 of consecutive left samples. Where the filter would
 * need a sample below l3, l3 is used again, and the six remaining pixels
 * towards the bottom-right are set to l3 itself.
 *
 * @param _src     block origin; reinterpreted as pixel*
 * @param topright not used by this function
 * @param _stride  row stride; right-shifted by sizeof(pixel)-1 before use
 *                 (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred4x4_horizontal_up)(uint8_t *_src, const uint8_t *topright,
277
                                         ptrdiff_t _stride)
278
{
279
1169661
    pixel *src = (pixel*)_src;
280
1169661
    int stride = _stride>>(sizeof(pixel)-1);
281
1169661
    LOAD_LEFT_EDGE
282
283
1169661
    src[0+0*stride]=(l0 + l1 + 1)>>1;
284
1169661
    src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
285
1169661
    src[2+0*stride]=
286
1169661
    src[0+1*stride]=(l1 + l2 + 1)>>1;
287
1169661
    src[3+0*stride]=
288
1169661
    src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
289
1169661
    src[2+1*stride]=
290
1169661
    src[0+2*stride]=(l2 + l3 + 1)>>1;
291
1169661
    src[3+1*stride]=
292
1169661
    src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2; // l3 stands in for the missing sample below it
293
1169661
    src[3+2*stride]=
294
1169661
    src[1+3*stride]=
295
1169661
    src[0+3*stride]=
296
1169661
    src[2+2*stride]=
297
1169661
    src[2+3*stride]=
298
1169661
    src[3+3*stride]=l3;
299
1169661
}
300
301
1382672
/**
 * 4x4 "horizontal down" directional prediction.
 *
 * Uses the top-left corner (lt), the top row (t0..t2; t3 is loaded but not
 * read) and the left column (l0..l3). Outputs are rounded 2-tap averages
 * (a + b + 1) >> 1 or rounded 3-tap values (a + 2*b + c + 2) >> 2; the
 * chained assignments write one value to a pixel and to the pixel two
 * columns right and one row down.
 *
 * @param _src     block origin; reinterpreted as pixel*
 * @param topright not used by this function
 * @param _stride  row stride; right-shifted by sizeof(pixel)-1 before use
 *                 (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred4x4_horizontal_down)(uint8_t *_src,
302
                                           const uint8_t *topright,
303
                                           ptrdiff_t _stride)
304
{
305
1382672
    pixel *src = (pixel*)_src;
306
1382672
    int stride = _stride>>(sizeof(pixel)-1);
307
1382672
    const int lt= src[-1-1*stride]; // corner sample above-left of the block
308
1382672
    LOAD_TOP_EDGE
309
1382672
    LOAD_LEFT_EDGE
310
311
1382672
    src[0+0*stride]=
312
1382672
    src[2+1*stride]=(lt + l0 + 1)>>1;
313
1382672
    src[1+0*stride]=
314
1382672
    src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
315
1382672
    src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
316
1382672
    src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
317
1382672
    src[0+1*stride]=
318
1382672
    src[2+2*stride]=(l0 + l1 + 1)>>1;
319
1382672
    src[1+1*stride]=
320
1382672
    src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
321
1382672
    src[0+2*stride]=
322
1382672
    src[2+3*stride]=(l1 + l2+ 1)>>1;
323
1382672
    src[1+2*stride]=
324
1382672
    src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
325
1382672
    src[0+3*stride]=(l2 + l3 + 1)>>1;
326
1382672
    src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
327
1382672
}
328
329
215176
/**
 * 16x16 prediction that repeats the row above the block in every row.
 *
 * Four pixel4 groups (16 samples) are loaded from src - stride and stored
 * unchanged into each of the 16 rows.
 *
 * @param _src    block origin; reinterpreted as pixel*
 * @param _stride row stride; right-shifted by sizeof(pixel)-1 before use
 *                (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred16x16_vertical)(uint8_t *_src, ptrdiff_t _stride)
330
{
331
    int i;
332
215176
    pixel *src = (pixel*)_src;
333
215176
    int stride = _stride>>(sizeof(pixel)-1);
334
215176
    const pixel4 a = AV_RN4PA(((pixel4*)(src-stride))+0);
335
215176
    const pixel4 b = AV_RN4PA(((pixel4*)(src-stride))+1);
336
215176
    const pixel4 c = AV_RN4PA(((pixel4*)(src-stride))+2);
337
215176
    const pixel4 d = AV_RN4PA(((pixel4*)(src-stride))+3);
338
339
3657992
    for(i=0; i<16; i++){
340
3442816
        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
341
3442816
        AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
342
3442816
        AV_WN4PA(((pixel4*)(src+i*stride))+2, c);
343
3442816
        AV_WN4PA(((pixel4*)(src+i*stride))+3, d);
344
    }
345
215176
}
346
347
220381
/**
 * 16x16 prediction that fills each row with the sample to its left.
 *
 * For each of the 16 rows, src[-1 + i*stride] is expanded with
 * PIXEL_SPLAT_X4 and written four times to cover the 16 pixels of the row.
 *
 * @param _src   block origin; reinterpreted as pixel*
 * @param stride row stride; right-shifted in place by sizeof(pixel)-1
 *               (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred16x16_horizontal)(uint8_t *_src, ptrdiff_t stride)
348
{
349
    int i;
350
220381
    pixel *src = (pixel*)_src;
351
220381
    stride >>= sizeof(pixel)-1;
352
353
3746477
    for(i=0; i<16; i++){
354
3526096
        const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
355
356
3526096
        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
357
3526096
        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
358
3526096
        AV_WN4PA(((pixel4*)(src+i*stride))+2, a);
359
3526096
        AV_WN4PA(((pixel4*)(src+i*stride))+3, a);
360
    }
361
220381
}
362
363
/* Fills 16 rows with the pixel4 value v: four AV_WN4PA stores per row at
 * offsets 0, 4, 8 and 12, then src advances by stride. Uses and modifies the
 * variables `i` and `src` of the expanding scope and reads `stride`. */
#define PREDICT_16x16_DC(v)\
364
    for(i=0; i<16; i++){\
365
        AV_WN4PA(src+ 0, v);\
366
        AV_WN4PA(src+ 4, v);\
367
        AV_WN4PA(src+ 8, v);\
368
        AV_WN4PA(src+12, v);\
369
        src += stride;\
370
    }
371
372
156704
/**
 * 16x16 DC prediction from both the left and the top edge.
 *
 * Sums the 16 samples of the column left of the block and the 16 samples of
 * the row above it, rounds with (dc + 16) >> 5, and fills the block with the
 * splatted result through PREDICT_16x16_DC.
 *
 * @param _src   block origin; reinterpreted as pixel*
 * @param stride row stride; right-shifted in place by sizeof(pixel)-1
 *               (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred16x16_dc)(uint8_t *_src, ptrdiff_t stride)
373
{
374
156704
    int i, dc=0;
375
156704
    pixel *src = (pixel*)_src;
376
    pixel4 dcsplat;
377
156704
    stride >>= sizeof(pixel)-1;
378
379
2663968
    for(i=0;i<16; i++){
380
2507264
        dc+= src[-1+i*stride]; // left column
381
    }
382
383
2663968
    for(i=0;i<16; i++){
384
2507264
        dc+= src[i-stride]; // top row
385
    }
386
387
156704
    dcsplat = PIXEL_SPLAT_X4((dc+16)>>5);
388
2663968
    PREDICT_16x16_DC(dcsplat);
389
156704
}
390
391
17042
/**
 * 16x16 DC prediction from the left edge only.
 *
 * Sums the 16 samples of the column left of the block, rounds with
 * (dc + 8) >> 4, and fills the block with the splatted result through
 * PREDICT_16x16_DC.
 *
 * @param _src   block origin; reinterpreted as pixel*
 * @param stride row stride; right-shifted in place by sizeof(pixel)-1
 *               (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred16x16_left_dc)(uint8_t *_src, ptrdiff_t stride)
392
{
393
17042
    int i, dc=0;
394
17042
    pixel *src = (pixel*)_src;
395
    pixel4 dcsplat;
396
17042
    stride >>= sizeof(pixel)-1;
397
398
289714
    for(i=0;i<16; i++){
399
272672
        dc+= src[-1+i*stride];
400
    }
401
402
17042
    dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
403
289714
    PREDICT_16x16_DC(dcsplat);
404
17042
}
405
406
4221
/**
 * 16x16 DC prediction from the top edge only.
 *
 * Sums the 16 samples of the row above the block, rounds with
 * (dc + 8) >> 4, and fills the block with the splatted result through
 * PREDICT_16x16_DC.
 *
 * @param _src   block origin; reinterpreted as pixel*
 * @param stride row stride; right-shifted in place by sizeof(pixel)-1
 *               (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred16x16_top_dc)(uint8_t *_src, ptrdiff_t stride)
407
{
408
4221
    int i, dc=0;
409
4221
    pixel *src = (pixel*)_src;
410
    pixel4 dcsplat;
411
4221
    stride >>= sizeof(pixel)-1;
412
413
71757
    for(i=0;i<16; i++){
414
67536
        dc+= src[i-stride];
415
    }
416
417
4221
    dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
418
71757
    PREDICT_16x16_DC(dcsplat);
419
4221
}
420
421
/* Generates a function pred16x16_<n>_dc that fills a 16x16 block with the
 * constant v (splatted with PIXEL_SPLAT_X4) through PREDICT_16x16_DC; no
 * neighbouring samples are read. It is instantiated below for
 * (1<<(BIT_DEPTH-1)) - 1, + 0 and + 1 under the names 127, 128 and 129. */
#define PRED16x16_X(n, v) \
422
static void FUNCC(pred16x16_##n##_dc)(uint8_t *_src, ptrdiff_t stride)\
423
{\
424
    int i;\
425
    pixel *src = (pixel*)_src;\
426
    stride >>= sizeof(pixel)-1;\
427
    PREDICT_16x16_DC(PIXEL_SPLAT_X4(v));\
428
}
429
430
204
PRED16x16_X(127, (1<<(BIT_DEPTH-1))-1)
431
29682
PRED16x16_X(128, (1<<(BIT_DEPTH-1))+0)
432
170
PRED16x16_X(129, (1<<(BIT_DEPTH-1))+1)
433
434
88139
/**
 * 16x16 plane prediction with selectable gradient scaling.
 *
 * H is the weighted sum over k = 1..8 of k * (top[7+k] - top[7-k]) taken from
 * the row above the block, and V is the same sum taken down the column left
 * of the block. The two gradients are then scaled in one of three ways:
 *  - svq3 != 0: (5*(x/4))/16 with truncating division, then H and V swapped;
 *  - else rv40 != 0: (x + (x>>2)) >> 4;
 *  - otherwise: (5*x + 32) >> 6.
 * Each output pixel is CLIP((a + x*H + y*V) >> 5), where a is derived from
 * the bottom sample of the left column and the right-most sample of the top
 * row. INIT_CLIP and CLIP are defined outside this view.
 *
 * @param _src    block origin; reinterpreted as pixel*
 * @param _stride row stride; right-shifted by sizeof(pixel)-1 before use
 *                (presumably bytes -> pixel units, confirm)
 * @param svq3    non-zero selects the first scaling variant
 * @param rv40    non-zero selects the second variant when svq3 is zero
 */
static inline void FUNCC(pred16x16_plane_compat)(uint8_t *_src,
435
                                                 ptrdiff_t _stride,
436
                                                 const int svq3,
437
                                                 const int rv40)
438
{
439
  int i, j, k;
440
  int a;
441
  INIT_CLIP
442
88139
  pixel *src = (pixel*)_src;
443
88139
  int stride = _stride>>(sizeof(pixel)-1);
444
88139
  const pixel * const src0 = src +7-stride; // column 7 of the row above
445
88139
  const pixel *       src1 = src +8*stride-1; // left column, row 8; walks down
446
88139
  const pixel *       src2 = src1-2*stride;    // == src+6*stride-1;
447
88139
  int H = src0[1] - src0[-1];
448
88139
  int V = src1[0] - src2[ 0];
449
705112
  for(k=2; k<=8; ++k) {
450
616973
    src1 += stride; src2 -= stride;
451
616973
    H += k*(src0[k] - src0[-k]);
452
616973
    V += k*(src1[0] - src2[ 0]);
453
  }
454
88139
  if(svq3){
455
56
    H = ( 5*(H/4) ) / 16;
456
56
    V = ( 5*(V/4) ) / 16;
457
458
    /* required for 100% accuracy */
459
56
    i = H; H = V; V = i;
460
88083
  }else if(rv40){
461
1216
    H = ( H + (H>>2) ) >> 4;
462
1216
    V = ( V + (V>>2) ) >> 4;
463
  }else{
464
86867
    H = ( 5*H+32 ) >> 6;
465
86867
    V = ( 5*V+32 ) >> 6;
466
  }
467
468
  /* after the loop src1 is at row 15 of the left column and src2 at the
   * sample above-left of the block, so src2[16] is column 15 of the top row */
88139
  a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
469
1498363
  for(j=16; j>0; --j) {
470
1410224
    int b = a;
471
1410224
    a += V;
472
7051120
    for(i=-16; i<0; i+=4) {
473
5640896
      src[16+i] = CLIP((b    ) >> 5);
474
5640896
      src[17+i] = CLIP((b+  H) >> 5);
475
5640896
      src[18+i] = CLIP((b+2*H) >> 5);
476
5640896
      src[19+i] = CLIP((b+3*H) >> 5);
477
5640896
      b += 4*H;
478
    }
479
1410224
    src += stride;
480
  }
481
88139
}
482
483
86867
/**
 * 16x16 plane prediction using the default gradient scaling.
 *
 * Forwards to pred16x16_plane_compat with svq3 = 0 and rv40 = 0.
 *
 * @param src    block origin, passed through unchanged
 * @param stride row stride, passed through unchanged
 */
static void FUNCC(pred16x16_plane)(uint8_t *src, ptrdiff_t stride)
484
{
485
86867
    FUNCC(pred16x16_plane_compat)(src, stride, 0, 0);
486
86867
}
487
488
593605
/**
 * 8x8 prediction that repeats the row above the block in every row.
 *
 * Two pixel4 groups (8 samples) are loaded from src - stride and stored
 * unchanged into each of the 8 rows.
 *
 * @param _src    block origin; reinterpreted as pixel*
 * @param _stride row stride; right-shifted by sizeof(pixel)-1 before use
 *                (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred8x8_vertical)(uint8_t *_src, ptrdiff_t _stride)
489
{
490
    int i;
491
593605
    pixel *src = (pixel*)_src;
492
593605
    int stride = _stride>>(sizeof(pixel)-1);
493
593605
    const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0);
494
593605
    const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1);
495
496
5342445
    for(i=0; i<8; i++){
497
4748840
        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
498
4748840
        AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
499
    }
500
593605
}
501
502
2638
/**
 * 8-wide, 16-row prediction that repeats the row above the block.
 *
 * Two pixel4 groups (8 samples) are loaded from src - stride and stored
 * unchanged into each of the 16 rows.
 *
 * @param _src    block origin; reinterpreted as pixel*
 * @param _stride row stride; right-shifted by sizeof(pixel)-1 before use
 *                (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred8x16_vertical)(uint8_t *_src, ptrdiff_t _stride)
503
{
504
    int i;
505
2638
    pixel *src = (pixel*)_src;
506
2638
    int stride = _stride>>(sizeof(pixel)-1);
507
2638
    const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0);
508
2638
    const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1);
509
510
44846
    for(i=0; i<16; i++){
511
42208
        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
512
42208
        AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
513
    }
514
2638
}
515
516
1045557
/**
 * 8x8 prediction that fills each row with the sample to its left.
 *
 * For each of the 8 rows, src[-1 + i*stride] is expanded with
 * PIXEL_SPLAT_X4 and written twice to cover the 8 pixels of the row.
 *
 * @param _src   block origin; reinterpreted as pixel*
 * @param stride row stride; right-shifted in place by sizeof(pixel)-1
 *               (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred8x8_horizontal)(uint8_t *_src, ptrdiff_t stride)
517
{
518
    int i;
519
1045557
    pixel *src = (pixel*)_src;
520
1045557
    stride >>= sizeof(pixel)-1;
521
522
9410013
    for(i=0; i<8; i++){
523
8364456
        const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
524
8364456
        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
525
8364456
        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
526
    }
527
1045557
}
528
529
2186
/**
 * 8-wide, 16-row prediction that fills each row with the sample to its left.
 *
 * For each of the 16 rows, src[-1 + i*stride] is expanded with
 * PIXEL_SPLAT_X4 and written twice to cover the 8 pixels of the row.
 *
 * @param _src   block origin; reinterpreted as pixel*
 * @param stride row stride; right-shifted in place by sizeof(pixel)-1
 *               (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred8x16_horizontal)(uint8_t *_src, ptrdiff_t stride)
530
{
531
    int i;
532
2186
    pixel *src = (pixel*)_src;
533
2186
    stride >>= sizeof(pixel)-1;
534
37162
    for(i=0; i<16; i++){
535
34976
        const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
536
34976
        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
537
34976
        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
538
    }
539
2186
}
540
541
/* Generates a function pred8x8_<n>_dc that fills an 8x8 block with the
 * constant v (splatted with PIXEL_SPLAT_X4), two pixel4 stores per row; no
 * neighbouring samples are read. It is instantiated below for
 * (1<<(BIT_DEPTH-1)) - 1, + 0 and + 1 under the names 127, 128 and 129. */
#define PRED8x8_X(n, v)\
542
static void FUNCC(pred8x8_##n##_dc)(uint8_t *_src, ptrdiff_t stride)\
543
{\
544
    int i;\
545
    const pixel4 a = PIXEL_SPLAT_X4(v);\
546
    pixel *src = (pixel*)_src;\
547
    stride >>= sizeof(pixel)-1;\
548
    for(i=0; i<8; i++){\
549
        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);\
550
        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);\
551
    }\
552
}
553
554
1026
PRED8x8_X(127, (1<<(BIT_DEPTH-1))-1)
555
1119438
PRED8x8_X(128, (1<<(BIT_DEPTH-1))+0)
556
2412
PRED8x8_X(129, (1<<(BIT_DEPTH-1))+1)
557
558
42
/**
 * Fill an 8-wide, 16-row block with the constant used by pred8x8_128_dc.
 *
 * Calls pred8x8_128_dc twice: once at _src and once at _src + 8*stride.
 * _src is a uint8_t pointer and stride is used unshifted here, so the second
 * call starts 8*stride bytes further on.
 *
 * @param _src   block origin
 * @param stride row stride, passed through unchanged
 */
static void FUNCC(pred8x16_128_dc)(uint8_t *_src, ptrdiff_t stride)
559
{
560
42
    FUNCC(pred8x8_128_dc)(_src, stride);
561
42
    FUNCC(pred8x8_128_dc)(_src+8*stride, stride);
562
42
}
563
564
251474
/**
 * 8x8 DC prediction from the left edge, computed separately per half.
 *
 * dc0 sums the left-column samples of rows 0..3 and dc2 those of rows 4..7.
 * Rows 0..3 are filled across all 8 pixels with (dc0 + 2) >> 2 and rows 4..7
 * with (dc2 + 2) >> 2.
 *
 * @param _src   block origin; reinterpreted as pixel*
 * @param stride row stride; right-shifted in place by sizeof(pixel)-1
 *               (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred8x8_left_dc)(uint8_t *_src, ptrdiff_t stride)
565
{
566
    int i;
567
    int dc0, dc2;
568
    pixel4 dc0splat, dc2splat;
569
251474
    pixel *src = (pixel*)_src;
570
251474
    stride >>= sizeof(pixel)-1;
571
572
251474
    dc0=dc2=0;
573
1257370
    for(i=0;i<4; i++){
574
1005896
        dc0+= src[-1+i*stride];
575
1005896
        dc2+= src[-1+(i+4)*stride];
576
    }
577
251474
    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
578
251474
    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
579
580
1257370
    for(i=0; i<4; i++){
581
1005896
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
582
1005896
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc0splat);
583
    }
584
1257370
    for(i=4; i<8; i++){
585
1005896
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
586
1005896
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc2splat);
587
    }
588
251474
}
589
590
770
/**
 * 8-wide, 16-row left-edge DC prediction.
 *
 * Calls pred8x8_left_dc twice: once at _src and once at _src + 8*stride.
 * _src is a uint8_t pointer and stride is used unshifted here, so the second
 * call starts 8*stride bytes further on.
 *
 * @param _src   block origin
 * @param stride row stride, passed through unchanged
 */
static void FUNCC(pred8x16_left_dc)(uint8_t *_src, ptrdiff_t stride)
591
{
592
770
    FUNCC(pred8x8_left_dc)(_src, stride);
593
770
    FUNCC(pred8x8_left_dc)(_src+8*stride, stride);
594
770
}
595
596
69745
/**
 * 8x8 DC prediction from the top edge, computed separately per half.
 *
 * dc0 sums the top-row samples of columns 0..3 and dc1 those of columns
 * 4..7. In every row, columns 0..3 receive (dc0 + 2) >> 2 and columns 4..7
 * receive (dc1 + 2) >> 2; the two row loops (0..3 and 4..7) store the same
 * pair of values.
 *
 * @param _src   block origin; reinterpreted as pixel*
 * @param stride row stride; right-shifted in place by sizeof(pixel)-1
 *               (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred8x8_top_dc)(uint8_t *_src, ptrdiff_t stride)
597
{
598
    int i;
599
    int dc0, dc1;
600
    pixel4 dc0splat, dc1splat;
601
69745
    pixel *src = (pixel*)_src;
602
69745
    stride >>= sizeof(pixel)-1;
603
604
69745
    dc0=dc1=0;
605
348725
    for(i=0;i<4; i++){
606
278980
        dc0+= src[i-stride];
607
278980
        dc1+= src[4+i-stride];
608
    }
609
69745
    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
610
69745
    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
611
612
348725
    for(i=0; i<4; i++){
613
278980
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
614
278980
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
615
    }
616
348725
    for(i=4; i<8; i++){
617
278980
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
618
278980
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
619
    }
620
69745
}
621
622
316
/**
 * 8-wide, 16-row DC prediction from the top edge, computed per half.
 *
 * dc0 sums the top-row samples of columns 0..3 and dc1 those of columns
 * 4..7. In each of the 16 rows, columns 0..3 receive (dc0 + 2) >> 2 and
 * columns 4..7 receive (dc1 + 2) >> 2.
 *
 * @param _src   block origin; reinterpreted as pixel*
 * @param stride row stride; right-shifted in place by sizeof(pixel)-1
 *               (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred8x16_top_dc)(uint8_t *_src, ptrdiff_t stride)
623
{
624
    int i;
625
    int dc0, dc1;
626
    pixel4 dc0splat, dc1splat;
627
316
    pixel *src = (pixel*)_src;
628
316
    stride >>= sizeof(pixel)-1;
629
630
316
    dc0=dc1=0;
631
1580
    for(i=0;i<4; i++){
632
1264
        dc0+= src[i-stride];
633
1264
        dc1+= src[4+i-stride];
634
    }
635
316
    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
636
316
    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
637
638
5372
    for(i=0; i<16; i++){
639
5056
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
640
5056
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
641
    }
642
316
}
643
644
1958421
/**
 * 8x8 DC prediction computed separately for each 4x4 quadrant.
 *
 * Partial sums: dc0 = left rows 0..3 plus top columns 0..3, dc1 = top
 * columns 4..7, dc2 = left rows 4..7. Quadrant values:
 *  - top-left:     (dc0 + 4) >> 3
 *  - top-right:    (dc1 + 2) >> 2
 *  - bottom-left:  (dc2 + 2) >> 2
 *  - bottom-right: (dc1 + dc2 + 4) >> 3
 *
 * @param _src   block origin; reinterpreted as pixel*
 * @param stride row stride; right-shifted in place by sizeof(pixel)-1
 *               (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred8x8_dc)(uint8_t *_src, ptrdiff_t stride)
645
{
646
    int i;
647
    int dc0, dc1, dc2;
648
    pixel4 dc0splat, dc1splat, dc2splat, dc3splat;
649
1958421
    pixel *src = (pixel*)_src;
650
1958421
    stride >>= sizeof(pixel)-1;
651
652
1958421
    dc0=dc1=dc2=0;
653
9792105
    for(i=0;i<4; i++){
654
7833684
        dc0+= src[-1+i*stride] + src[i-stride];
655
7833684
        dc1+= src[4+i-stride];
656
7833684
        dc2+= src[-1+(i+4)*stride];
657
    }
658
1958421
    dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
659
1958421
    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
660
1958421
    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
661
1958421
    dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);
662
663
9792105
    for(i=0; i<4; i++){
664
7833684
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
665
7833684
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
666
    }
667
9792105
    for(i=4; i<8; i++){
668
7833684
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
669
7833684
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc3splat);
670
    }
671
1958421
}
672
673
8160
/**
 * 8-wide, 16-row DC prediction computed separately for each 4x4 sub-block.
 *
 * Partial sums: dc0 = left rows 0..3 plus top columns 0..3, dc1 = top
 * columns 4..7, dc2/dc3/dc4 = left rows 4..7 / 8..11 / 12..15.
 * Values per band of four rows (left half, right half):
 *  - rows 0..3:   (dc0 + 4) >> 3,  (dc1 + 2) >> 2
 *  - rows 4..7:   (dc2 + 2) >> 2,  (dc1 + dc2 + 4) >> 3
 *  - rows 8..11:  (dc3 + 2) >> 2,  (dc1 + dc3 + 4) >> 3
 *  - rows 12..15: (dc4 + 2) >> 2,  (dc1 + dc4 + 4) >> 3
 *
 * @param _src   block origin; reinterpreted as pixel*
 * @param stride row stride; right-shifted in place by sizeof(pixel)-1
 *               (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred8x16_dc)(uint8_t *_src, ptrdiff_t stride)
674
{
675
    int i;
676
    int dc0, dc1, dc2, dc3, dc4;
677
    pixel4 dc0splat, dc1splat, dc2splat, dc3splat, dc4splat, dc5splat, dc6splat, dc7splat;
678
8160
    pixel *src = (pixel*)_src;
679
8160
    stride >>= sizeof(pixel)-1;
680
681
8160
    dc0=dc1=dc2=dc3=dc4=0;
682
40800
    for(i=0;i<4; i++){
683
32640
        dc0+= src[-1+i*stride] + src[i-stride];
684
32640
        dc1+= src[4+i-stride];
685
32640
        dc2+= src[-1+(i+4)*stride];
686
32640
        dc3+= src[-1+(i+8)*stride];
687
32640
        dc4+= src[-1+(i+12)*stride];
688
    }
689
8160
    dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
690
8160
    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
691
8160
    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
692
8160
    dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);
693
8160
    dc4splat = PIXEL_SPLAT_X4((dc3 + 2)>>2);
694
8160
    dc5splat = PIXEL_SPLAT_X4((dc1 + dc3 + 4)>>3);
695
8160
    dc6splat = PIXEL_SPLAT_X4((dc4 + 2)>>2);
696
8160
    dc7splat = PIXEL_SPLAT_X4((dc1 + dc4 + 4)>>3);
697
698
40800
    for(i=0; i<4; i++){
699
32640
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
700
32640
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
701
    }
702
40800
    for(i=4; i<8; i++){
703
32640
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
704
32640
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc3splat);
705
    }
706
40800
    for(i=8; i<12; i++){
707
32640
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc4splat);
708
32640
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc5splat);
709
    }
710
40800
    for(i=12; i<16; i++){
711
32640
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc6splat);
712
32640
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc7splat);
713
    }
714
8160
}
715
716
//the following 4 functions should not be optimized!
717
30
/**
 * Composite 8x8 DC predictor built from two existing predictors.
 *
 * Runs pred8x8_top_dc over the whole block, then pred4x4_dc (with a NULL
 * topright) at src, which overwrites the top-left 4x4 area.
 *
 * @param src    block origin, passed through unchanged
 * @param stride row stride, passed through unchanged
 */
static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, ptrdiff_t stride)
718
{
719
30
    FUNCC(pred8x8_top_dc)(src, stride);
720
30
    FUNCC(pred4x4_dc)(src, NULL, stride);
721
30
}
722
723
2
/**
 * Composite 8-wide, 16-row DC predictor built from two existing predictors.
 *
 * Runs pred8x16_top_dc over the whole block, then pred4x4_dc (with a NULL
 * topright) at src, which overwrites the top-left 4x4 area.
 *
 * @param src    block origin, passed through unchanged
 * @param stride row stride, passed through unchanged
 */
static void FUNC(pred8x16_mad_cow_dc_l0t)(uint8_t *src, ptrdiff_t stride)
724
{
725
2
    FUNCC(pred8x16_top_dc)(src, stride);
726
2
    FUNCC(pred4x4_dc)(src, NULL, stride);
727
2
}
728
729
14
/**
 * Composite 8x8 DC predictor built from two existing predictors.
 *
 * Runs pred8x8_dc over the whole block, then pred4x4_top_dc (with a NULL
 * topright) at src, which overwrites the top-left 4x4 area.
 *
 * @param src    block origin, passed through unchanged
 * @param stride row stride, passed through unchanged
 */
static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, ptrdiff_t stride)
730
{
731
14
    FUNCC(pred8x8_dc)(src, stride);
732
14
    FUNCC(pred4x4_top_dc)(src, NULL, stride);
733
14
}
734
735
2
/**
 * Composite 8-wide, 16-row DC predictor built from two existing predictors.
 *
 * Runs pred8x16_dc over the whole block, then pred4x4_top_dc (with a NULL
 * topright) at src, which overwrites the top-left 4x4 area.
 *
 * @param src    block origin, passed through unchanged
 * @param stride row stride, passed through unchanged
 */
static void FUNC(pred8x16_mad_cow_dc_0lt)(uint8_t *src, ptrdiff_t stride)
736
{
737
2
    FUNCC(pred8x16_dc)(src, stride);
738
2
    FUNCC(pred4x4_top_dc)(src, NULL, stride);
739
2
}
740
741
6
/**
 * Composite 8x8 DC predictor: left DC with a constant lower half.
 *
 * Runs pred8x8_left_dc over the whole block, then pred4x4_128_dc on the two
 * 4x4 areas that start four rows down (src + 4*stride, and the same address
 * plus 4*sizeof(pixel)). src is a uint8_t pointer and stride is unshifted
 * here, so both offsets are applied in bytes.
 *
 * @param src    block origin
 * @param stride row stride, passed through unchanged
 */
static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, ptrdiff_t stride)
742
{
743
6
    FUNCC(pred8x8_left_dc)(src, stride);
744
6
    FUNCC(pred4x4_128_dc)(src + 4*stride                  , NULL, stride);
745
6
    FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
746
6
}
747
748
2
/**
 * Composite 8-wide, 16-row DC predictor: left DC with constant rows 4..7.
 *
 * Runs pred8x16_left_dc over the whole block, then pred4x4_128_dc on the two
 * 4x4 areas that start four rows down (src + 4*stride, and the same address
 * plus 4*sizeof(pixel)). src is a uint8_t pointer and stride is unshifted
 * here, so both offsets are applied in bytes.
 *
 * @param src    block origin
 * @param stride row stride, passed through unchanged
 */
static void FUNC(pred8x16_mad_cow_dc_l00)(uint8_t *src, ptrdiff_t stride)
749
{
750
2
    FUNCC(pred8x16_left_dc)(src, stride);
751
2
    FUNCC(pred4x4_128_dc)(src + 4*stride                  , NULL, stride);
752
2
    FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
753
2
}
754
755
12
/**
 * Composite 8x8 DC predictor: left DC with a constant upper half.
 *
 * Runs pred8x8_left_dc over the whole block, then pred4x4_128_dc on the two
 * 4x4 areas of the first four rows (src, and src + 4*sizeof(pixel)). src is
 * a uint8_t pointer, so the second offset is applied in bytes.
 *
 * @param src    block origin
 * @param stride row stride, passed through unchanged
 */
static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, ptrdiff_t stride)
756
{
757
12
    FUNCC(pred8x8_left_dc)(src, stride);
758
12
    FUNCC(pred4x4_128_dc)(src                  , NULL, stride);
759
12
    FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
760
12
}
761
762
2
/**
 * Composite 8-wide, 16-row DC predictor: left DC with constant rows 0..3.
 *
 * Runs pred8x16_left_dc over the whole block, then pred4x4_128_dc on the two
 * 4x4 areas of the first four rows (src, and src + 4*sizeof(pixel)). src is
 * a uint8_t pointer, so the second offset is applied in bytes.
 *
 * @param src    block origin
 * @param stride row stride, passed through unchanged
 */
static void FUNC(pred8x16_mad_cow_dc_0l0)(uint8_t *src, ptrdiff_t stride)
763
{
764
2
    FUNCC(pred8x16_left_dc)(src, stride);
765
2
    FUNCC(pred4x4_128_dc)(src                  , NULL, stride);
766
2
    FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
767
2
}
768
769
353553
/**
 * 8x8 plane prediction.
 *
 * H is the weighted sum over k = 1..4 of k * (top[3+k] - top[3-k]) taken from
 * the row above the block, and V is the same sum taken down the column left
 * of the block. Both are scaled with (17*x + 16) >> 5. Each output pixel is
 * CLIP((a + x*H + y*V) >> 5), where a is derived from the bottom sample of
 * the left column and the right-most sample of the top row. INIT_CLIP and
 * CLIP are defined outside this view.
 *
 * @param _src    block origin; reinterpreted as pixel*
 * @param _stride row stride; right-shifted by sizeof(pixel)-1 before use
 *                (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred8x8_plane)(uint8_t *_src, ptrdiff_t _stride)
770
{
771
  int j, k;
772
  int a;
773
  INIT_CLIP
774
353553
  pixel *src = (pixel*)_src;
775
353553
  int stride = _stride>>(sizeof(pixel)-1);
776
353553
  const pixel * const src0 = src +3-stride; // column 3 of the row above
777
353553
  const pixel *       src1 = src +4*stride-1; // left column, row 4; walks down
778
353553
  const pixel *       src2 = src1-2*stride;    // == src+2*stride-1;
779
353553
  int H = src0[1] - src0[-1];
780
353553
  int V = src1[0] - src2[ 0];
781
1414212
  for(k=2; k<=4; ++k) {
782
1060659
    src1 += stride; src2 -= stride;
783
1060659
    H += k*(src0[k] - src0[-k]);
784
1060659
    V += k*(src1[0] - src2[ 0]);
785
  }
786
353553
  H = ( 17*H+16 ) >> 5;
787
353553
  V = ( 17*V+16 ) >> 5;
788
789
  /* after the loop src1 is at row 7 of the left column and src2 at the
   * sample above-left of the block, so src2[8] is column 7 of the top row */
353553
  a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
790
3181977
  for(j=8; j>0; --j) {
791
2828424
    int b = a;
792
2828424
    a += V;
793
2828424
    src[0] = CLIP((b    ) >> 5);
794
2828424
    src[1] = CLIP((b+  H) >> 5);
795
2828424
    src[2] = CLIP((b+2*H) >> 5);
796
2828424
    src[3] = CLIP((b+3*H) >> 5);
797
2828424
    src[4] = CLIP((b+4*H) >> 5);
798
2828424
    src[5] = CLIP((b+5*H) >> 5);
799
2828424
    src[6] = CLIP((b+6*H) >> 5);
800
2828424
    src[7] = CLIP((b+7*H) >> 5);
801
2828424
    src += stride;
802
  }
803
353553
}
804
805
1750
/**
 * 8-wide, 16-row plane prediction.
 *
 * H is the weighted sum over k = 1..4 of k * (top[3+k] - top[3-k]) taken from
 * the row above the block; V is the weighted sum over k = 1..8 taken down
 * the column left of the block (the second loop continues V alone for
 * k = 5..8). H is scaled with (17*H + 16) >> 5 and V with (5*V + 32) >> 6.
 * Each output pixel is CLIP((a + x*H + y*V) >> 5), where a is derived from
 * the bottom sample of the left column and the right-most sample of the top
 * row. INIT_CLIP and CLIP are defined outside this view.
 *
 * @param _src    block origin; reinterpreted as pixel*
 * @param _stride row stride; right-shifted by sizeof(pixel)-1 before use
 *                (presumably bytes -> pixel units, confirm)
 */
static void FUNCC(pred8x16_plane)(uint8_t *_src, ptrdiff_t _stride)
806
{
807
  int j, k;
808
  int a;
809
  INIT_CLIP
810
1750
  pixel *src = (pixel*)_src;
811
1750
  int stride = _stride>>(sizeof(pixel)-1);
812
1750
  const pixel * const src0 = src +3-stride; // column 3 of the row above
813
1750
  const pixel *       src1 = src +8*stride-1; // left column, row 8; walks down
814
1750
  const pixel *       src2 = src1-2*stride;    // == src+6*stride-1;
815
1750
  int H = src0[1] - src0[-1];
816
1750
  int V = src1[0] - src2[ 0];
817
818
7000
  for (k = 2; k <= 4; ++k) {
819
5250
      src1 += stride; src2 -= stride;
820
5250
      H += k*(src0[k] - src0[-k]);
821
5250
      V += k*(src1[0] - src2[ 0]);
822
  }
823
8750
  for (; k <= 8; ++k) {
824
7000
      src1 += stride; src2 -= stride;
825
7000
      V += k*(src1[0] - src2[0]);
826
  }
827
828
1750
  H = (17*H+16) >> 5;
829
1750
  V = (5*V+32) >> 6;
830
831
  /* after the loops src1 is at row 15 of the left column and src2 at the
   * sample above-left of the block, so src2[8] is column 7 of the top row */
1750
  a = 16*(src1[0] + src2[8] + 1) - 7*V - 3*H;
832
29750
  for(j=16; j>0; --j) {
833
28000
    int b = a;
834
28000
    a += V;
835
28000
    src[0] = CLIP((b    ) >> 5);
836
28000
    src[1] = CLIP((b+  H) >> 5);
837
28000
    src[2] = CLIP((b+2*H) >> 5);
838
28000
    src[3] = CLIP((b+3*H) >> 5);
839
28000
    src[4] = CLIP((b+4*H) >> 5);
840
28000
    src[5] = CLIP((b+5*H) >> 5);
841
28000
    src[6] = CLIP((b+6*H) >> 5);
842
28000
    src[7] = CLIP((b+7*H) >> 5);
843
28000
    src += stride;
844
  }
845
1750
}
846
847
/* SRC(x,y) addresses the sample at column x, row y relative to src.
 * PL(y) declares l<y> as the rounded 3-tap value (above + 2*centre + below
 * + 2) >> 2 of the left-column samples around row y.
 * PREDICT_8x8_LOAD_LEFT declares l0..l7: l0 uses SRC(-1,-1) as its upper
 * neighbour when has_topleft is non-zero and SRC(-1,0) otherwise; l7 uses
 * (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2 since no row-8 sample is read.
 * Expects `src`, `stride` and `has_topleft` in the expanding scope. */
#define SRC(x,y) src[(x)+(y)*stride]
848
#define PL(y) \
849
    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
850
#define PREDICT_8x8_LOAD_LEFT \
851
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
852
                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
853
    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
854
    const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2
855
856
#define PT(x) \
857
    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
858
#define PREDICT_8x8_LOAD_TOP \
859
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
860
                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
861
    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
862
    const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
863
                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2
864
865
#define PTR(x) \
866
    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
867
#define PREDICT_8x8_LOAD_TOPRIGHT \
868
    int t8, t9, t10, t11, t12, t13, t14, t15; \
869
    if(has_topright) { \
870
        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
871
        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
872
    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);
873
874
#define PREDICT_8x8_LOAD_TOPLEFT \
875
    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2
876
877
#define PREDICT_8x8_DC(v) \
878
    int y; \
879
    for( y = 0; y < 8; y++ ) { \
880
        AV_WN4PA(((pixel4*)src)+0, v); \
881
        AV_WN4PA(((pixel4*)src)+1, v); \
882
        src += stride; \
883
    }
884
885
337
static void FUNCC(pred8x8l_128_dc)(uint8_t *_src, int has_topleft,
886
                                   int has_topright, ptrdiff_t _stride)
887
{
888
337
    pixel *src = (pixel*)_src;
889
337
    int stride = _stride>>(sizeof(pixel)-1);
890
891
3033
    PREDICT_8x8_DC(PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1)));
892
337
}
893
55354
static void FUNCC(pred8x8l_left_dc)(uint8_t *_src, int has_topleft,
894
                                    int has_topright, ptrdiff_t _stride)
895
{
896
55354
    pixel *src = (pixel*)_src;
897
55354
    int stride = _stride>>(sizeof(pixel)-1);
898
899
55354
    PREDICT_8x8_LOAD_LEFT;
900
55354
    const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3);
901
498186
    PREDICT_8x8_DC(dc);
902
55354
}
903
3092
static void FUNCC(pred8x8l_top_dc)(uint8_t *_src, int has_topleft,
904
                                   int has_topright, ptrdiff_t _stride)
905
{
906
3092
    pixel *src = (pixel*)_src;
907
3092
    int stride = _stride>>(sizeof(pixel)-1);
908
909

3092
    PREDICT_8x8_LOAD_TOP;
910
3092
    const pixel4 dc = PIXEL_SPLAT_X4((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3);
911
27828
    PREDICT_8x8_DC(dc);
912
3092
}
913
948772
static void FUNCC(pred8x8l_dc)(uint8_t *_src, int has_topleft,
914
                               int has_topright, ptrdiff_t _stride)
915
{
916
948772
    pixel *src = (pixel*)_src;
917
948772
    int stride = _stride>>(sizeof(pixel)-1);
918
919
948772
    PREDICT_8x8_LOAD_LEFT;
920

948772
    PREDICT_8x8_LOAD_TOP;
921
948772
    const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7
922
                                     +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4);
923
8538948
    PREDICT_8x8_DC(dc);
924
948772
}
925
468888
static void FUNCC(pred8x8l_horizontal)(uint8_t *_src, int has_topleft,
926
                                       int has_topright, ptrdiff_t _stride)
927
{
928
468888
    pixel *src = (pixel*)_src;
929
468888
    int stride = _stride>>(sizeof(pixel)-1);
930
    pixel4 a;
931
932
468888
    PREDICT_8x8_LOAD_LEFT;
933
#define ROW(y) a = PIXEL_SPLAT_X4(l##y); \
934
               AV_WN4PA(src+y*stride, a); \
935
               AV_WN4PA(src+y*stride+4, a);
936
468888
    ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
937
#undef ROW
938
468888
}
939
189569
static void FUNCC(pred8x8l_vertical)(uint8_t *_src, int has_topleft,
940
                                     int has_topright, ptrdiff_t _stride)
941
{
942
    int y;
943
189569
    pixel *src = (pixel*)_src;
944
189569
    int stride = _stride>>(sizeof(pixel)-1);
945
    pixel4 a, b;
946
947

189569
    PREDICT_8x8_LOAD_TOP;
948
189569
    src[0] = t0;
949
189569
    src[1] = t1;
950
189569
    src[2] = t2;
951
189569
    src[3] = t3;
952
189569
    src[4] = t4;
953
189569
    src[5] = t5;
954
189569
    src[6] = t6;
955
189569
    src[7] = t7;
956
189569
    a = AV_RN4PA(((pixel4*)src)+0);
957
189569
    b = AV_RN4PA(((pixel4*)src)+1);
958
1516552
    for( y = 1; y < 8; y++ ) {
959
1326983
        AV_WN4PA(((pixel4*)(src+y*stride))+0, a);
960
1326983
        AV_WN4PA(((pixel4*)(src+y*stride))+1, b);
961
    }
962
189569
}
963
53127
static void FUNCC(pred8x8l_down_left)(uint8_t *_src, int has_topleft,
964
                                      int has_topright, ptrdiff_t _stride)
965
{
966
53127
    pixel *src = (pixel*)_src;
967
53127
    int stride = _stride>>(sizeof(pixel)-1);
968

53127
    PREDICT_8x8_LOAD_TOP;
969
53127
    PREDICT_8x8_LOAD_TOPRIGHT;
970
53127
    SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
971
53127
    SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
972
53127
    SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
973
53127
    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
974
53127
    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
975
53127
    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
976
53127
    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
977
53127
    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
978
53127
    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
979
53127
    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
980
53127
    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
981
53127
    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
982
53127
    SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
983
53127
    SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
984
53127
    SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
985
53127
}
986
85928
static void FUNCC(pred8x8l_down_right)(uint8_t *_src, int has_topleft,
987
                                       int has_topright, ptrdiff_t _stride)
988
{
989
85928
    pixel *src = (pixel*)_src;
990
85928
    int stride = _stride>>(sizeof(pixel)-1);
991

85928
    PREDICT_8x8_LOAD_TOP;
992
85928
    PREDICT_8x8_LOAD_LEFT;
993
85928
    PREDICT_8x8_LOAD_TOPLEFT;
994
85928
    SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
995
85928
    SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
996
85928
    SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
997
85928
    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
998
85928
    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
999
85928
    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
1000
85928
    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
1001
85928
    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
1002
85928
    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
1003
85928
    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
1004
85928
    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
1005
85928
    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
1006
85928
    SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
1007
85928
    SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
1008
85928
    SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
1009
85928
}
1010
58593
static void FUNCC(pred8x8l_vertical_right)(uint8_t *_src, int has_topleft,
1011
                                           int has_topright, ptrdiff_t _stride)
1012
{
1013
58593
    pixel *src = (pixel*)_src;
1014
58593
    int stride = _stride>>(sizeof(pixel)-1);
1015

58593
    PREDICT_8x8_LOAD_TOP;
1016
58593
    PREDICT_8x8_LOAD_LEFT;
1017
58593
    PREDICT_8x8_LOAD_TOPLEFT;
1018
58593
    SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
1019
58593
    SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
1020
58593
    SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
1021
58593
    SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
1022
58593
    SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
1023
58593
    SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
1024
58593
    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
1025
58593
    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
1026
58593
    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
1027
58593
    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
1028
58593
    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
1029
58593
    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
1030
58593
    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
1031
58593
    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
1032
58593
    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
1033
58593
    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
1034
58593
    SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
1035
58593
    SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
1036
58593
    SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
1037
58593
    SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
1038
58593
    SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
1039
58593
    SRC(7,0)= (t6 + t7 + 1) >> 1;
1040
58593
}
1041
122283
static void FUNCC(pred8x8l_horizontal_down)(uint8_t *_src, int has_topleft,
1042
                                            int has_topright, ptrdiff_t _stride)
1043
{
1044
122283
    pixel *src = (pixel*)_src;
1045
122283
    int stride = _stride>>(sizeof(pixel)-1);
1046

122283
    PREDICT_8x8_LOAD_TOP;
1047
122283
    PREDICT_8x8_LOAD_LEFT;
1048
122283
    PREDICT_8x8_LOAD_TOPLEFT;
1049
122283
    SRC(0,7)= (l6 + l7 + 1) >> 1;
1050
122283
    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
1051
122283
    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
1052
122283
    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
1053
122283
    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
1054
122283
    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
1055
122283
    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
1056
122283
    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
1057
122283
    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
1058
122283
    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
1059
122283
    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
1060
122283
    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
1061
122283
    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
1062
122283
    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
1063
122283
    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
1064
122283
    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
1065
122283
    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
1066
122283
    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
1067
122283
    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
1068
122283
    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
1069
122283
    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
1070
122283
    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
1071
122283
}
1072
54418
static void FUNCC(pred8x8l_vertical_left)(uint8_t *_src, int has_topleft,
1073
                                          int has_topright, ptrdiff_t _stride)
1074
{
1075
54418
    pixel *src = (pixel*)_src;
1076
54418
    int stride = _stride>>(sizeof(pixel)-1);
1077

54418
    PREDICT_8x8_LOAD_TOP;
1078
54418
    PREDICT_8x8_LOAD_TOPRIGHT;
1079
54418
    SRC(0,0)= (t0 + t1 + 1) >> 1;
1080
54418
    SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
1081
54418
    SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
1082
54418
    SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
1083
54418
    SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
1084
54418
    SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
1085
54418
    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
1086
54418
    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
1087
54418
    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
1088
54418
    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
1089
54418
    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
1090
54418
    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
1091
54418
    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
1092
54418
    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
1093
54418
    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
1094
54418
    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
1095
54418
    SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
1096
54418
    SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
1097
54418
    SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
1098
54418
    SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
1099
54418
    SRC(7,6)= (t10 + t11 + 1) >> 1;
1100
54418
    SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
1101
54418
}
1102
177633
static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft,
1103
                                          int has_topright, ptrdiff_t _stride)
1104
{
1105
177633
    pixel *src = (pixel*)_src;
1106
177633
    int stride = _stride>>(sizeof(pixel)-1);
1107
177633
    PREDICT_8x8_LOAD_LEFT;
1108
177633
    SRC(0,0)= (l0 + l1 + 1) >> 1;
1109
177633
    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
1110
177633
    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
1111
177633
    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
1112
177633
    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
1113
177633
    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
1114
177633
    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
1115
177633
    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
1116
177633
    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
1117
177633
    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
1118
177633
    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
1119
177633
    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
1120
177633
    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
1121
177633
    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
1122
177633
    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
1123
177633
    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
1124
177633
    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
1125
177633
    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
1126
177633
}
1127
1128
static void FUNCC(pred8x8l_vertical_filter_add)(uint8_t *_src, int16_t *_block, int has_topleft,
1129
                                     int has_topright, ptrdiff_t _stride)
1130
{
1131
    int i;
1132
    pixel *src = (pixel*)_src;
1133
    const dctcoef *block = (const dctcoef*)_block;
1134
    pixel pix[8];
1135
    int stride = _stride>>(sizeof(pixel)-1);
1136
    PREDICT_8x8_LOAD_TOP;
1137
1138
    pix[0] = t0;
1139
    pix[1] = t1;
1140
    pix[2] = t2;
1141
    pix[3] = t3;
1142
    pix[4] = t4;
1143
    pix[5] = t5;
1144
    pix[6] = t6;
1145
    pix[7] = t7;
1146
1147
    for(i=0; i<8; i++){
1148
        pixel v = pix[i];
1149
        src[0*stride]= v += block[0];
1150
        src[1*stride]= v += block[8];
1151
        src[2*stride]= v += block[16];
1152
        src[3*stride]= v += block[24];
1153
        src[4*stride]= v += block[32];
1154
        src[5*stride]= v += block[40];
1155
        src[6*stride]= v += block[48];
1156
        src[7*stride]= v +  block[56];
1157
        src++;
1158
        block++;
1159
    }
1160
1161
    memset(_block, 0, sizeof(dctcoef) * 64);
1162
}
1163
1164
static void FUNCC(pred8x8l_horizontal_filter_add)(uint8_t *_src, int16_t *_block, int has_topleft,
1165
                               int has_topright, ptrdiff_t _stride)
1166
{
1167
    int i;
1168
    pixel *src = (pixel*)_src;
1169
    const dctcoef *block = (const dctcoef*)_block;
1170
    pixel pix[8];
1171
    int stride = _stride>>(sizeof(pixel)-1);
1172
    PREDICT_8x8_LOAD_LEFT;
1173
1174
    pix[0] = l0;
1175
    pix[1] = l1;
1176
    pix[2] = l2;
1177
    pix[3] = l3;
1178
    pix[4] = l4;
1179
    pix[5] = l5;
1180
    pix[6] = l6;
1181
    pix[7] = l7;
1182
1183
    for(i=0; i<8; i++){
1184
        pixel v = pix[i];
1185
        src[0]= v += block[0];
1186
        src[1]= v += block[1];
1187
        src[2]= v += block[2];
1188
        src[3]= v += block[3];
1189
        src[4]= v += block[4];
1190
        src[5]= v += block[5];
1191
        src[6]= v += block[6];
1192
        src[7]= v +  block[7];
1193
        src+= stride;
1194
        block+= 8;
1195
    }
1196
1197
    memset(_block, 0, sizeof(dctcoef) * 64);
1198
}
1199
1200
/* The 8x8 neighbour-loading helpers are only meant for the pred8x8l_*
 * functions above; remove them so they cannot leak into later code. */
#undef PREDICT_8x8_LOAD_LEFT
1201
#undef PREDICT_8x8_LOAD_TOP
1202
#undef PREDICT_8x8_LOAD_TOPLEFT
1203
#undef PREDICT_8x8_LOAD_TOPRIGHT
1204
#undef PREDICT_8x8_DC
1205
#undef PTR
1206
#undef PT
1207
#undef PL
1208
#undef SRC
1209
1210
80320
static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, int16_t *_block,
1211
                                        ptrdiff_t stride)
1212
{
1213
    int i;
1214
80320
    pixel *pix = (pixel*)_pix;
1215
80320
    const dctcoef *block = (const dctcoef*)_block;
1216
80320
    stride >>= sizeof(pixel)-1;
1217
80320
    pix -= stride;
1218
401600
    for(i=0; i<4; i++){
1219
321280
        pixel v = pix[0];
1220
321280
        pix[1*stride]= v += block[0];
1221
321280
        pix[2*stride]= v += block[4];
1222
321280
        pix[3*stride]= v += block[8];
1223
321280
        pix[4*stride]= v +  block[12];
1224
321280
        pix++;
1225
321280
        block++;
1226
    }
1227
1228
80320
    memset(_block, 0, sizeof(dctcoef) * 16);
1229
80320
}
1230
1231
104227
static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, int16_t *_block,
1232
                                          ptrdiff_t stride)
1233
{
1234
    int i;
1235
104227
    pixel *pix = (pixel*)_pix;
1236
104227
    const dctcoef *block = (const dctcoef*)_block;
1237
104227
    stride >>= sizeof(pixel)-1;
1238
521135
    for(i=0; i<4; i++){
1239
416908
        pixel v = pix[-1];
1240
416908
        pix[0]= v += block[0];
1241
416908
        pix[1]= v += block[1];
1242
416908
        pix[2]= v += block[2];
1243
416908
        pix[3]= v +  block[3];
1244
416908
        pix+= stride;
1245
416908
        block+= 4;
1246
    }
1247
1248
104227
    memset(_block, 0, sizeof(dctcoef) * 16);
1249
104227
}
1250
1251
1074
static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, int16_t *_block,
1252
                                         ptrdiff_t stride)
1253
{
1254
    int i;
1255
1074
    pixel *pix = (pixel*)_pix;
1256
1074
    const dctcoef *block = (const dctcoef*)_block;
1257
1074
    stride >>= sizeof(pixel)-1;
1258
1074
    pix -= stride;
1259
9666
    for(i=0; i<8; i++){
1260
8592
        pixel v = pix[0];
1261
8592
        pix[1*stride]= v += block[0];
1262
8592
        pix[2*stride]= v += block[8];
1263
8592
        pix[3*stride]= v += block[16];
1264
8592
        pix[4*stride]= v += block[24];
1265
8592
        pix[5*stride]= v += block[32];
1266
8592
        pix[6*stride]= v += block[40];
1267
8592
        pix[7*stride]= v += block[48];
1268
8592
        pix[8*stride]= v +  block[56];
1269
8592
        pix++;
1270
8592
        block++;
1271
    }
1272
1273
1074
    memset(_block, 0, sizeof(dctcoef) * 64);
1274
1074
}
1275
1276
1414
static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, int16_t *_block,
1277
                                           ptrdiff_t stride)
1278
{
1279
    int i;
1280
1414
    pixel *pix = (pixel*)_pix;
1281
1414
    const dctcoef *block = (const dctcoef*)_block;
1282
1414
    stride >>= sizeof(pixel)-1;
1283
12726
    for(i=0; i<8; i++){
1284
11312
        pixel v = pix[-1];
1285
11312
        pix[0]= v += block[0];
1286
11312
        pix[1]= v += block[1];
1287
11312
        pix[2]= v += block[2];
1288
11312
        pix[3]= v += block[3];
1289
11312
        pix[4]= v += block[4];
1290
11312
        pix[5]= v += block[5];
1291
11312
        pix[6]= v += block[6];
1292
11312
        pix[7]= v +  block[7];
1293
11312
        pix+= stride;
1294
11312
        block+= 8;
1295
    }
1296
1297
1414
    memset(_block, 0, sizeof(dctcoef) * 64);
1298
1414
}
1299
1300
353
static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset,
1301
                                          int16_t *block,
1302
                                          ptrdiff_t stride)
1303
{
1304
    int i;
1305
6001
    for(i=0; i<16; i++)
1306
5648
        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1307
353
}
1308
1309
272
static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix,
1310
                                            const int *block_offset,
1311
                                            int16_t *block,
1312
                                            ptrdiff_t stride)
1313
{
1314
    int i;
1315
4624
    for(i=0; i<16; i++)
1316
4352
        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1317
272
}
1318
1319
2258
static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset,
1320
                                        int16_t *block, ptrdiff_t stride)
1321
{
1322
    int i;
1323
11290
    for(i=0; i<4; i++)
1324
9032
        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1325
2258
}
1326
1327
static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset,
1328
                                         int16_t *block, ptrdiff_t stride)
1329
{
1330
    int i;
1331
    for(i=0; i<4; i++)
1332
        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1333
    for(i=4; i<8; i++)
1334
        FUNCC(pred4x4_vertical_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);
1335
}
1336
1337
2888
static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset,
1338
                                          int16_t *block,
1339
                                          ptrdiff_t stride)
1340
{
1341
    int i;
1342
14440
    for(i=0; i<4; i++)
1343
11552
        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1344
2888
}
1345
1346
static void FUNCC(pred8x16_horizontal_add)(uint8_t *pix,
1347
                                           const int *block_offset,
1348
                                           int16_t *block, ptrdiff_t stride)
1349
{
1350
    int i;
1351
    for(i=0; i<4; i++)
1352
        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1353
    for(i=4; i<8; i++)
1354
        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);
1355
}