GCC Code Coverage Report

Directory: ../../../ffmpeg/
File:      src/libavcodec/x86/me_cmp_init.c
Date:      2021-04-15 16:04:23

                 Exec    Total    Coverage
Lines:             71      105      67.6 %
Branches:          19       26      73.1 %

Line    Exec   Source
   1         : /*
   2         :  * SIMD-optimized motion estimation
   3         :  * Copyright (c) 2000, 2001 Fabrice Bellard
   4         :  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5         :  *
   6         :  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
   7         :  *
   8         :  * This file is part of FFmpeg.
   9         :  *
  10         :  * FFmpeg is free software; you can redistribute it and/or
  11         :  * modify it under the terms of the GNU Lesser General Public
  12         :  * License as published by the Free Software Foundation; either
  13         :  * version 2.1 of the License, or (at your option) any later version.
  14         :  *
  15         :  * FFmpeg is distributed in the hope that it will be useful,
  16         :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17         :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18         :  * Lesser General Public License for more details.
  19         :  *
  20         :  * You should have received a copy of the GNU Lesser General Public
  21         :  * License along with FFmpeg; if not, write to the Free Software
  22         :  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23         :  */
  24         :
  25         : #include "libavutil/attributes.h"
  26         : #include "libavutil/cpu.h"
  27         : #include "libavutil/mem_internal.h"
  28         : #include "libavutil/x86/asm.h"
  29         : #include "libavutil/x86/cpu.h"
  30         : #include "libavcodec/me_cmp.h"
  31         : #include "libavcodec/mpegvideo.h"
  32         :
  33         : int ff_sum_abs_dctelem_mmx(int16_t *block);
  34         : int ff_sum_abs_dctelem_mmxext(int16_t *block);
  35         : int ff_sum_abs_dctelem_sse2(int16_t *block);
  36         : int ff_sum_abs_dctelem_ssse3(int16_t *block);
  37         : int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  38         :                 ptrdiff_t stride, int h);
  39         : int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  40         :                  ptrdiff_t stride, int h);
  41         : int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  42         :                   ptrdiff_t stride, int h);
  43         : int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
  44         : int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
  45         : int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  46         :                    ptrdiff_t stride, int h);
  47         : int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  48         :                     ptrdiff_t stride, int h);
  49         : int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  50         :                   ptrdiff_t stride, int h);
  51         : int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  52         :                       ptrdiff_t stride, int h);
  53         : int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  54         :                        ptrdiff_t stride, int h);
  55         : int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  56         :                      ptrdiff_t stride, int h);
  57         : int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  58         :                       ptrdiff_t stride, int h);
  59         : int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  60         :                        ptrdiff_t stride, int h);
  61         : int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  62         :                      ptrdiff_t stride, int h);
  63         : int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  64         :                               ptrdiff_t stride, int h);
  65         : int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  66         :                                ptrdiff_t stride, int h);
  67         : int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  68         :                              ptrdiff_t stride, int h);
  69         : int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  70         :                           ptrdiff_t stride, int h);
  71         : int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  72         :                            ptrdiff_t stride, int h);
  73         : int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  74         :                          ptrdiff_t stride, int h);
  75         : int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  76         :                     ptrdiff_t stride, int h);
  77         : int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  78         :                      ptrdiff_t stride, int h);
  79         : int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  80         :                    ptrdiff_t stride, int h);
  81         :
  82         : #define hadamard_func(cpu)                                                    \
  83         :     int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,           \
  84         :                                   uint8_t *src2, ptrdiff_t stride, int h);    \
  85         :     int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,         \
  86         :                                     uint8_t *src2, ptrdiff_t stride, int h);
  87         :
  88         : hadamard_func(mmx)
  89         : hadamard_func(mmxext)
  90         : hadamard_func(sse2)
  91         : hadamard_func(ssse3)
  92         :
  93         : #if HAVE_X86ASM
  94         : static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
  95         :                       ptrdiff_t stride, int h)
  96         : {
  97         :     int score1, score2;
  98         :
  99         :     if (c)
 100         :         score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
 101         :     else
 102         :         score1 = ff_sse16_mmx(c, pix1, pix2, stride, h);
 103         :     score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1+8, stride, h)
 104         :            - ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2+8, stride, h);
 105         :
 106         :     if (c)
 107         :         return score1 + FFABS(score2) * c->avctx->nsse_weight;
 108         :     else
 109         :         return score1 + FFABS(score2) * 8;
 110         : }
 111         :
 112         : static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
 113         :                      ptrdiff_t stride, int h)
 114         : {
 115         :     int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h);
 116         :     int score2 = ff_hf_noise8_mmx(pix1, stride, h) -
 117         :                  ff_hf_noise8_mmx(pix2, stride, h);
 118         :
 119         :     if (c)
 120         :         return score1 + FFABS(score2) * c->avctx->nsse_weight;
 121         :     else
 122         :         return score1 + FFABS(score2) * 8;
 123         : }
 124         :
 125         : #endif /* HAVE_X86ASM */
 126         :
 127         : #if HAVE_INLINE_ASM
 128         :
 129         : static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
 130         :                             ptrdiff_t stride, int h)
 131         : {
 132         :     int tmp;
 133         :
 134         :     av_assert2((((int) pix) & 7) == 0);
 135         :     av_assert2((stride & 7) == 0);
 136         :
 137         : #define SUM(in0, in1, out0, out1)               \
 138         :     "movq (%0), %%mm2\n"                        \
 139         :     "movq 8(%0), %%mm3\n"                       \
 140         :     "add %2,%0\n"                               \
 141         :     "movq %%mm2, " #out0 "\n"                   \
 142         :     "movq %%mm3, " #out1 "\n"                   \
 143         :     "psubusb " #in0 ", %%mm2\n"                 \
 144         :     "psubusb " #in1 ", %%mm3\n"                 \
 145         :     "psubusb " #out0 ", " #in0 "\n"             \
 146         :     "psubusb " #out1 ", " #in1 "\n"             \
 147         :     "por %%mm2, " #in0 "\n"                     \
 148         :     "por %%mm3, " #in1 "\n"                     \
 149         :     "movq " #in0 ", %%mm2\n"                    \
 150         :     "movq " #in1 ", %%mm3\n"                    \
 151         :     "punpcklbw %%mm7, " #in0 "\n"               \
 152         :     "punpcklbw %%mm7, " #in1 "\n"               \
 153         :     "punpckhbw %%mm7, %%mm2\n"                  \
 154         :     "punpckhbw %%mm7, %%mm3\n"                  \
 155         :     "paddw " #in1 ", " #in0 "\n"                \
 156         :     "paddw %%mm3, %%mm2\n"                      \
 157         :     "paddw %%mm2, " #in0 "\n"                   \
 158         :     "paddw " #in0 ", %%mm6\n"
 159         :
 160         :
 161         :     __asm__ volatile (
 162         :         "movl    %3, %%ecx\n"
 163         :         "pxor %%mm6, %%mm6\n"
 164         :         "pxor %%mm7, %%mm7\n"
 165         :         "movq  (%0), %%mm0\n"
 166         :         "movq 8(%0), %%mm1\n"
 167         :         "add %2, %0\n"
 168         :         "jmp 2f\n"
 169         :         "1:\n"
 170         :
 171         :         SUM(%%mm4, %%mm5, %%mm0, %%mm1)
 172         :         "2:\n"
 173         :         SUM(%%mm0, %%mm1, %%mm4, %%mm5)
 174         :
 175         :         "subl $2, %%ecx\n"
 176         :         "jnz 1b\n"
 177         :
 178         :         "movq  %%mm6, %%mm0\n"
 179         :         "psrlq $32,   %%mm6\n"
 180         :         "paddw %%mm6, %%mm0\n"
 181         :         "movq  %%mm0, %%mm6\n"
 182         :         "psrlq $16,   %%mm0\n"
 183         :         "paddw %%mm6, %%mm0\n"
 184         :         "movd  %%mm0, %1\n"
 185         :         : "+r" (pix), "=r" (tmp)
 186         :         : "r" (stride), "m" (h)
 187         :         : "%ecx");
 188         :
 189         :     return tmp & 0xFFFF;
 190         : }
 191         : #undef SUM
 192         :
 193         : static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 194         :                       ptrdiff_t stride, int h)
 195         : {
 196         :     int tmp;
 197         :
 198         :     av_assert2((((int) pix1) & 7) == 0);
 199         :     av_assert2((((int) pix2) & 7) == 0);
 200         :     av_assert2((stride & 7) == 0);
 201         :
 202         : #define SUM(in0, in1, out0, out1)       \
 203         :     "movq (%0), %%mm2\n"                \
 204         :     "movq (%1), " #out0 "\n"            \
 205         :     "movq 8(%0), %%mm3\n"               \
 206         :     "movq 8(%1), " #out1 "\n"           \
 207         :     "add %3, %0\n"                      \
 208         :     "add %3, %1\n"                      \
 209         :     "psubb " #out0 ", %%mm2\n"          \
 210         :     "psubb " #out1 ", %%mm3\n"          \
 211         :     "pxor %%mm7, %%mm2\n"               \
 212         :     "pxor %%mm7, %%mm3\n"               \
 213         :     "movq %%mm2, " #out0 "\n"           \
 214         :     "movq %%mm3, " #out1 "\n"           \
 215         :     "psubusb " #in0 ", %%mm2\n"         \
 216         :     "psubusb " #in1 ", %%mm3\n"         \
 217         :     "psubusb " #out0 ", " #in0 "\n"     \
 218         :     "psubusb " #out1 ", " #in1 "\n"     \
 219         :     "por %%mm2, " #in0 "\n"             \
 220         :     "por %%mm3, " #in1 "\n"             \
 221         :     "movq " #in0 ", %%mm2\n"            \
 222         :     "movq " #in1 ", %%mm3\n"            \
 223         :     "punpcklbw %%mm7, " #in0 "\n"       \
 224         :     "punpcklbw %%mm7, " #in1 "\n"       \
 225         :     "punpckhbw %%mm7, %%mm2\n"          \
 226         :     "punpckhbw %%mm7, %%mm3\n"          \
 227         :     "paddw " #in1 ", " #in0 "\n"        \
 228         :     "paddw %%mm3, %%mm2\n"              \
 229         :     "paddw %%mm2, " #in0 "\n"           \
 230         :     "paddw " #in0 ", %%mm6\n"
 231         :
 232         :
 233         :     __asm__ volatile (
 234         :         "movl %4, %%ecx\n"
 235         :         "pxor %%mm6, %%mm6\n"
 236         :         "pcmpeqw %%mm7, %%mm7\n"
 237         :         "psllw $15, %%mm7\n"
 238         :         "packsswb %%mm7, %%mm7\n"
 239         :         "movq (%0), %%mm0\n"
 240         :         "movq (%1), %%mm2\n"
 241         :         "movq 8(%0), %%mm1\n"
 242         :         "movq 8(%1), %%mm3\n"
 243         :         "add %3, %0\n"
 244         :         "add %3, %1\n"
 245         :         "psubb %%mm2, %%mm0\n"
 246         :         "psubb %%mm3, %%mm1\n"
 247         :         "pxor %%mm7, %%mm0\n"
 248         :         "pxor %%mm7, %%mm1\n"
 249         :         "jmp 2f\n"
 250         :         "1:\n"
 251         :
 252         :         SUM(%%mm4, %%mm5, %%mm0, %%mm1)
 253         :         "2:\n"
 254         :         SUM(%%mm0, %%mm1, %%mm4, %%mm5)
 255         :
 256         :         "subl $2, %%ecx\n"
 257         :         "jnz 1b\n"
 258         :
 259         :         "movq %%mm6, %%mm0\n"
 260         :         "psrlq $32, %%mm6\n"
 261         :         "paddw %%mm6, %%mm0\n"
 262         :         "movq %%mm0, %%mm6\n"
 263         :         "psrlq $16, %%mm0\n"
 264         :         "paddw %%mm6, %%mm0\n"
 265         :         "movd %%mm0, %2\n"
 266         :         : "+r" (pix1), "+r" (pix2), "=r" (tmp)
 267         :         : "r" (stride), "m" (h)
 268         :         : "%ecx");
 269         :
 270         :     return tmp & 0x7FFF;
 271         : }
 272         : #undef SUM
 273         :
 274         : DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
 275         :     0x0000000000000000ULL,
 276         :     0x0001000100010001ULL,
 277         :     0x0002000200020002ULL,
 278         : };
 279         :
 280         : static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
 281         :                               ptrdiff_t stride, int h)
 282         : {
 283         :     x86_reg len = -stride * h;
 284         :     __asm__ volatile (
 285         :         ".p2align 4                     \n\t"
 286         :         "1:                             \n\t"
 287         :         "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
 288         :         "movq (%2, %%"FF_REG_a"), %%mm2 \n\t"
 289         :         "movq (%2, %%"FF_REG_a"), %%mm4 \n\t"
 290         :         "add %3, %%"FF_REG_a"           \n\t"
 291         :         "psubusb %%mm0, %%mm2           \n\t"
 292         :         "psubusb %%mm4, %%mm0           \n\t"
 293         :         "movq (%1, %%"FF_REG_a"), %%mm1 \n\t"
 294         :         "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
 295         :         "movq (%2, %%"FF_REG_a"), %%mm5 \n\t"
 296         :         "psubusb %%mm1, %%mm3           \n\t"
 297         :         "psubusb %%mm5, %%mm1           \n\t"
 298         :         "por %%mm2, %%mm0               \n\t"
 299         :         "por %%mm1, %%mm3               \n\t"
 300         :         "movq %%mm0, %%mm1              \n\t"
 301         :         "movq %%mm3, %%mm2              \n\t"
 302         :         "punpcklbw %%mm7, %%mm0         \n\t"
 303         :         "punpckhbw %%mm7, %%mm1         \n\t"
 304         :         "punpcklbw %%mm7, %%mm3         \n\t"
 305         :         "punpckhbw %%mm7, %%mm2         \n\t"
 306         :         "paddw %%mm1, %%mm0             \n\t"
 307         :         "paddw %%mm3, %%mm2             \n\t"
 308         :         "paddw %%mm2, %%mm0             \n\t"
 309         :         "paddw %%mm0, %%mm6             \n\t"
 310         :         "add %3, %%"FF_REG_a"           \n\t"
 311         :         " js 1b                         \n\t"
 312         :         : "+a" (len)
 313         :         : "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
 314         : }
 315         :
 316         : static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
 317         :                               ptrdiff_t stride, int h)
 318         : {
 319         :     x86_reg len = -stride * h;
 320         :     __asm__ volatile (
 321         :         ".p2align 4                     \n\t"
 322         :         "1:                             \n\t"
 323         :         "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
 324         :         "movq (%2, %%"FF_REG_a"), %%mm1 \n\t"
 325         :         "movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
 326         :         "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
 327         :         "punpcklbw %%mm7, %%mm0         \n\t"
 328         :         "punpcklbw %%mm7, %%mm1         \n\t"
 329         :         "punpckhbw %%mm7, %%mm2         \n\t"
 330         :         "punpckhbw %%mm7, %%mm3         \n\t"
 331         :         "paddw %%mm0, %%mm1             \n\t"
 332         :         "paddw %%mm2, %%mm3             \n\t"
 333         :         "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
 334         :         "movq (%3, %%"FF_REG_a"), %%mm2 \n\t"
 335         :         "paddw %%mm5, %%mm1             \n\t"
 336         :         "paddw %%mm5, %%mm3             \n\t"
 337         :         "psrlw $1, %%mm1                \n\t"
 338         :         "psrlw $1, %%mm3                \n\t"
 339         :         "packuswb %%mm3, %%mm1          \n\t"
 340         :         "psubusb %%mm1, %%mm4           \n\t"
 341         :         "psubusb %%mm2, %%mm1           \n\t"
 342         :         "por %%mm4, %%mm1               \n\t"
 343         :         "movq %%mm1, %%mm0              \n\t"
 344         :         "punpcklbw %%mm7, %%mm0         \n\t"
 345         :         "punpckhbw %%mm7, %%mm1         \n\t"
 346         :         "paddw %%mm1, %%mm0             \n\t"
 347         :         "paddw %%mm0, %%mm6             \n\t"
 348         :         "add %4, %%"FF_REG_a"           \n\t"
 349         :         " js 1b                         \n\t"
 350         :         : "+a" (len)
 351         :         : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
 352         :           "r" (stride));
 353         : }
 354         :
 355   59940 : static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
 356         :                               ptrdiff_t stride, int h)
 357         : {
 358   59940 :     x86_reg len = -stride * h;
 359   59940 :     __asm__ volatile (
 360         :         "movq  (%1, %%"FF_REG_a"), %%mm0\n\t"
 361         :         "movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
 362         :         "movq %%mm0, %%mm1              \n\t"
 363         :         "movq %%mm2, %%mm3              \n\t"
 364         :         "punpcklbw %%mm7, %%mm0         \n\t"
 365         :         "punpckhbw %%mm7, %%mm1         \n\t"
 366         :         "punpcklbw %%mm7, %%mm2         \n\t"
 367         :         "punpckhbw %%mm7, %%mm3         \n\t"
 368         :         "paddw %%mm2, %%mm0             \n\t"
 369         :         "paddw %%mm3, %%mm1             \n\t"
 370         :         ".p2align 4                     \n\t"
 371         :         "1:                             \n\t"
 372         :         "movq  (%2, %%"FF_REG_a"), %%mm2\n\t"
 373         :         "movq 1(%2, %%"FF_REG_a"), %%mm4\n\t"
 374         :         "movq %%mm2, %%mm3              \n\t"
 375         :         "movq %%mm4, %%mm5              \n\t"
 376         :         "punpcklbw %%mm7, %%mm2         \n\t"
 377         :         "punpckhbw %%mm7, %%mm3         \n\t"
 378         :         "punpcklbw %%mm7, %%mm4         \n\t"
 379         :         "punpckhbw %%mm7, %%mm5         \n\t"
 380         :         "paddw %%mm4, %%mm2             \n\t"
 381         :         "paddw %%mm5, %%mm3             \n\t"
 382         :         "movq %5, %%mm5                 \n\t"
 383         :         "paddw %%mm2, %%mm0             \n\t"
 384         :         "paddw %%mm3, %%mm1             \n\t"
 385         :         "paddw %%mm5, %%mm0             \n\t"
 386         :         "paddw %%mm5, %%mm1             \n\t"
 387         :         "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
 388         :         "movq (%3, %%"FF_REG_a"), %%mm5 \n\t"
 389         :         "psrlw $2, %%mm0                \n\t"
 390         :         "psrlw $2, %%mm1                \n\t"
 391         :         "packuswb %%mm1, %%mm0          \n\t"
 392         :         "psubusb %%mm0, %%mm4           \n\t"
 393         :         "psubusb %%mm5, %%mm0           \n\t"
 394         :         "por %%mm4, %%mm0               \n\t"
 395         :         "movq %%mm0, %%mm4              \n\t"
 396         :         "punpcklbw %%mm7, %%mm0         \n\t"
 397         :         "punpckhbw %%mm7, %%mm4         \n\t"
 398         :         "paddw %%mm0, %%mm6             \n\t"
 399         :         "paddw %%mm4, %%mm6             \n\t"
 400         :         "movq  %%mm2, %%mm0             \n\t"
 401         :         "movq  %%mm3, %%mm1             \n\t"
 402         :         "add %4, %%"FF_REG_a"           \n\t"
 403         :         " js 1b                         \n\t"
 404         :         : "+a" (len)
 405   59940 :         : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
 406         :           "r" (stride), "m" (round_tab[2]));
 407   59940 : }
 408         :
 409   29970 : static inline int sum_mmx(void)
 410         : {
 411         :     int ret;
 412   29970 :     __asm__ volatile (
 413         :         "movq %%mm6, %%mm0              \n\t"
 414         :         "psrlq $32, %%mm6               \n\t"
 415         :         "paddw %%mm0, %%mm6             \n\t"
 416         :         "movq %%mm6, %%mm0              \n\t"
 417         :         "psrlq $16, %%mm6               \n\t"
 418         :         "paddw %%mm0, %%mm6             \n\t"
 419         :         "movd %%mm6, %0                 \n\t"
 420         :         : "=r" (ret));
 421   29970 :     return ret & 0xFFFF;
 422         : }
 423         :
 424         : static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
 425         :                                 ptrdiff_t stride, int h)
 426         : {
 427         :     sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
 428         : }
 429         :
 430         : static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
 431         :                                 ptrdiff_t stride, int h)
 432         : {
 433         :     sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
 434         : }
 435         :
 436         : #define PIX_SAD(suf)                                                    \
 437         : static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
 438         :                         uint8_t *blk1, ptrdiff_t stride, int h)         \
 439         : {                                                                       \
 440         :     av_assert2(h == 8);                                                     \
 441         :     __asm__ volatile (                                                  \
 442         :         "pxor %%mm7, %%mm7     \n\t"                                    \
 443         :         "pxor %%mm6, %%mm6     \n\t"                                    \
 444         :         :);                                                             \
 445         :                                                                         \
 446         :     sad8_1_ ## suf(blk1, blk2, stride, 8);                              \
 447         :                                                                         \
 448         :     return sum_ ## suf();                                               \
 449         : }                                                                       \
 450         :                                                                         \
 451         : static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
 452         :                            uint8_t *blk1, ptrdiff_t stride, int h)      \
 453         : {                                                                       \
 454         :     av_assert2(h == 8);                                                     \
 455         :     __asm__ volatile (                                                  \
 456         :         "pxor %%mm7, %%mm7     \n\t"                                    \
 457         :         "pxor %%mm6, %%mm6     \n\t"                                    \
 458         :         "movq %0, %%mm5        \n\t"                                    \
 459         :         :: "m" (round_tab[1]));                                         \
 460         :                                                                         \
 461         :     sad8_x2a_ ## suf(blk1, blk2, stride, 8);                            \
 462         :                                                                         \
 463         :     return sum_ ## suf();                                               \
 464         : }                                                                       \
 465         :                                                                         \
 466         : static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
 467         :                            uint8_t *blk1, ptrdiff_t stride, int h)      \
 468         : {                                                                       \
 469         :     av_assert2(h == 8);                                                     \
 470         :     __asm__ volatile (                                                  \
 471         :         "pxor %%mm7, %%mm7     \n\t"                                    \
 472         :         "pxor %%mm6, %%mm6     \n\t"                                    \
 473         :         "movq %0, %%mm5        \n\t"                                    \
 474         :         :: "m" (round_tab[1]));                                         \
 475         :                                                                         \
 476         :     sad8_y2a_ ## suf(blk1, blk2, stride, 8);                            \
 477         :                                                                         \
 478         :     return sum_ ## suf();                                               \
 479         : }                                                                       \
 480         :                                                                         \
 481         : static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
 482         :                             uint8_t *blk1, ptrdiff_t stride, int h)     \
 483         : {                                                                       \
 484         :     av_assert2(h == 8);                                                     \
 485         :     __asm__ volatile (                                                  \
 486         :         "pxor %%mm7, %%mm7     \n\t"                                    \
 487         :         "pxor %%mm6, %%mm6     \n\t"                                    \
 488         :         ::);                                                            \
 489         :                                                                         \
 490         :     sad8_4_ ## suf(blk1, blk2, stride, 8);                              \
 491         :                                                                         \
 492         :     return sum_ ## suf();                                               \
 493         : }                                                                       \
 494         :                                                                         \
 495         : static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2,              \
 496         :                          uint8_t *blk1, ptrdiff_t stride, int h)        \
 497         : {                                                                       \
 498         :     __asm__ volatile (                                                  \
 499         :         "pxor %%mm7, %%mm7     \n\t"                                    \
 500         :         "pxor %%mm6, %%mm6     \n\t"                                    \
 501         :         :);                                                             \
 502         :                                                                         \
 503         :     sad8_1_ ## suf(blk1,     blk2,     stride, h);                      \
 504         :     sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
 505         :                                                                         \
 506         :     return sum_ ## suf();                                               \
 507         : }                                                                       \
 508         :                                                                         \
 509         : static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
 510         :                             uint8_t *blk1, ptrdiff_t stride, int h)     \
 511         : {                                                                       \
 512         :     __asm__ volatile (                                                  \
 513         :         "pxor %%mm7, %%mm7     \n\t"                                    \
 514         :         "pxor %%mm6, %%mm6     \n\t"                                    \
 515         :         "movq %0, %%mm5        \n\t"                                    \
 516         :         :: "m" (round_tab[1]));                                         \
 517         :                                                                         \
 518         :     sad8_x2a_ ## suf(blk1,     blk2,     stride, h);                    \
 519         :     sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
 520         :                                                                         \
 521         :     return sum_ ## suf();                                               \
 522         : }                                                                       \
 523         :                                                                         \
 524         : static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
 525         :                             uint8_t *blk1, ptrdiff_t stride, int h)     \
 526         : {                                                                       \
 527         :     __asm__ volatile (                                                  \
 528         :         "pxor %%mm7, %%mm7     \n\t"                                    \
 529         :         "pxor %%mm6, %%mm6     \n\t"                                    \
 530         :         "movq %0, %%mm5        \n\t"                                    \
 531         :         :: "m" (round_tab[1]));                                         \
 532         :                                                                         \
 533         :     sad8_y2a_ ## suf(blk1,     blk2,     stride, h);                    \
 534         :     sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
 535         :                                                                         \
 536         :     return sum_ ## suf();                                               \
 537         : }                                                                       \
 538         :                                                                         \
 539         : static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
 540         :                              uint8_t *blk1, ptrdiff_t stride, int h)    \
 541         : {                                                                       \
 542         :     __asm__ volatile (                                                  \
 543         :         "pxor %%mm7, %%mm7     \n\t"                                    \
 544         :         "pxor %%mm6, %%mm6     \n\t"                                    \
 545         :         ::);                                                            \
 546         :                                                                         \
 547         :     sad8_4_ ## suf(blk1,     blk2,     stride, h);                      \
 548         :     sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
 549         :                                                                         \
 550         :     return sum_ ## suf();                                               \
 551         : }                                                                       \
 552         :
 553   59940 : PIX_SAD(mmx)
 554         :
 555         : #endif /* HAVE_INLINE_ASM */
 556         :
 557     993 : av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
 558         : {
 559     993 :     int cpu_flags = av_get_cpu_flags();
 560         :
 561         : #if HAVE_INLINE_ASM
 562     993 :     if (INLINE_MMX(cpu_flags)) {
 563      41 :         c->pix_abs[0][0] = sad16_mmx;
 564      41 :         c->pix_abs[0][1] = sad16_x2_mmx;
 565      41 :         c->pix_abs[0][2] = sad16_y2_mmx;
 566      41 :         c->pix_abs[0][3] = sad16_xy2_mmx;
 567      41 :         c->pix_abs[1][0] = sad8_mmx;
 568      41 :         c->pix_abs[1][1] = sad8_x2_mmx;
 569      41 :         c->pix_abs[1][2] = sad8_y2_mmx;
 570      41 :         c->pix_abs[1][3] = sad8_xy2_mmx;
 571         :
 572      41 :         c->sad[0] = sad16_mmx;
 573      41 :         c->sad[1] = sad8_mmx;
 574         :
 575      41 :         c->vsad[4] = vsad_intra16_mmx;
 576         :
 577      41 :         if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
 578      39 :             c->vsad[0] = vsad16_mmx;
 579         :         }
 580         :     }
 581         :
 582         : #endif /* HAVE_INLINE_ASM */
 583         :
 584     993 :     if (EXTERNAL_MMX(cpu_flags)) {
 585      41 :         c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
 586      41 :         c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
 587      41 :         c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmx;
 588      41 :         c->sse[0]            = ff_sse16_mmx;
 589      41 :         c->sse[1]            = ff_sse8_mmx;
 590         : #if HAVE_X86ASM
 591      41 :         c->nsse[0]           = nsse16_mmx;
 592      41 :         c->nsse[1]           = nsse8_mmx;
 593         : #endif
 594         :     }
 595         :
 596     993 :     if (EXTERNAL_MMXEXT(cpu_flags)) {
 597      41 :         c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
 598      41 :         c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
 599      41 :         c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmxext;
 600         :
 601      41 :         c->sad[0] = ff_sad16_mmxext;
 602      41 :         c->sad[1] = ff_sad8_mmxext;
 603         :
 604      41 :         c->pix_abs[0][0] = ff_sad16_mmxext;
 605      41 :         c->pix_abs[0][1] = ff_sad16_x2_mmxext;
 606      41 :         c->pix_abs[0][2] = ff_sad16_y2_mmxext;
 607      41 :         c->pix_abs[1][0] = ff_sad8_mmxext;
 608      41 :         c->pix_abs[1][1] = ff_sad8_x2_mmxext;
 609      41 :         c->pix_abs[1][2] = ff_sad8_y2_mmxext;
 610         :
 611      41 :         c->vsad[4] = ff_vsad_intra16_mmxext;
 612      41 :         c->vsad[5] = ff_vsad_intra8_mmxext;
 613         :
 614      41 :         if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
 615      39 :             c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
 616      39 :             c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;
 617         :
 618      39 :             c->vsad[0] = ff_vsad16_approx_mmxext;
 619      39 :             c->vsad[1] = ff_vsad8_approx_mmxext;
 620         :         }
 621         :     }
 622         :
 623     993 :     if (EXTERNAL_SSE2(cpu_flags)) {
 624      41 :         c->sse[0] = ff_sse16_sse2;
 625      41 :         c->sum_abs_dctelem   = ff_sum_abs_dctelem_sse2;
 626         :
 627         : #if HAVE_ALIGNED_STACK
 628      41 :         c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
 629      41 :         c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
 630         : #endif
 631      41 :         if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
 632      39 :             c->sad[0]        = ff_sad16_sse2;
 633      39 :             c->pix_abs[0][0] = ff_sad16_sse2;
 634      39 :             c->pix_abs[0][1] = ff_sad16_x2_sse2;
 635      39 :             c->pix_abs[0][2] = ff_sad16_y2_sse2;
 636         :
 637      39 :             c->vsad[4]       = ff_vsad_intra16_sse2;
 638      39 :             if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
 639      37 :                 c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
 640      37 :                 c->vsad[0]       = ff_vsad16_approx_sse2;
 641         :             }
 642         :         }
 643         :     }
 644         :
 645     993 :     if (EXTERNAL_SSSE3(cpu_flags)) {
 646      41 :         c->sum_abs_dctelem   = ff_sum_abs_dctelem_ssse3;
 647         : #if HAVE_ALIGNED_STACK
 648      41 :         c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
 649      41 :         c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
 650         : #endif
 651         :     }
 652     993 : }
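
For context, a minimal sketch of how the dispatch table populated by ff_me_cmp_init_x86() (line 557) is consumed, mirroring the indirect call visible at line 100 (c->mecc.sse[0](...)). The wrapper function and the 16x16 block size below are illustrative assumptions, not part of the file; in FFmpeg the generic ff_me_cmp_init() normally installs the C baseline pointers and then lets this x86-specific init override them with the MMX/MMXEXT/SSE2 versions listed above.

#include "libavcodec/me_cmp.h"
#include "libavcodec/mpegvideo.h"

/* Hypothetical helper: compare a 16x16 block of the current frame against a
 * reference block using whichever SAD routine was selected for this CPU. */
static int block_sad_16x16(MpegEncContext *s, AVCodecContext *avctx,
                           uint8_t *cur, uint8_t *ref, ptrdiff_t stride)
{
    MECmpContext mecc;

    ff_me_cmp_init(&mecc, avctx);   /* sets C fallbacks, then calls
                                     * ff_me_cmp_init_x86() on x86 builds */

    /* sad[0] is the 16x16 comparator; pix_abs[0][1..3] are the half-pel
     * x2/y2/xy2 variants the report shows being installed. */
    return mecc.sad[0](s, cur, ref, stride, 16);
}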