GCC Code Coverage Report

Directory: ../../../ffmpeg/
File:      src/libavcodec/x86/me_cmp_init.c
Date:      2020-10-23 17:01:47

              Exec   Total   Coverage
Lines:          71     105     67.6 %
Branches:       19      26     73.1 %

Source:
/*
 * SIMD-optimized motion estimation
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/me_cmp.h"
#include "libavcodec/mpegvideo.h"

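/* Prototypes for the comparison functions implemented in external x86
 * assembly (used under the EXTERNAL_* checks below); ff_me_cmp_init_x86()
 * selects them at runtime according to the detected CPU flags. */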
int ff_sum_abs_dctelem_mmx(int16_t *block);
int ff_sum_abs_dctelem_mmxext(int16_t *block);
int ff_sum_abs_dctelem_sse2(int16_t *block);
int ff_sum_abs_dctelem_ssse3(int16_t *block);
int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                ptrdiff_t stride, int h);
int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                 ptrdiff_t stride, int h);
int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  ptrdiff_t stride, int h);
int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   ptrdiff_t stride, int h);
int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                    ptrdiff_t stride, int h);
int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  ptrdiff_t stride, int h);
int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h);
int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h);
int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h);
int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h);
int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                              ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                               ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                             ptrdiff_t stride, int h);
int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          ptrdiff_t stride, int h);
int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                           ptrdiff_t stride, int h);
int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h);
int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                           ptrdiff_t stride, int h);
int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                            ptrdiff_t stride, int h);
int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          ptrdiff_t stride, int h);

#define hadamard_func(cpu)                                                    \
    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,           \
                                  uint8_t *src2, ptrdiff_t stride, int h);    \
    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,         \
                                    uint8_t *src2, ptrdiff_t stride, int h);

hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)

#if HAVE_X86ASM
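/* Noise-preserving SSE: the score is the plain SSE plus the absolute
 * difference in high-frequency noise between the two blocks (measured with
 * ff_hf_noise*), weighted by avctx->nsse_weight (8 when no context is
 * available). */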
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
{
    int score1, score2;

    if (c)
        score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
    else
        score1 = ff_sse16_mmx(c, pix1, pix2, stride, h);
    score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1+8, stride, h)
           - ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2+8, stride, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h)
{
    int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h);
    int score2 = ff_hf_noise8_mmx(pix1, stride, h) -
                 ff_hf_noise8_mmx(pix2, stride, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

#endif /* HAVE_X86ASM */

#if HAVE_INLINE_ASM

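/* Vertical SAD of a single 16-pixel-wide block: sums the absolute differences
 * between vertically adjacent rows.  MMX has no unsigned byte absolute
 * difference, so the SUM macro builds |a - b| from two saturating
 * subtractions (psubusb) OR-ed together, widens the bytes to words against
 * the zeroed mm7 and accumulates in mm6. */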
static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                            ptrdiff_t stride, int h)
{
    int tmp;

    av_assert2((((int) pix) & 7) == 0);
    av_assert2((stride & 7) == 0);

#define SUM(in0, in1, out0, out1)               \
    "movq (%0), %%mm2\n"                        \
    "movq 8(%0), %%mm3\n"                       \
    "add %2,%0\n"                               \
    "movq %%mm2, " #out0 "\n"                   \
    "movq %%mm3, " #out1 "\n"                   \
    "psubusb " #in0 ", %%mm2\n"                 \
    "psubusb " #in1 ", %%mm3\n"                 \
    "psubusb " #out0 ", " #in0 "\n"             \
    "psubusb " #out1 ", " #in1 "\n"             \
    "por %%mm2, " #in0 "\n"                     \
    "por %%mm3, " #in1 "\n"                     \
    "movq " #in0 ", %%mm2\n"                    \
    "movq " #in1 ", %%mm3\n"                    \
    "punpcklbw %%mm7, " #in0 "\n"               \
    "punpcklbw %%mm7, " #in1 "\n"               \
    "punpckhbw %%mm7, %%mm2\n"                  \
    "punpckhbw %%mm7, %%mm3\n"                  \
    "paddw " #in1 ", " #in0 "\n"                \
    "paddw %%mm3, %%mm2\n"                      \
    "paddw %%mm2, " #in0 "\n"                   \
    "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
        "movl    %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq  (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq  %%mm6, %%mm0\n"
        "psrlq $32,   %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq  %%mm0, %%mm6\n"
        "psrlq $16,   %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd  %%mm0, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" (stride), "m" (h)
        : "%ecx");

    return tmp & 0xFFFF;
}
#undef SUM

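/* Inter version of the above: works on the row-to-row changes of the
 * residual pix1 - pix2.  The residual bytes are biased by 0x80 (built in mm7
 * via pcmpeqw/psllw/packsswb and applied with pxor) so that the same unsigned
 * abs-diff trick can be used on signed differences. */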
static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
{
    int tmp;

    av_assert2((((int) pix1) & 7) == 0);
    av_assert2((((int) pix2) & 7) == 0);
    av_assert2((stride & 7) == 0);

#define SUM(in0, in1, out0, out1)       \
    "movq (%0), %%mm2\n"                \
    "movq (%1), " #out0 "\n"            \
    "movq 8(%0), %%mm3\n"               \
    "movq 8(%1), " #out1 "\n"           \
    "add %3, %0\n"                      \
    "add %3, %1\n"                      \
    "psubb " #out0 ", %%mm2\n"          \
    "psubb " #out1 ", %%mm3\n"          \
    "pxor %%mm7, %%mm2\n"               \
    "pxor %%mm7, %%mm3\n"               \
    "movq %%mm2, " #out0 "\n"           \
    "movq %%mm3, " #out1 "\n"           \
    "psubusb " #in0 ", %%mm2\n"         \
    "psubusb " #in1 ", %%mm3\n"         \
    "psubusb " #out0 ", " #in0 "\n"     \
    "psubusb " #out1 ", " #in1 "\n"     \
    "por %%mm2, " #in0 "\n"             \
    "por %%mm3, " #in1 "\n"             \
    "movq " #in0 ", %%mm2\n"            \
    "movq " #in1 ", %%mm3\n"            \
    "punpcklbw %%mm7, " #in0 "\n"       \
    "punpcklbw %%mm7, " #in1 "\n"       \
    "punpckhbw %%mm7, %%mm2\n"          \
    "punpckhbw %%mm7, %%mm3\n"          \
    "paddw " #in1 ", " #in0 "\n"        \
    "paddw %%mm3, %%mm2\n"              \
    "paddw %%mm2, " #in0 "\n"           \
    "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" (stride), "m" (h)
        : "%ecx");

    return tmp & 0x7FFF;
}
#undef SUM

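/* Packed 16-bit rounding constants for the averaging SAD variants: the
 * half-pel x2/y2 routines add round_tab[1] before the >> 1, and the xy2
 * routine adds round_tab[2] before the >> 2. */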
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
};

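/* 8-pixel-wide SAD primitives.  Each routine walks the rows with a negative
 * index: len = -stride * h, the block pointers are pre-biased by -len, and
 * the index register counts up towards zero ("js 1b").  mm7 must hold zero
 * and mm6 is the running word accumulator, read out later by sum_mmx(). */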
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm2 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm4 \n\t"
        "add %3, %%"FF_REG_a"           \n\t"
        "psubusb %%mm0, %%mm2           \n\t"
        "psubusb %%mm4, %%mm0           \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm1 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm5 \n\t"
        "psubusb %%mm1, %%mm3           \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "por %%mm2, %%mm0               \n\t"
        "por %%mm1, %%mm3               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm3, %%mm2              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm3, %%mm2             \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %3, %%"FF_REG_a"           \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
}

static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm1 \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm2 \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "psrlw $1, %%mm1                \n\t"
        "psrlw $1, %%mm3                \n\t"
        "packuswb %%mm3, %%mm1          \n\t"
        "psubusb %%mm1, %%mm4           \n\t"
        "psubusb %%mm2, %%mm1           \n\t"
        "por %%mm4, %%mm1               \n\t"
        "movq %%mm1, %%mm0              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %4, %%"FF_REG_a"           \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
          "r" (stride));
}

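/* SAD against the position half a pixel off in both directions: the sums
 * blk1[x] + blk1[x+1] + blk1[x+stride] + blk1[x+stride+1] are kept in word
 * precision, rounded with round_tab[2] and shifted right by 2 before the
 * absolute difference with blk2 is accumulated; the current row-pair sums
 * are carried over in mm0/mm1 for the next iteration. */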
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        "movq  (%1, %%"FF_REG_a"), %%mm0\n\t"
        "movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq  (%2, %%"FF_REG_a"), %%mm2\n\t"
        "movq 1(%2, %%"FF_REG_a"), %%mm4\n\t"
        "movq %%mm2, %%mm3              \n\t"
        "movq %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddw %%mm4, %%mm2             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "movq %5, %%mm5                 \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        "paddw %%mm5, %%mm0             \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm5 \n\t"
        "psrlw $2, %%mm0                \n\t"
        "psrlw $2, %%mm1                \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "psubusb %%mm0, %%mm4           \n\t"
        "psubusb %%mm5, %%mm0           \n\t"
        "por %%mm4, %%mm0               \n\t"
        "movq %%mm0, %%mm4              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm4         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm4, %%mm6             \n\t"
        "movq  %%mm2, %%mm0             \n\t"
        "movq  %%mm3, %%mm1             \n\t"
        "add %4, %%"FF_REG_a"           \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
          "r" (stride), "m" (round_tab[2]));
}

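/* Horizontal sum of the four 16-bit lanes accumulated in mm6; only the low
 * 16 bits of the result are meaningful, hence the final mask. */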
static inline int sum_mmx(void)
{
    int ret;
    __asm__ volatile (
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret));
    return ret & 0xFFFF;
}

static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
}

static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
}

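/* PIX_SAD() expands to the full set of entry points for one instruction-set
 * suffix: sad8/sad16 plus their x2, y2 and xy2 half-pel variants.  Each
 * wrapper clears mm6/mm7 (and loads the rounding constant where needed),
 * calls the matching sad8_*_<suffix> helper once or twice, and returns
 * sum_<suffix>(). */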
#define PIX_SAD(suf)                                                    \
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
                        uint8_t *blk1, ptrdiff_t stride, int h)         \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        :);                                                             \
                                                                        \
    sad8_1_ ## suf(blk1, blk2, stride, 8);                              \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                           uint8_t *blk1, ptrdiff_t stride, int h)      \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_x2a_ ## suf(blk1, blk2, stride, 8);                            \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                           uint8_t *blk1, ptrdiff_t stride, int h)      \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_y2a_ ## suf(blk1, blk2, stride, 8);                            \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, ptrdiff_t stride, int h)     \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        ::);                                                            \
                                                                        \
    sad8_4_ ## suf(blk1, blk2, stride, 8);                              \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2,              \
                         uint8_t *blk1, ptrdiff_t stride, int h)        \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        :);                                                             \
                                                                        \
    sad8_1_ ## suf(blk1,     blk2,     stride, h);                      \
    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, ptrdiff_t stride, int h)     \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_x2a_ ## suf(blk1,     blk2,     stride, h);                    \
    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, ptrdiff_t stride, int h)     \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_y2a_ ## suf(blk1,     blk2,     stride, h);                    \
    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
                             uint8_t *blk1, ptrdiff_t stride, int h)    \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        ::);                                                            \
                                                                        \
    sad8_4_ ## suf(blk1,     blk2,     stride, h);                      \
    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \

PIX_SAD(mmx)

#endif /* HAVE_INLINE_ASM */

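/* Runtime dispatch: each block only overrides the function pointers when the
 * corresponding instruction set is usable (INLINE_* for inline asm,
 * EXTERNAL_* for external asm), so later, more capable blocks win over
 * earlier ones.  The approximate variants are skipped when
 * AV_CODEC_FLAG_BITEXACT is set. */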
av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
{
    int cpu_flags = av_get_cpu_flags();

#if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags)) {
        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0] = sad16_mmx;
        c->sad[1] = sad8_mmx;

        c->vsad[4] = vsad_intra16_mmx;

        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
            c->vsad[0] = vsad16_mmx;
        }
    }

#endif /* HAVE_INLINE_ASM */

    if (EXTERNAL_MMX(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmx;
        c->sse[0]            = ff_sse16_mmx;
        c->sse[1]            = ff_sse8_mmx;
#if HAVE_X86ASM
        c->nsse[0]           = nsse16_mmx;
        c->nsse[1]           = nsse8_mmx;
#endif
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmxext;

        c->sad[0] = ff_sad16_mmxext;
        c->sad[1] = ff_sad8_mmxext;

        c->pix_abs[0][0] = ff_sad16_mmxext;
        c->pix_abs[0][1] = ff_sad16_x2_mmxext;
        c->pix_abs[0][2] = ff_sad16_y2_mmxext;
        c->pix_abs[1][0] = ff_sad8_mmxext;
        c->pix_abs[1][1] = ff_sad8_x2_mmxext;
        c->pix_abs[1][2] = ff_sad8_y2_mmxext;

        c->vsad[4] = ff_vsad_intra16_mmxext;
        c->vsad[5] = ff_vsad_intra8_mmxext;

        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
            c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
            c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;

            c->vsad[0] = ff_vsad16_approx_mmxext;
            c->vsad[1] = ff_vsad8_approx_mmxext;
        }
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->sse[0] = ff_sse16_sse2;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_sse2;

#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
        if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
            c->sad[0]        = ff_sad16_sse2;
            c->pix_abs[0][0] = ff_sad16_sse2;
            c->pix_abs[0][1] = ff_sad16_x2_sse2;
            c->pix_abs[0][2] = ff_sad16_y2_sse2;

            c->vsad[4]       = ff_vsad_intra16_sse2;
            if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
                c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
                c->vsad[0]       = ff_vsad16_approx_sse2;
            }
        }
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_ssse3;
#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
#endif
    }
}