FFmpeg coverage


Directory: ../../../ffmpeg/
File: src/libavcodec/x86/me_cmp_init.c
Date: 2025-01-20 09:27:23
Exec Total Coverage
Lines: 66 69 95.7%
Functions: 7 7 100.0%
Branches: 19 24 79.2%

Line Branch Exec Source
1 /*
2 * SIMD-optimized motion estimation
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
7 *
8 * This file is part of FFmpeg.
9 *
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/mem_internal.h"
28 #include "libavutil/x86/asm.h"
29 #include "libavutil/x86/cpu.h"
30 #include "libavcodec/me_cmp.h"
31 #include "libavcodec/mpegvideo.h"
32
/*
 * Prototypes for the routines this file references but does not define
 * (presumably implemented in external x86 assembly -- TODO confirm against
 * the build). The name suffix (_mmx, _mmxext, _sse2, _ssse3) matches the CPU
 * feature check under which ff_me_cmp_init_x86() installs each one.
 *
 * Apart from ff_sum_abs_dctelem_* (one int16_t block pointer) and
 * ff_hf_noise*_mmx (one pixel pointer, stride and height), every routine
 * takes a context pointer, two pixel pointers, a stride and a height, and
 * returns an int.
 */
33 int ff_sum_abs_dctelem_sse2(const int16_t *block);
34 int ff_sum_abs_dctelem_ssse3(const int16_t *block);
35 int ff_sse8_mmx(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
36 ptrdiff_t stride, int h);
37 int ff_sse16_mmx(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
38 ptrdiff_t stride, int h);
39 int ff_sse16_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
40 ptrdiff_t stride, int h);
41 int ff_hf_noise8_mmx(const uint8_t *pix1, ptrdiff_t stride, int h);
42 int ff_hf_noise16_mmx(const uint8_t *pix1, ptrdiff_t stride, int h);
43 int ff_sad8_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
44 ptrdiff_t stride, int h);
45 int ff_sad16_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
46 ptrdiff_t stride, int h);
47 int ff_sad16_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
48 ptrdiff_t stride, int h);
49 int ff_sad8_x2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
50 ptrdiff_t stride, int h);
51 int ff_sad16_x2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
52 ptrdiff_t stride, int h);
53 int ff_sad16_x2_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
54 ptrdiff_t stride, int h);
55 int ff_sad8_y2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
56 ptrdiff_t stride, int h);
57 int ff_sad16_y2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
58 ptrdiff_t stride, int h);
59 int ff_sad16_y2_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
60 ptrdiff_t stride, int h);
61 int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
62 ptrdiff_t stride, int h);
63 int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
64 ptrdiff_t stride, int h);
65 int ff_sad16_approx_xy2_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
66 ptrdiff_t stride, int h);
67 int ff_vsad_intra8_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
68 ptrdiff_t stride, int h);
69 int ff_vsad_intra16_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
70 ptrdiff_t stride, int h);
71 int ff_vsad_intra16_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
72 ptrdiff_t stride, int h);
73 int ff_vsad8_approx_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
74 ptrdiff_t stride, int h);
75 int ff_vsad16_approx_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
76 ptrdiff_t stride, int h);
77 int ff_vsad16_approx_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
78 ptrdiff_t stride, int h);
79
/*
 * hadamard_func(cpu) expands to two prototypes, ff_hadamard8_diff_<cpu> and
 * ff_hadamard8_diff16_<cpu>, both taking (context, src1, src2, stride, h)
 * and returning int. It is instantiated below for mmxext, sse2 and ssse3;
 * the resulting functions are installed into c->hadamard8_diff[] by
 * ff_me_cmp_init_x86().
 */
80 #define hadamard_func(cpu) \
81 int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, const uint8_t *src1, \
82 const uint8_t *src2, ptrdiff_t stride, int h); \
83 int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, const uint8_t *src1, \
84 const uint8_t *src2, ptrdiff_t stride, int h);
85
86 hadamard_func(mmxext)
87 hadamard_func(sse2)
88 hadamard_func(ssse3)
89
90 #if HAVE_X86ASM
/*
 * Combined score for a 16-pixel-wide block: a squared-error term plus a
 * weighted absolute difference of the two blocks' ff_hf_noise*_mmx results.
 *
 * c      context; may be NULL. When non-NULL the squared-error term comes
 *        from c->sse_cmp[0] and the weight from c->avctx->nsse_weight;
 *        when NULL, ff_sse16_mmx and a fixed weight of 8 are used.
 * pix1   first block
 * pix2   second block
 * stride passed unchanged to the helpers
 * h      passed unchanged to the helpers
 *
 * Returns score1 + FFABS(score2) * weight.
 */
91 16 static int nsse16_mmx(MpegEncContext *c, const uint8_t *pix1, const uint8_t *pix2,
92 ptrdiff_t stride, int h)
93 {
94 int score1, score2;
95
96
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 16 times.
16 if (c)
97 score1 = c->sse_cmp[0](c, pix1, pix2, stride, h);
98 else
99 16 score1 = ff_sse16_mmx(c, pix1, pix2, stride, h);
/* Noise term per block: the 16-wide helper at the block start plus the
 * 8-wide helper at offset 8; score2 is pix1's total minus pix2's total. */
100 16 score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1+8, stride, h)
101 16 - ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2+8, stride, h);
102
103
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 16 times.
16 if (c)
104 return score1 + FFABS(score2) * c->avctx->nsse_weight;
105 else
106 16 return score1 + FFABS(score2) * 8;
107 }
108
/*
 * 8-pixel-wide counterpart of nsse16_mmx: ff_sse8_mmx() supplies the
 * squared-error term and the difference of ff_hf_noise8_mmx(pix1) and
 * ff_hf_noise8_mmx(pix2) supplies the noise term.
 *
 * c      context; may be NULL. It is always forwarded to ff_sse8_mmx();
 *        the weight is c->avctx->nsse_weight when c is non-NULL, else 8.
 * pix1   first block
 * pix2   second block
 * stride passed unchanged to the helpers
 * h      passed unchanged to the helpers
 *
 * Returns score1 + FFABS(score2) * weight.
 */
109 16 static int nsse8_mmx(MpegEncContext *c, const uint8_t *pix1, const uint8_t *pix2,
110 ptrdiff_t stride, int h)
111 {
112 16 int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h);
113 16 int score2 = ff_hf_noise8_mmx(pix1, stride, h) -
114 16 ff_hf_noise8_mmx(pix2, stride, h);
115
116
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 16 times.
16 if (c)
117 return score1 + FFABS(score2) * c->avctx->nsse_weight;
118 else
119 16 return score1 + FFABS(score2) * 8;
120 }
121
122 #endif /* HAVE_X86ASM */
123
124 #if HAVE_INLINE_ASM
125
/*
 * Rounding constants for the inline MMX code: entry i holds the value i
 * replicated into each of the four 16-bit words of a 64-bit quantity.
 * In the code visible here only round_tab[2] is referenced (as the "+2"
 * added before the ">> 2" in sad8_4_mmx).
 */
126 DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
127 0x0000000000000000ULL,
128 0x0001000100010001ULL,
129 0x0002000200020002ULL,
130 };
131
/*
 * Accumulate into mm6 the absolute differences between an 8-byte-wide block
 * of blk2 and a four-sample average of blk1.
 *
 * For every row the loop forms, per byte position,
 *     (blk1[x] + blk1[x + 1] + blk1[x + stride] + blk1[x + stride + 1] + 2) >> 2
 * takes the absolute difference against the blk2 byte of that row, widens the
 * result to 16-bit words and adds it to mm6.
 *
 * Register contract (not expressed in the asm constraints): mm7 must be zero
 * and mm6 must hold the running accumulator on entry; both are set up by the
 * PIX_SADXY wrappers with pxor before this function is called, and mm6 is
 * read back afterwards by sum_mmx().
 * NOTE(review): the asm statement lists no clobbers for the MMX registers it
 * writes (mm0-mm6) -- confirm this is acceptable for the supported compilers.
 *
 * Asm operands: %0 = len (in the "a" register, used as the running byte
 * offset), %1 = blk1 - len, %2 = blk1 - len + stride, %3 = blk2 - len,
 * %4 = stride, %5 = round_tab[2].
 */
132 71328 static inline void sad8_4_mmx(const uint8_t *blk1, const uint8_t *blk2,
133 ptrdiff_t stride, int h)
134 {
/* Negative byte offset; the loop adds stride each row and stops once the
 * offset is no longer negative ("js 1b"). The pointer operands are biased
 * by -len so that base + offset starts at the first row. */
135 71328 x86_reg len = -stride * h;
136 71328 __asm__ volatile (
/* Prologue: first blk1 row; mm0/mm1 = low/high word halves of
 * blk1[x] + blk1[x + 1]. */
137 "movq (%1, %%"FF_REG_a"), %%mm0\n\t"
138 "movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
139 "movq %%mm0, %%mm1 \n\t"
140 "movq %%mm2, %%mm3 \n\t"
141 "punpcklbw %%mm7, %%mm0 \n\t"
142 "punpckhbw %%mm7, %%mm1 \n\t"
143 "punpcklbw %%mm7, %%mm2 \n\t"
144 "punpckhbw %%mm7, %%mm3 \n\t"
145 "paddw %%mm2, %%mm0 \n\t"
146 "paddw %%mm3, %%mm1 \n\t"
147 ".p2align 4 \n\t"
148 "1: \n\t"
/* Row below: mm2/mm3 = low/high word halves of its horizontal pair sums. */
149 "movq (%2, %%"FF_REG_a"), %%mm2\n\t"
150 "movq 1(%2, %%"FF_REG_a"), %%mm4\n\t"
151 "movq %%mm2, %%mm3 \n\t"
152 "movq %%mm4, %%mm5 \n\t"
153 "punpcklbw %%mm7, %%mm2 \n\t"
154 "punpckhbw %%mm7, %%mm3 \n\t"
155 "punpcklbw %%mm7, %%mm4 \n\t"
156 "punpckhbw %%mm7, %%mm5 \n\t"
157 "paddw %%mm4, %%mm2 \n\t"
158 "paddw %%mm5, %%mm3 \n\t"
/* Add the two rows' sums plus the rounding constant, then divide by four. */
159 "movq %5, %%mm5 \n\t"
160 "paddw %%mm2, %%mm0 \n\t"
161 "paddw %%mm3, %%mm1 \n\t"
162 "paddw %%mm5, %%mm0 \n\t"
163 "paddw %%mm5, %%mm1 \n\t"
/* The same blk2 row is loaded twice: one copy for each direction of the
 * saturating subtraction below. */
164 "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
165 "movq (%3, %%"FF_REG_a"), %%mm5 \n\t"
166 "psrlw $2, %%mm0 \n\t"
167 "psrlw $2, %%mm1 \n\t"
168 "packuswb %%mm1, %%mm0 \n\t"
/* |a - b| for unsigned bytes: OR of the two saturating differences. */
169 "psubusb %%mm0, %%mm4 \n\t"
170 "psubusb %%mm5, %%mm0 \n\t"
171 "por %%mm4, %%mm0 \n\t"
/* Widen the byte differences to words and accumulate into mm6. */
172 "movq %%mm0, %%mm4 \n\t"
173 "punpcklbw %%mm7, %%mm0 \n\t"
174 "punpckhbw %%mm7, %%mm4 \n\t"
175 "paddw %%mm0, %%mm6 \n\t"
176 "paddw %%mm4, %%mm6 \n\t"
/* The lower row's pair sums become the upper row of the next iteration. */
177 "movq %%mm2, %%mm0 \n\t"
178 "movq %%mm3, %%mm1 \n\t"
179 "add %4, %%"FF_REG_a" \n\t"
180 " js 1b \n\t"
181 : "+a" (len)
182 71328 : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
183 "r" (stride), "m" (round_tab[2]));
184 71328 }
185
/*
 * Horizontally add the four 16-bit words held in mm6 and return the result.
 *
 * mm6 is an implicit input: it is expected to contain the per-word
 * accumulator left behind by sad8_4_mmx(). The two shift-and-add steps
 * (by 32 bits, then by 16 bits) fold all four words into the lowest word;
 * the addition is a 16-bit paddw, so the total wraps modulo 2^16.
 * mm0 and mm6 are overwritten.
 *
 * Returns the low 16 bits of the folded sum (the upper bits of the 32-bit
 * movd result are masked off).
 */
186 35672 static inline int sum_mmx(void)
187 {
188 int ret;
189 35672 __asm__ volatile (
190 "movq %%mm6, %%mm0 \n\t"
191 "psrlq $32, %%mm6 \n\t"
192 "paddw %%mm0, %%mm6 \n\t"
193 "movq %%mm6, %%mm0 \n\t"
194 "psrlq $16, %%mm6 \n\t"
195 "paddw %%mm0, %%mm6 \n\t"
196 "movd %%mm6, %0 \n\t"
197 : "=r" (ret));
198 35672 return ret & 0xFFFF;
199 }
200
/*
 * PIX_SADXY(suf) defines two functions with the signature
 * (context, blk2, blk1, stride, h) -> int:
 *
 *   sad8_xy2_<suf>   clears mm7 and mm6, runs sad8_4_<suf>() once and
 *                    returns sum_<suf>().
 *   sad16_xy2_<suf>  same, but runs sad8_4_<suf>() twice: once at the block
 *                    start and once with both pointers advanced by 8 bytes.
 *
 * Note the parameter order: the second parameter is named blk2 and the third
 * blk1, and they are handed to sad8_4_<suf>() as (blk1, blk2). The context
 * parameter v is not used by either function.
 *
 * mm7 (zero) and mm6 (accumulator) carry state across the separate asm
 * statements, so the call sequence inside each function must stay as is.
 *
 * Instantiated below for the mmx suffix only.
 */
201 #define PIX_SADXY(suf) \
202 static int sad8_xy2_ ## suf(MpegEncContext *v, const uint8_t *blk2, \
203 const uint8_t *blk1, ptrdiff_t stride, int h) \
204 { \
205 __asm__ volatile ( \
206 "pxor %%mm7, %%mm7 \n\t" \
207 "pxor %%mm6, %%mm6 \n\t" \
208 ::); \
209 \
210 sad8_4_ ## suf(blk1, blk2, stride, h); \
211 \
212 return sum_ ## suf(); \
213 } \
214 \
215 static int sad16_xy2_ ## suf(MpegEncContext *v, const uint8_t *blk2, \
216 const uint8_t *blk1, ptrdiff_t stride, int h) \
217 { \
218 __asm__ volatile ( \
219 "pxor %%mm7, %%mm7 \n\t" \
220 "pxor %%mm6, %%mm6 \n\t" \
221 ::); \
222 \
223 sad8_4_ ## suf(blk1, blk2, stride, h); \
224 sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
225 \
226 return sum_ ## suf(); \
227 } \
228
229 71344 PIX_SADXY(mmx)
230
231 #endif /* HAVE_INLINE_ASM */
232
/*
 * Install the x86-specific comparison functions into a MECmpContext.
 *
 * c     context whose function-pointer tables (pix_abs, sse, nsse, sad,
 *       vsad, hadamard8_diff, sum_abs_dctelem) are filled in
 * avctx codec context; only avctx->flags (AV_CODEC_FLAG_BITEXACT) and
 *       avctx->codec_id (AV_CODEC_ID_SNOW) are read here
 *
 * The CPU feature checks run in the order MMX, MMXEXT, SSE2, SSSE3, and each
 * one simply overwrites entries set by an earlier check, so the last
 * matching check wins for any given slot. Slots not assigned here are left
 * untouched.
 */
233 1063 av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
234 {
235 1063 int cpu_flags = av_get_cpu_flags();
236
237 #if HAVE_INLINE_ASM
/* Inline-asm MMX xy2 SAD; may be replaced below by the MMXEXT/SSE2
 * "approx" versions when bit-exact output is not requested. */
238

2/2
✓ Branch 0 taken 70 times.
✓ Branch 1 taken 993 times.
1063 if (INLINE_MMX(cpu_flags)) {
239 70 c->pix_abs[0][3] = sad16_xy2_mmx;
240 70 c->pix_abs[1][3] = sad8_xy2_mmx;
241 }
242
243 #endif /* HAVE_INLINE_ASM */
244
245
2/2
✓ Branch 0 taken 70 times.
✓ Branch 1 taken 993 times.
1063 if (EXTERNAL_MMX(cpu_flags)) {
246 70 c->sse[1] = ff_sse8_mmx;
247 #if HAVE_X86ASM
248 70 c->nsse[0] = nsse16_mmx;
249 70 c->nsse[1] = nsse8_mmx;
250 #endif
251 }
252
253
2/2
✓ Branch 0 taken 69 times.
✓ Branch 1 taken 994 times.
1063 if (EXTERNAL_MMXEXT(cpu_flags)) {
/* The MMXEXT hadamard functions are installed only when the build does
 * not have an aligned stack; the SSE2/SSSE3 ones below are installed
 * only when it does. */
254 #if !HAVE_ALIGNED_STACK
255 c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
256 c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
257 #endif
258
259 69 c->sad[0] = ff_sad16_mmxext;
260 69 c->sad[1] = ff_sad8_mmxext;
261
262 69 c->pix_abs[0][0] = ff_sad16_mmxext;
263 69 c->pix_abs[0][1] = ff_sad16_x2_mmxext;
264 69 c->pix_abs[0][2] = ff_sad16_y2_mmxext;
265 69 c->pix_abs[1][0] = ff_sad8_mmxext;
266 69 c->pix_abs[1][1] = ff_sad8_x2_mmxext;
267 69 c->pix_abs[1][2] = ff_sad8_y2_mmxext;
268
269 69 c->vsad[4] = ff_vsad_intra16_mmxext;
270 69 c->vsad[5] = ff_vsad_intra8_mmxext;
271
/* The "approx" variants are used only when bit-exact output is not
 * requested. */
272
2/2
✓ Branch 0 taken 56 times.
✓ Branch 1 taken 13 times.
69 if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
273 56 c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
274 56 c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;
275
276 56 c->vsad[0] = ff_vsad16_approx_mmxext;
277 56 c->vsad[1] = ff_vsad8_approx_mmxext;
278 }
279 }
280
281
2/2
✓ Branch 0 taken 67 times.
✓ Branch 1 taken 996 times.
1063 if (EXTERNAL_SSE2(cpu_flags)) {
282 67 c->sse[0] = ff_sse16_sse2;
283 67 c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;
284
285 #if HAVE_ALIGNED_STACK
286 67 c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
287 67 c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
288 #endif
/* The 16-wide SSE2 SAD/VSAD functions are skipped when the CPU reports
 * AV_CPU_FLAG_SSE2SLOW or when the codec is Snow; only the [0] / [0][x] /
 * vsad[4] (16-wide) slots are overridden, the 8-wide ones keep their
 * earlier values. */
289
2/4
✓ Branch 0 taken 67 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 67 times.
✗ Branch 3 not taken.
67 if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
290 67 c->sad[0] = ff_sad16_sse2;
291 67 c->pix_abs[0][0] = ff_sad16_sse2;
292 67 c->pix_abs[0][1] = ff_sad16_x2_sse2;
293 67 c->pix_abs[0][2] = ff_sad16_y2_sse2;
294
295 67 c->vsad[4] = ff_vsad_intra16_sse2;
296
2/2
✓ Branch 0 taken 56 times.
✓ Branch 1 taken 11 times.
67 if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
297 56 c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
298 56 c->vsad[0] = ff_vsad16_approx_sse2;
299 }
300 }
301 }
302
303
2/2
✓ Branch 0 taken 65 times.
✓ Branch 1 taken 998 times.
1063 if (EXTERNAL_SSSE3(cpu_flags)) {
304 65 c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3;
305 #if HAVE_ALIGNED_STACK
306 65 c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
307 65 c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
308 #endif
309 }
310 1063 }
311