Line | Branch | Exec | Source |
---|---|---|---|
1 | /* | ||
2 | * SIMD-optimized motion estimation | ||
3 | * Copyright (c) 2000, 2001 Fabrice Bellard | ||
4 | * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> | ||
5 | * | ||
6 | * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | ||
7 | * | ||
8 | * This file is part of FFmpeg. | ||
9 | * | ||
10 | * FFmpeg is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU Lesser General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2.1 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * FFmpeg is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * Lesser General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU Lesser General Public | ||
21 | * License along with FFmpeg; if not, write to the Free Software | ||
22 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
23 | */ | ||
24 | |||
25 | #include "libavutil/attributes.h" | ||
26 | #include "libavutil/cpu.h" | ||
27 | #include "libavutil/mem_internal.h" | ||
28 | #include "libavutil/x86/asm.h" | ||
29 | #include "libavutil/x86/cpu.h" | ||
30 | #include "libavcodec/me_cmp.h" | ||
31 | #include "libavcodec/mpegvideo.h" | ||
32 | /* Prototypes for comparison kernels that are declared here but defined outside this file (presumably in the external x86 assembly; confirm). The name suffix (mmx, mmxext, sse2, ssse3) matches the CPU-feature check under which ff_me_cmp_init_x86() installs the routine, either directly or through the nsse wrappers that call it. */ |||
33 | int ff_sum_abs_dctelem_sse2(const int16_t *block); | ||
34 | int ff_sum_abs_dctelem_ssse3(const int16_t *block); | ||
35 | int ff_sse8_mmx(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, | ||
36 | ptrdiff_t stride, int h); | ||
37 | int ff_sse16_mmx(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, | ||
38 | ptrdiff_t stride, int h); | ||
39 | int ff_sse16_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, | ||
40 | ptrdiff_t stride, int h); | ||
41 | int ff_hf_noise8_mmx(const uint8_t *pix1, ptrdiff_t stride, int h); | ||
42 | int ff_hf_noise16_mmx(const uint8_t *pix1, ptrdiff_t stride, int h); | ||
43 | int ff_sad8_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, | ||
44 | ptrdiff_t stride, int h); | ||
45 | int ff_sad16_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, | ||
46 | ptrdiff_t stride, int h); | ||
47 | int ff_sad16_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, | ||
48 | ptrdiff_t stride, int h); | ||
49 | int ff_sad8_x2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, | ||
50 | ptrdiff_t stride, int h); | ||
51 | int ff_sad16_x2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, | ||
52 | ptrdiff_t stride, int h); | ||
53 | int ff_sad16_x2_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, | ||
54 | ptrdiff_t stride, int h); | ||
55 | int ff_sad8_y2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, | ||
56 | ptrdiff_t stride, int h); | ||
57 | int ff_sad16_y2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, | ||
58 | ptrdiff_t stride, int h); | ||
59 | int ff_sad16_y2_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, | ||
60 | ptrdiff_t stride, int h); | ||
61 | int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, | ||
62 | ptrdiff_t stride, int h); | ||
63 | int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, | ||
64 | ptrdiff_t stride, int h); | ||
65 | int ff_sad16_approx_xy2_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, | ||
66 | ptrdiff_t stride, int h); | ||
67 | int ff_vsad_intra8_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, | ||
68 | ptrdiff_t stride, int h); | ||
69 | int ff_vsad_intra16_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, | ||
70 | ptrdiff_t stride, int h); | ||
71 | int ff_vsad_intra16_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, | ||
72 | ptrdiff_t stride, int h); | ||
73 | int ff_vsad8_approx_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, | ||
74 | ptrdiff_t stride, int h); | ||
75 | int ff_vsad16_approx_mmxext(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, | ||
76 | ptrdiff_t stride, int h); | ||
77 | int ff_vsad16_approx_sse2(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, | ||
78 | ptrdiff_t stride, int h); | ||
79 | /* hadamard_func(cpu) declares the prototype pair ff_hadamard8_diff_<cpu> and ff_hadamard8_diff16_<cpu>; the three uses below instantiate it for mmxext, sse2 and ssse3. The definitions are not in this file. */ |||
80 | #define hadamard_func(cpu) \ | ||
81 | int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, const uint8_t *src1, \ | ||
82 | const uint8_t *src2, ptrdiff_t stride, int h); \ | ||
83 | int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, const uint8_t *src1, \ | ||
84 | const uint8_t *src2, ptrdiff_t stride, int h); | ||
85 | |||
86 | hadamard_func(mmxext) | ||
87 | hadamard_func(sse2) | ||
88 | hadamard_func(ssse3) | ||
89 | |||
90 | #if HAVE_X86ASM | ||
91 | 16 | /* 16-pixel-wide NSSE comparison: an SSE term plus a weighted penalty for how much the ff_hf_noise*_mmx measure differs between pix1 and pix2. c may be NULL, in which case ff_sse16_mmx() and a fixed weight of 8 are used. Returns score1 + FFABS(score2) * weight. */ static int nsse16_mmx(MpegEncContext *c, const uint8_t *pix1, const uint8_t *pix2, | |
92 | ptrdiff_t stride, int h) | ||
93 | { | ||
94 | int score1, score2; | ||
95 | /* SSE term: go through the context's sse_cmp[0] when a context is supplied, otherwise call the MMX routine directly. */ |||
96 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 16 times.
|
16 | if (c) |
97 | ✗ | score1 = c->sse_cmp[0](c, pix1, pix2, stride, h); | |
98 | else | ||
99 | 16 | score1 = ff_sse16_mmx(c, pix1, pix2, stride, h); | |
100 | 16 | score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1+8, stride, h) | |
101 | 16 | - ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2+8, stride, h); | |
102 | /* score2 is (noise measure of pix1) minus (noise measure of pix2); only its magnitude is used, scaled by nsse_weight from the codec context or by 8 when there is no context. */ |||
103 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 16 times.
|
16 | if (c) |
104 | ✗ | return score1 + FFABS(score2) * c->avctx->nsse_weight; | |
105 | else | ||
106 | 16 | return score1 + FFABS(score2) * 8; | |
107 | } | ||
108 | /* 8-pixel-wide counterpart of nsse16_mmx: ff_sse8_mmx() supplies the SSE term (c is passed through unchanged, NULL included) and the noise term is the difference of ff_hf_noise8_mmx() over pix1 and pix2. Returns score1 + FFABS(score2) * weight, where weight is c->avctx->nsse_weight, or 8 when c is NULL. */ |||
109 | 16 | static int nsse8_mmx(MpegEncContext *c, const uint8_t *pix1, const uint8_t *pix2, | |
110 | ptrdiff_t stride, int h) | ||
111 | { | ||
112 | 16 | int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h); | |
113 | 16 | int score2 = ff_hf_noise8_mmx(pix1, stride, h) - | |
114 | 16 | ff_hf_noise8_mmx(pix2, stride, h); | |
115 | |||
116 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 16 times.
|
16 | if (c) |
117 | ✗ | return score1 + FFABS(score2) * c->avctx->nsse_weight; | |
118 | else | ||
119 | 16 | return score1 + FFABS(score2) * 8; | |
120 | } | ||
121 | |||
122 | #endif /* HAVE_X86ASM */ | ||
123 | |||
124 | #if HAVE_INLINE_ASM | ||
125 | /* Rounding constants for the inline MMX code: entry i holds the value i replicated in each of the four 16-bit lanes of a quadword. sad8_4_mmx() adds round_tab[2] before its shift right by 2. The first macro argument (8) is presumably the requested alignment; confirm against the DECLARE_ASM_CONST definition. */ |||
126 | DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = { | ||
127 | 0x0000000000000000ULL, | ||
128 | 0x0001000100010001ULL, | ||
129 | 0x0002000200020002ULL, | ||
130 | }; | ||
131 | /* Adds to the %mm6 accumulator the sum of absolute differences between an 8-pixel-wide, h-row block at blk2 and the 2x2 average of blk1: per pixel, abs(blk2[x] - ((a + b + c + d + 2) >> 2)) with a, b = blk1[x], blk1[x+1] on the current row and c, d the same columns one stride further down. Preconditions visible in the code: %mm7 is used as the zero operand for unpacking and %mm6 as the running total, so both must be prepared by the caller (the PIX_SADXY wrappers do this). Reads 9 bytes per row over h+1 rows of blk1. NOTE(review): the asm statement lists no clobbers, so the compiler is not told that %mm0-%mm6 change. */ |||
132 | 71328 | static inline void sad8_4_mmx(const uint8_t *blk1, const uint8_t *blk2, | |
133 | ptrdiff_t stride, int h) | ||
134 | { | ||
135 | 71328 | x86_reg len = -stride * h; /* negative byte offset that is stepped up towards 0 */ | |
136 | 71328 | __asm__ volatile ( | |
137 | "movq (%1, %%"FF_REG_a"), %%mm0\n\t" /* first blk1 row, pixels x */ | ||
138 | "movq 1(%1, %%"FF_REG_a"), %%mm2\n\t" /* first blk1 row, pixels x+1 */ | ||
139 | "movq %%mm0, %%mm1 \n\t" | ||
140 | "movq %%mm2, %%mm3 \n\t" | ||
141 | "punpcklbw %%mm7, %%mm0 \n\t" /* widen bytes to 16-bit words using %mm7 */ | ||
142 | "punpckhbw %%mm7, %%mm1 \n\t" | ||
143 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
144 | "punpckhbw %%mm7, %%mm3 \n\t" | ||
145 | "paddw %%mm2, %%mm0 \n\t" /* mm0/mm1 = low/high words of row[x] + row[x+1] */ | ||
146 | "paddw %%mm3, %%mm1 \n\t" | ||
147 | ".p2align 4 \n\t" | ||
148 | "1: \n\t" | ||
149 | "movq (%2, %%"FF_REG_a"), %%mm2\n\t" /* following blk1 row, pixels x */ | ||
150 | "movq 1(%2, %%"FF_REG_a"), %%mm4\n\t" /* following blk1 row, pixels x+1 */ | ||
151 | "movq %%mm2, %%mm3 \n\t" | ||
152 | "movq %%mm4, %%mm5 \n\t" | ||
153 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
154 | "punpckhbw %%mm7, %%mm3 \n\t" | ||
155 | "punpcklbw %%mm7, %%mm4 \n\t" | ||
156 | "punpckhbw %%mm7, %%mm5 \n\t" | ||
157 | "paddw %%mm4, %%mm2 \n\t" /* mm2/mm3 = pair sums of the following row */ | ||
158 | "paddw %%mm5, %%mm3 \n\t" | ||
159 | "movq %5, %%mm5 \n\t" /* rounding constant round_tab[2] */ | ||
160 | "paddw %%mm2, %%mm0 \n\t" /* four-pixel sum = previous pair sums + current pair sums */ | ||
161 | "paddw %%mm3, %%mm1 \n\t" | ||
162 | "paddw %%mm5, %%mm0 \n\t" | ||
163 | "paddw %%mm5, %%mm1 \n\t" | ||
164 | "movq (%3, %%"FF_REG_a"), %%mm4 \n\t" /* blk2 row, loaded twice: one copy per saturating subtraction */ | ||
165 | "movq (%3, %%"FF_REG_a"), %%mm5 \n\t" | ||
166 | "psrlw $2, %%mm0 \n\t" /* (sum + 2) >> 2 */ | ||
167 | "psrlw $2, %%mm1 \n\t" | ||
168 | "packuswb %%mm1, %%mm0 \n\t" /* back to 8 bytes */ | ||
169 | "psubusb %%mm0, %%mm4 \n\t" /* absolute difference built from two unsigned saturating subtractions ORed together */ | ||
170 | "psubusb %%mm5, %%mm0 \n\t" | ||
171 | "por %%mm4, %%mm0 \n\t" | ||
172 | "movq %%mm0, %%mm4 \n\t" | ||
173 | "punpcklbw %%mm7, %%mm0 \n\t" | ||
174 | "punpckhbw %%mm7, %%mm4 \n\t" | ||
175 | "paddw %%mm0, %%mm6 \n\t" /* accumulate into the four 16-bit lanes of %mm6 */ | ||
176 | "paddw %%mm4, %%mm6 \n\t" | ||
177 | "movq %%mm2, %%mm0 \n\t" /* current pair sums become the previous row for the next iteration */ | ||
178 | "movq %%mm3, %%mm1 \n\t" | ||
179 | "add %4, %%"FF_REG_a" \n\t" /* advance one stride */ | ||
180 | " js 1b \n\t" /* loop while the offset is still negative */ | ||
181 | : "+a" (len) | ||
182 | 71328 | : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), /* bases are biased by -len so base + offset starts at the block and the offset ends at 0 */ | |
183 | "r" (stride), "m" (round_tab[2])); | ||
184 | 71328 | } | |
185 | /* Horizontal reduction of %mm6: folds the upper 32 bits onto the lower 32, then the upper 16 onto the lower 16, using 16-bit adds, so the low word ends up holding the sum of all four 16-bit lanes. The return value is masked to 16 bits, i.e. the total is taken modulo 65536. Overwrites %mm0 and %mm6; the asm statement declares only the output operand. */ |||
186 | 35672 | static inline int sum_mmx(void) | |
187 | { | ||
188 | int ret; | ||
189 | 35672 | __asm__ volatile ( | |
190 | "movq %%mm6, %%mm0 \n\t" | ||
191 | "psrlq $32, %%mm6 \n\t" /* bring lanes 2 and 3 down onto lanes 0 and 1 */ | ||
192 | "paddw %%mm0, %%mm6 \n\t" | ||
193 | "movq %%mm6, %%mm0 \n\t" | ||
194 | "psrlq $16, %%mm6 \n\t" /* bring lane 1 down onto lane 0 */ | ||
195 | "paddw %%mm0, %%mm6 \n\t" | ||
196 | "movd %%mm6, %0 \n\t" | ||
197 | : "=r" (ret)); | ||
198 | 35672 | return ret & 0xFFFF; /* only the low word is the finished sum */ | |
199 | } | ||
200 | /* PIX_SADXY(suf) defines sad8_xy2_<suf> and sad16_xy2_<suf>. Each clears %mm7 (zero operand for unpacking) and %mm6 (accumulator), runs sad8_4_<suf> once for the 8-wide variant or twice (byte columns 0 and 8) for the 16-wide one, and returns sum_<suf>(). Note the parameter order: the second parameter is named blk2 and the third blk1, and they are forwarded as (blk1, blk2), so the third argument becomes the first argument of sad8_4_<suf>. The context parameter v is not used. Instantiated below for suf = mmx. */ |||
201 | #define PIX_SADXY(suf) \ | ||
202 | static int sad8_xy2_ ## suf(MpegEncContext *v, const uint8_t *blk2, \ | ||
203 | const uint8_t *blk1, ptrdiff_t stride, int h) \ | ||
204 | { \ | ||
205 | __asm__ volatile ( \ | ||
206 | "pxor %%mm7, %%mm7 \n\t" \ | ||
207 | "pxor %%mm6, %%mm6 \n\t" \ | ||
208 | ::); \ | ||
209 | \ | ||
210 | sad8_4_ ## suf(blk1, blk2, stride, h); \ | ||
211 | \ | ||
212 | return sum_ ## suf(); \ | ||
213 | } \ | ||
214 | \ | ||
215 | static int sad16_xy2_ ## suf(MpegEncContext *v, const uint8_t *blk2, \ | ||
216 | const uint8_t *blk1, ptrdiff_t stride, int h) \ | ||
217 | { \ | ||
218 | __asm__ volatile ( \ | ||
219 | "pxor %%mm7, %%mm7 \n\t" \ | ||
220 | "pxor %%mm6, %%mm6 \n\t" \ | ||
221 | ::); \ | ||
222 | \ | ||
223 | sad8_4_ ## suf(blk1, blk2, stride, h); \ | ||
224 | sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ | ||
225 | \ | ||
226 | return sum_ ## suf(); \ | ||
227 | } \ | ||
228 | |||
229 | 71344 | PIX_SADXY(mmx) | |
230 | |||
231 | #endif /* HAVE_INLINE_ASM */ | ||
232 | /* Installs x86-optimized comparison functions into c according to the run-time CPU flags and the build-time HAVE_* switches. The checks run from the oldest instruction set to the newest and each one overwrites the slots it covers, so the last matching check wins. avctx is read for AV_CODEC_FLAG_BITEXACT (the *_approx_* routines are installed only when that flag is clear) and for codec_id (the 16-wide SSE2 SAD routines are skipped for AV_CODEC_ID_SNOW). As the assigned names show, index 0 of sse, nsse, sad, hadamard8_diff and pix_abs receives the 16-wide routine and index 1 the 8-wide one; pix_abs[..][0..3] receive the plain, x2, y2 and xy2 variants. */ |||
233 | 1063 | av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx) | |
234 | { | ||
235 | 1063 | int cpu_flags = av_get_cpu_flags(); | |
236 | /* Inline-asm MMX versions of the xy2 comparisons; the MMXEXT and SSE2 checks below may replace them. */ |||
237 | #if HAVE_INLINE_ASM | ||
238 |
2/2✓ Branch 0 taken 70 times.
✓ Branch 1 taken 993 times.
|
1063 | if (INLINE_MMX(cpu_flags)) { |
239 | 70 | c->pix_abs[0][3] = sad16_xy2_mmx; | |
240 | 70 | c->pix_abs[1][3] = sad8_xy2_mmx; | |
241 | } | ||
242 | |||
243 | #endif /* HAVE_INLINE_ASM */ | ||
244 | /* MMX level: the 8-wide SSE routine, plus the NSSE wrappers when HAVE_X86ASM is set. */ |||
245 |
2/2✓ Branch 0 taken 70 times.
✓ Branch 1 taken 993 times.
|
1063 | if (EXTERNAL_MMX(cpu_flags)) { |
246 | 70 | c->sse[1] = ff_sse8_mmx; | |
247 | #if HAVE_X86ASM | ||
248 | 70 | c->nsse[0] = nsse16_mmx; | |
249 | 70 | c->nsse[1] = nsse8_mmx; | |
250 | #endif | ||
251 | } | ||
252 | /* MMXEXT level. The Hadamard routines are taken from here only when the build lacks HAVE_ALIGNED_STACK; builds that have it get them from the SSE2/SSSE3 checks instead. */ |||
253 |
2/2✓ Branch 0 taken 69 times.
✓ Branch 1 taken 994 times.
|
1063 | if (EXTERNAL_MMXEXT(cpu_flags)) { |
254 | #if !HAVE_ALIGNED_STACK | ||
255 | c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext; | ||
256 | c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext; | ||
257 | #endif | ||
258 | |||
259 | 69 | c->sad[0] = ff_sad16_mmxext; | |
260 | 69 | c->sad[1] = ff_sad8_mmxext; | |
261 | |||
262 | 69 | c->pix_abs[0][0] = ff_sad16_mmxext; | |
263 | 69 | c->pix_abs[0][1] = ff_sad16_x2_mmxext; | |
264 | 69 | c->pix_abs[0][2] = ff_sad16_y2_mmxext; | |
265 | 69 | c->pix_abs[1][0] = ff_sad8_mmxext; | |
266 | 69 | c->pix_abs[1][1] = ff_sad8_x2_mmxext; | |
267 | 69 | c->pix_abs[1][2] = ff_sad8_y2_mmxext; | |
268 | |||
269 | 69 | c->vsad[4] = ff_vsad_intra16_mmxext; | |
270 | 69 | c->vsad[5] = ff_vsad_intra8_mmxext; | |
271 | |||
272 |
2/2✓ Branch 0 taken 56 times.
✓ Branch 1 taken 13 times.
|
69 | if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) { |
273 | 56 | c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext; | |
274 | 56 | c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext; | |
275 | |||
276 | 56 | c->vsad[0] = ff_vsad16_approx_mmxext; | |
277 | 56 | c->vsad[1] = ff_vsad8_approx_mmxext; | |
278 | } | ||
279 | } | ||
280 | /* SSE2 level. sse[0] and sum_abs_dctelem are set for every SSE2 CPU; the 16-wide SAD family is replaced only if the CPU is not flagged AV_CPU_FLAG_SSE2SLOW and the codec is not AV_CODEC_ID_SNOW. */ |||
281 |
2/2✓ Branch 0 taken 67 times.
✓ Branch 1 taken 996 times.
|
1063 | if (EXTERNAL_SSE2(cpu_flags)) { |
282 | 67 | c->sse[0] = ff_sse16_sse2; | |
283 | 67 | c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2; | |
284 | |||
285 | #if HAVE_ALIGNED_STACK | ||
286 | 67 | c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2; | |
287 | 67 | c->hadamard8_diff[1] = ff_hadamard8_diff_sse2; | |
288 | #endif | ||
289 |
2/4✓ Branch 0 taken 67 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 67 times.
✗ Branch 3 not taken.
|
67 | if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) { |
290 | 67 | c->sad[0] = ff_sad16_sse2; | |
291 | 67 | c->pix_abs[0][0] = ff_sad16_sse2; | |
292 | 67 | c->pix_abs[0][1] = ff_sad16_x2_sse2; | |
293 | 67 | c->pix_abs[0][2] = ff_sad16_y2_sse2; | |
294 | |||
295 | 67 | c->vsad[4] = ff_vsad_intra16_sse2; | |
296 |
2/2✓ Branch 0 taken 56 times.
✓ Branch 1 taken 11 times.
|
67 | if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) { |
297 | 56 | c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2; | |
298 | 56 | c->vsad[0] = ff_vsad16_approx_sse2; | |
299 | } | ||
300 | } | ||
301 | } | ||
302 | /* SSSE3 level: overrides sum_abs_dctelem and, with HAVE_ALIGNED_STACK, the Hadamard routines. */ |||
303 |
2/2✓ Branch 0 taken 65 times.
✓ Branch 1 taken 998 times.
|
1063 | if (EXTERNAL_SSSE3(cpu_flags)) { |
304 | 65 | c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3; | |
305 | #if HAVE_ALIGNED_STACK | ||
306 | 65 | c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3; | |
307 | 65 | c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3; | |
308 | #endif | ||
309 | } | ||
310 | 1063 | } | |
311 |