Line | Branch | Exec | Source |
---|---|---|---|
1 | /* | ||
2 | * SIMD-optimized halfpel functions | ||
3 | * Copyright (c) 2000, 2001 Fabrice Bellard | ||
4 | * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> | ||
5 | * | ||
6 | * This file is part of FFmpeg. | ||
7 | * | ||
8 | * FFmpeg is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU Lesser General Public | ||
10 | * License as published by the Free Software Foundation; either | ||
11 | * version 2.1 of the License, or (at your option) any later version. | ||
12 | * | ||
13 | * FFmpeg is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
16 | * Lesser General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU Lesser General Public | ||
19 | * License along with FFmpeg; if not, write to the Free Software | ||
20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
21 | * | ||
22 | * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | ||
23 | */ | ||
24 | |||
25 | #include <stddef.h> | ||
26 | #include <stdint.h> | ||
27 | |||
28 | #include "libavutil/attributes.h" | ||
29 | #include "libavutil/cpu.h" | ||
30 | #include "libavutil/x86/cpu.h" | ||
31 | #include "libavcodec/avcodec.h" | ||
32 | #include "libavcodec/hpeldsp.h" | ||
33 | #include "libavcodec/pixels.h" | ||
34 | #include "fpel.h" | ||
35 | #include "hpeldsp.h" | ||
36 | |||
37 | void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, | ||
38 | ptrdiff_t line_size, int h); | ||
39 | void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels, | ||
40 | ptrdiff_t line_size, int h); | ||
41 | void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels, | ||
42 | ptrdiff_t line_size, int h); | ||
43 | void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels, | ||
44 | ptrdiff_t line_size, int h); | ||
45 | void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, | ||
46 | ptrdiff_t line_size, int h); | ||
47 | void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, | ||
48 | ptrdiff_t line_size, int h); | ||
49 | void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, | ||
50 | ptrdiff_t line_size, int h); | ||
51 | void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block, | ||
52 | const uint8_t *pixels, | ||
53 | ptrdiff_t line_size, int h); | ||
54 | void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, | ||
55 | ptrdiff_t line_size, int h); | ||
56 | void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, | ||
57 | ptrdiff_t line_size, int h); | ||
58 | void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block, | ||
59 | const uint8_t *pixels, | ||
60 | ptrdiff_t line_size, int h); | ||
61 | void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, | ||
62 | ptrdiff_t line_size, int h); | ||
63 | void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, | ||
64 | ptrdiff_t line_size, int h); | ||
65 | void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels, | ||
66 | ptrdiff_t line_size, int h); | ||
67 | |||
68 | #define put_pixels8_mmx ff_put_pixels8_mmx | ||
69 | #define put_pixels16_mmx ff_put_pixels16_mmx | ||
70 | #define put_pixels8_xy2_mmx ff_put_pixels8_xy2_mmx | ||
71 | #define put_no_rnd_pixels8_mmx ff_put_pixels8_mmx | ||
72 | #define put_no_rnd_pixels16_mmx ff_put_pixels16_mmx | ||
73 | |||
74 | #if HAVE_INLINE_ASM | ||
75 | |||
76 | /***********************************/ | ||
77 | /* MMX no rounding */ | ||
78 | #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx | ||
79 | #define SET_RND MOVQ_WONE | ||
80 | #define STATIC static | ||
81 | |||
82 | #include "rnd_template.c" | ||
83 | |||
84 | #undef DEF | ||
85 | #undef SET_RND | ||
86 | #undef STATIC | ||
87 | |||
88 | // this routine is 'slightly' suboptimal but mostly unused | ||
89 | ✗ | static void avg_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels, | |
90 | ptrdiff_t line_size, int h) | ||
91 | { | ||
92 | ✗ | MOVQ_ZERO(mm7); | |
93 | ✗ | MOVQ_WONE(mm6); // =2 for rnd and =1 for no_rnd version | |
94 | ✗ | __asm__ volatile( | |
95 | "movq (%1), %%mm0 \n\t" | ||
96 | "movq 1(%1), %%mm4 \n\t" | ||
97 | "movq %%mm0, %%mm1 \n\t" | ||
98 | "movq %%mm4, %%mm5 \n\t" | ||
99 | "punpcklbw %%mm7, %%mm0 \n\t" | ||
100 | "punpcklbw %%mm7, %%mm4 \n\t" | ||
101 | "punpckhbw %%mm7, %%mm1 \n\t" | ||
102 | "punpckhbw %%mm7, %%mm5 \n\t" | ||
103 | "paddusw %%mm0, %%mm4 \n\t" | ||
104 | "paddusw %%mm1, %%mm5 \n\t" | ||
105 | "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" | ||
106 | "add %3, %1 \n\t" | ||
107 | ".p2align 3 \n\t" | ||
108 | "1: \n\t" | ||
109 | "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" | ||
110 | "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t" | ||
111 | "movq %%mm0, %%mm1 \n\t" | ||
112 | "movq %%mm2, %%mm3 \n\t" | ||
113 | "punpcklbw %%mm7, %%mm0 \n\t" | ||
114 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
115 | "punpckhbw %%mm7, %%mm1 \n\t" | ||
116 | "punpckhbw %%mm7, %%mm3 \n\t" | ||
117 | "paddusw %%mm2, %%mm0 \n\t" | ||
118 | "paddusw %%mm3, %%mm1 \n\t" | ||
119 | "paddusw %%mm6, %%mm4 \n\t" | ||
120 | "paddusw %%mm6, %%mm5 \n\t" | ||
121 | "paddusw %%mm0, %%mm4 \n\t" | ||
122 | "paddusw %%mm1, %%mm5 \n\t" | ||
123 | "psrlw $2, %%mm4 \n\t" | ||
124 | "psrlw $2, %%mm5 \n\t" | ||
125 | "movq (%2, %%"FF_REG_a"), %%mm3 \n\t" | ||
126 | "packuswb %%mm5, %%mm4 \n\t" | ||
127 | "pcmpeqd %%mm2, %%mm2 \n\t" | ||
128 | "paddb %%mm2, %%mm2 \n\t" | ||
129 | PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2) | ||
130 | "movq %%mm5, (%2, %%"FF_REG_a") \n\t" | ||
131 | "add %3, %%"FF_REG_a" \n\t" | ||
132 | |||
133 | "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 | ||
134 | "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t" | ||
135 | "movq %%mm2, %%mm3 \n\t" | ||
136 | "movq %%mm4, %%mm5 \n\t" | ||
137 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
138 | "punpcklbw %%mm7, %%mm4 \n\t" | ||
139 | "punpckhbw %%mm7, %%mm3 \n\t" | ||
140 | "punpckhbw %%mm7, %%mm5 \n\t" | ||
141 | "paddusw %%mm2, %%mm4 \n\t" | ||
142 | "paddusw %%mm3, %%mm5 \n\t" | ||
143 | "paddusw %%mm6, %%mm0 \n\t" | ||
144 | "paddusw %%mm6, %%mm1 \n\t" | ||
145 | "paddusw %%mm4, %%mm0 \n\t" | ||
146 | "paddusw %%mm5, %%mm1 \n\t" | ||
147 | "psrlw $2, %%mm0 \n\t" | ||
148 | "psrlw $2, %%mm1 \n\t" | ||
149 | "movq (%2, %%"FF_REG_a"), %%mm3 \n\t" | ||
150 | "packuswb %%mm1, %%mm0 \n\t" | ||
151 | "pcmpeqd %%mm2, %%mm2 \n\t" | ||
152 | "paddb %%mm2, %%mm2 \n\t" | ||
153 | PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2) | ||
154 | "movq %%mm1, (%2, %%"FF_REG_a") \n\t" | ||
155 | "add %3, %%"FF_REG_a" \n\t" | ||
156 | |||
157 | "subl $2, %0 \n\t" | ||
158 | "jnz 1b \n\t" | ||
159 | :"+g"(h), "+S"(pixels) | ||
160 | :"D"(block), "r"((x86_reg)line_size) | ||
161 | :FF_REG_a, "memory"); | ||
162 | ✗ | } | |
163 | |||
164 | ✗ | static void put_no_rnd_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |
165 | { | ||
166 | ✗ | MOVQ_BFE(mm6); | |
167 | ✗ | __asm__ volatile( | |
168 | "lea (%3, %3), %%"FF_REG_a" \n\t" | ||
169 | ".p2align 3 \n\t" | ||
170 | "1: \n\t" | ||
171 | "movq (%1), %%mm0 \n\t" | ||
172 | "movq 1(%1), %%mm1 \n\t" | ||
173 | "movq (%1, %3), %%mm2 \n\t" | ||
174 | "movq 1(%1, %3), %%mm3 \n\t" | ||
175 | PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | ||
176 | "movq %%mm4, (%2) \n\t" | ||
177 | "movq %%mm5, (%2, %3) \n\t" | ||
178 | "add %%"FF_REG_a", %1 \n\t" | ||
179 | "add %%"FF_REG_a", %2 \n\t" | ||
180 | "movq (%1), %%mm0 \n\t" | ||
181 | "movq 1(%1), %%mm1 \n\t" | ||
182 | "movq (%1, %3), %%mm2 \n\t" | ||
183 | "movq 1(%1, %3), %%mm3 \n\t" | ||
184 | PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | ||
185 | "movq %%mm4, (%2) \n\t" | ||
186 | "movq %%mm5, (%2, %3) \n\t" | ||
187 | "add %%"FF_REG_a", %1 \n\t" | ||
188 | "add %%"FF_REG_a", %2 \n\t" | ||
189 | "subl $4, %0 \n\t" | ||
190 | "jnz 1b \n\t" | ||
191 | :"+g"(h), "+S"(pixels), "+D"(block) | ||
192 | :"r"((x86_reg)line_size) | ||
193 | :FF_REG_a, "memory"); | ||
194 | ✗ | } | |
195 | |||
196 | 1096 | static void put_no_rnd_pixels16_x2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |
197 | { | ||
198 | 1096 | MOVQ_BFE(mm6); | |
199 | 1096 | __asm__ volatile( | |
200 | "lea (%3, %3), %%"FF_REG_a" \n\t" | ||
201 | ".p2align 3 \n\t" | ||
202 | "1: \n\t" | ||
203 | "movq (%1), %%mm0 \n\t" | ||
204 | "movq 1(%1), %%mm1 \n\t" | ||
205 | "movq (%1, %3), %%mm2 \n\t" | ||
206 | "movq 1(%1, %3), %%mm3 \n\t" | ||
207 | PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | ||
208 | "movq %%mm4, (%2) \n\t" | ||
209 | "movq %%mm5, (%2, %3) \n\t" | ||
210 | "movq 8(%1), %%mm0 \n\t" | ||
211 | "movq 9(%1), %%mm1 \n\t" | ||
212 | "movq 8(%1, %3), %%mm2 \n\t" | ||
213 | "movq 9(%1, %3), %%mm3 \n\t" | ||
214 | PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | ||
215 | "movq %%mm4, 8(%2) \n\t" | ||
216 | "movq %%mm5, 8(%2, %3) \n\t" | ||
217 | "add %%"FF_REG_a", %1 \n\t" | ||
218 | "add %%"FF_REG_a", %2 \n\t" | ||
219 | "movq (%1), %%mm0 \n\t" | ||
220 | "movq 1(%1), %%mm1 \n\t" | ||
221 | "movq (%1, %3), %%mm2 \n\t" | ||
222 | "movq 1(%1, %3), %%mm3 \n\t" | ||
223 | PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | ||
224 | "movq %%mm4, (%2) \n\t" | ||
225 | "movq %%mm5, (%2, %3) \n\t" | ||
226 | "movq 8(%1), %%mm0 \n\t" | ||
227 | "movq 9(%1), %%mm1 \n\t" | ||
228 | "movq 8(%1, %3), %%mm2 \n\t" | ||
229 | "movq 9(%1, %3), %%mm3 \n\t" | ||
230 | PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | ||
231 | "movq %%mm4, 8(%2) \n\t" | ||
232 | "movq %%mm5, 8(%2, %3) \n\t" | ||
233 | "add %%"FF_REG_a", %1 \n\t" | ||
234 | "add %%"FF_REG_a", %2 \n\t" | ||
235 | "subl $4, %0 \n\t" | ||
236 | "jnz 1b \n\t" | ||
237 | :"+g"(h), "+S"(pixels), "+D"(block) | ||
238 | :"r"((x86_reg)line_size) | ||
239 | :FF_REG_a, "memory"); | ||
240 | 1096 | } | |
241 | |||
242 | 2954 | static void put_no_rnd_pixels8_y2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |
243 | { | ||
244 | 2954 | MOVQ_BFE(mm6); | |
245 | 2954 | __asm__ volatile( | |
246 | "lea (%3, %3), %%"FF_REG_a" \n\t" | ||
247 | "movq (%1), %%mm0 \n\t" | ||
248 | ".p2align 3 \n\t" | ||
249 | "1: \n\t" | ||
250 | "movq (%1, %3), %%mm1 \n\t" | ||
251 | "movq (%1, %%"FF_REG_a"),%%mm2\n\t" | ||
252 | PAVGBP_MMX_NO_RND(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) | ||
253 | "movq %%mm4, (%2) \n\t" | ||
254 | "movq %%mm5, (%2, %3) \n\t" | ||
255 | "add %%"FF_REG_a", %1 \n\t" | ||
256 | "add %%"FF_REG_a", %2 \n\t" | ||
257 | "movq (%1, %3), %%mm1 \n\t" | ||
258 | "movq (%1, %%"FF_REG_a"),%%mm0\n\t" | ||
259 | PAVGBP_MMX_NO_RND(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) | ||
260 | "movq %%mm4, (%2) \n\t" | ||
261 | "movq %%mm5, (%2, %3) \n\t" | ||
262 | "add %%"FF_REG_a", %1 \n\t" | ||
263 | "add %%"FF_REG_a", %2 \n\t" | ||
264 | "subl $4, %0 \n\t" | ||
265 | "jnz 1b \n\t" | ||
266 | :"+g"(h), "+S"(pixels), "+D"(block) | ||
267 | :"r"((x86_reg)line_size) | ||
268 | :FF_REG_a, "memory"); | ||
269 | 2954 | } | |
270 | |||
271 | ✗ | static void avg_no_rnd_pixels16_x2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |
272 | { | ||
273 | ✗ | MOVQ_BFE(mm6); | |
274 | ✗ | __asm__ volatile( | |
275 | ".p2align 3 \n\t" | ||
276 | "1: \n\t" | ||
277 | "movq (%1), %%mm0 \n\t" | ||
278 | "movq 1(%1), %%mm1 \n\t" | ||
279 | "movq (%2), %%mm3 \n\t" | ||
280 | PAVGB_MMX_NO_RND(%%mm0, %%mm1, %%mm2, %%mm6) | ||
281 | PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6) | ||
282 | "movq %%mm0, (%2) \n\t" | ||
283 | "movq 8(%1), %%mm0 \n\t" | ||
284 | "movq 9(%1), %%mm1 \n\t" | ||
285 | "movq 8(%2), %%mm3 \n\t" | ||
286 | PAVGB_MMX_NO_RND(%%mm0, %%mm1, %%mm2, %%mm6) | ||
287 | PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6) | ||
288 | "movq %%mm0, 8(%2) \n\t" | ||
289 | "add %3, %1 \n\t" | ||
290 | "add %3, %2 \n\t" | ||
291 | "subl $1, %0 \n\t" | ||
292 | "jnz 1b \n\t" | ||
293 | :"+g"(h), "+S"(pixels), "+D"(block) | ||
294 | :"r"((x86_reg)line_size) | ||
295 | :"memory"); | ||
296 | ✗ | } | |
297 | |||
298 | ✗ | static void avg_no_rnd_pixels8_y2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |
299 | { | ||
300 | ✗ | MOVQ_BFE(mm6); | |
301 | ✗ | __asm__ volatile( | |
302 | "lea (%3, %3), %%"FF_REG_a" \n\t" | ||
303 | "movq (%1), %%mm0 \n\t" | ||
304 | ".p2align 3 \n\t" | ||
305 | "1: \n\t" | ||
306 | "movq (%1, %3), %%mm1 \n\t" | ||
307 | "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" | ||
308 | PAVGBP_MMX_NO_RND(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) | ||
309 | "movq (%2), %%mm3 \n\t" | ||
310 | PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6) | ||
311 | "movq (%2, %3), %%mm3 \n\t" | ||
312 | PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6) | ||
313 | "movq %%mm0, (%2) \n\t" | ||
314 | "movq %%mm1, (%2, %3) \n\t" | ||
315 | "add %%"FF_REG_a", %1 \n\t" | ||
316 | "add %%"FF_REG_a", %2 \n\t" | ||
317 | |||
318 | "movq (%1, %3), %%mm1 \n\t" | ||
319 | "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" | ||
320 | PAVGBP_MMX_NO_RND(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) | ||
321 | "movq (%2), %%mm3 \n\t" | ||
322 | PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6) | ||
323 | "movq (%2, %3), %%mm3 \n\t" | ||
324 | PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6) | ||
325 | "movq %%mm2, (%2) \n\t" | ||
326 | "movq %%mm1, (%2, %3) \n\t" | ||
327 | "add %%"FF_REG_a", %1 \n\t" | ||
328 | "add %%"FF_REG_a", %2 \n\t" | ||
329 | |||
330 | "subl $4, %0 \n\t" | ||
331 | "jnz 1b \n\t" | ||
332 | :"+g"(h), "+S"(pixels), "+D"(block) | ||
333 | :"r"((x86_reg)line_size) | ||
334 | :FF_REG_a, "memory"); | ||
335 | ✗ | } | |
336 | |||
337 | #if HAVE_MMX | ||
338 | ✗ | CALL_2X_PIXELS(avg_no_rnd_pixels16_y2_mmx, avg_no_rnd_pixels8_y2_mmx, 8) | |
339 | 1477 | CALL_2X_PIXELS(put_no_rnd_pixels16_y2_mmx, put_no_rnd_pixels8_y2_mmx, 8) | |
340 | |||
341 | ✗ | CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8) | |
342 | 10118 | CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8) | |
343 | #endif | ||
344 | |||
345 | /***********************************/ | ||
346 | /* MMX rounding */ | ||
347 | |||
348 | #define SET_RND MOVQ_WTWO | ||
349 | #define DEF(x, y) ff_ ## x ## _ ## y ## _mmx | ||
350 | #define STATIC | ||
351 | |||
352 | #include "rnd_template.c" | ||
353 | |||
354 | #undef NO_AVG | ||
355 | #undef DEF | ||
356 | #undef SET_RND | ||
357 | |||
358 | #if HAVE_MMX | ||
359 | ✗ | CALL_2X_PIXELS(put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8) | |
360 | #endif | ||
361 | |||
362 | #endif /* HAVE_INLINE_ASM */ | ||
363 | |||
364 | |||
365 | #if HAVE_X86ASM | ||
366 | |||
367 | #define HPELDSP_AVG_PIXELS16(CPUEXT) \ | ||
368 | CALL_2X_PIXELS(put_no_rnd_pixels16_x2 ## CPUEXT, ff_put_no_rnd_pixels8_x2 ## CPUEXT, 8) \ | ||
369 | CALL_2X_PIXELS(put_pixels16_y2 ## CPUEXT, ff_put_pixels8_y2 ## CPUEXT, 8) \ | ||
370 | CALL_2X_PIXELS(put_no_rnd_pixels16_y2 ## CPUEXT, ff_put_no_rnd_pixels8_y2 ## CPUEXT, 8) \ | ||
371 | CALL_2X_PIXELS(avg_pixels16_x2 ## CPUEXT, ff_avg_pixels8_x2 ## CPUEXT, 8) \ | ||
372 | CALL_2X_PIXELS(avg_pixels16_y2 ## CPUEXT, ff_avg_pixels8_y2 ## CPUEXT, 8) \ | ||
373 | CALL_2X_PIXELS(avg_pixels16_xy2 ## CPUEXT, ff_avg_pixels8_xy2 ## CPUEXT, 8) \ | ||
374 | CALL_2X_PIXELS(avg_approx_pixels16_xy2## CPUEXT, ff_avg_approx_pixels8_xy2## CPUEXT, 8) | ||
375 | |||
376 | 17364 | HPELDSP_AVG_PIXELS16(_mmxext) | |
377 | |||
378 | #endif /* HAVE_X86ASM */ | ||
379 | |||
380 | #define SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU) \ | ||
381 | if (HAVE_MMX_EXTERNAL) \ | ||
382 | c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU | ||
383 | |||
384 | #define SET_HPEL_FUNCS03(PFX, IDX, SIZE, CPU) \ | ||
385 | do { \ | ||
386 | SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU); \ | ||
387 | c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \ | ||
388 | } while (0) | ||
389 | #define SET_HPEL_FUNCS12(PFX, IDX, SIZE, CPU) \ | ||
390 | do { \ | ||
391 | c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \ | ||
392 | c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \ | ||
393 | } while (0) | ||
394 | #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \ | ||
395 | do { \ | ||
396 | SET_HPEL_FUNCS03(PFX, IDX, SIZE, CPU); \ | ||
397 | SET_HPEL_FUNCS12(PFX, IDX, SIZE, CPU); \ | ||
398 | } while (0) | ||
399 | |||
400 | 190 | static void hpeldsp_init_mmx(HpelDSPContext *c, int flags) | |
401 | { | ||
402 | #if HAVE_MMX_INLINE | ||
403 | 190 | SET_HPEL_FUNCS03(put, [0], 16, mmx); | |
404 | 190 | SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx); | |
405 | 190 | SET_HPEL_FUNCS12(avg_no_rnd, , 16, mmx); | |
406 | 190 | c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_mmx; | |
407 | 190 | SET_HPEL_FUNCS03(put, [1], 8, mmx); | |
408 | 190 | SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx); | |
409 | #endif | ||
410 | 190 | } | |
411 | |||
412 | 190 | static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags) | |
413 | { | ||
414 | #if HAVE_MMXEXT_EXTERNAL | ||
415 | 190 | c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext; | |
416 | 190 | c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext; | |
417 | |||
418 | 190 | c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext; | |
419 | 190 | c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext; | |
420 | 190 | c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext; | |
421 | 190 | c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext; | |
422 | |||
423 | 190 | c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext; | |
424 | 190 | c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext; | |
425 | |||
426 | 190 | c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext; | |
427 | 190 | c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext; | |
428 | 190 | c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext; | |
429 | 190 | c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext; | |
430 | |||
431 | 190 | c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext; | |
432 | 190 | c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext; | |
433 | |||
434 | 190 | c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_mmxext; | |
435 | |||
436 | 2/2 ✓ Branch 0 taken 173 times. ✓ Branch 1 taken 17 times. | 190 | if (!(flags & AV_CODEC_FLAG_BITEXACT)) { |
437 | 173 | c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext; | |
438 | 173 | c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext; | |
439 | 173 | c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext; | |
440 | 173 | c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext; | |
441 | |||
442 | 173 | c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_mmxext; | |
443 | 173 | c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_mmxext; | |
444 | } | ||
445 | #endif /* HAVE_MMXEXT_EXTERNAL */ | ||
446 | 190 | } | |
447 | |||
448 | 190 | static void hpeldsp_init_sse2_fast(HpelDSPContext *c, int flags) | |
449 | { | ||
450 | #if HAVE_SSE2_EXTERNAL | ||
451 | 190 | c->put_pixels_tab[0][0] = ff_put_pixels16_sse2; | |
452 | 190 | c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2; | |
453 | 190 | c->put_pixels_tab[0][1] = ff_put_pixels16_x2_sse2; | |
454 | 190 | c->put_pixels_tab[0][2] = ff_put_pixels16_y2_sse2; | |
455 | 190 | c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_sse2; | |
456 | 190 | c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2; | |
457 | 190 | c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2; | |
458 | 190 | c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2; | |
459 | 190 | c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_sse2; | |
460 | 190 | c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_sse2; | |
461 | #endif /* HAVE_SSE2_EXTERNAL */ | ||
462 | 190 | } | |
463 | |||
464 | 190 | static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags) | |
465 | { | ||
466 | #if HAVE_SSSE3_EXTERNAL | ||
467 | 190 | c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_ssse3; | |
468 | 190 | c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_ssse3; | |
469 | 190 | c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_ssse3; | |
470 | 190 | c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_ssse3; | |
471 | #endif | ||
472 | 190 | } | |
473 | |||
474 | 1134 | av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags) | |
475 | { | ||
476 | 1134 | int cpu_flags = av_get_cpu_flags(); | |
477 | |||
478 | 2/2 ✓ Branch 0 taken 190 times. ✓ Branch 1 taken 944 times. | 1134 | if (INLINE_MMX(cpu_flags)) |
479 | 190 | hpeldsp_init_mmx(c, flags); | |
480 | |||
481 | 2/2 ✓ Branch 0 taken 190 times. ✓ Branch 1 taken 944 times. | 1134 | if (EXTERNAL_MMXEXT(cpu_flags)) |
482 | 190 | hpeldsp_init_mmxext(c, flags); | |
483 | |||
484 | 3/4 ✓ Branch 0 taken 190 times. ✓ Branch 1 taken 944 times. ✓ Branch 2 taken 190 times. ✗ Branch 3 not taken. | 1134 | if (EXTERNAL_SSE2_FAST(cpu_flags)) |
485 | 190 | hpeldsp_init_sse2_fast(c, flags); | |
486 | |||
487 | 2/2 ✓ Branch 0 taken 190 times. ✓ Branch 1 taken 944 times. | 1134 | if (EXTERNAL_SSSE3(cpu_flags)) |
488 | 190 | hpeldsp_init_ssse3(c, flags); | |
489 | 1134 | } | |
490 |
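
The SET_HPEL_FUNCS_EXT / SET_HPEL_FUNCS03 / SET_HPEL_FUNCS12 / SET_HPEL_FUNCS macros in the listing (source lines 380-398) fill the context's function-pointer tables by token-pasting the prefix, block size, interpolation suffix, and CPU suffix into kernel names. The sketch below is a minimal, self-contained illustration of that expansion; `DemoHpelCtx`, `DEMO_KERNEL`, and the dummy kernel bodies are hypothetical stand-ins, not part of `HpelDSPContext` or of the covered file, and the `HAVE_MMX_EXTERNAL` guard is omitted.

```c
/* Minimal sketch (hypothetical names): how SET_HPEL_FUNCS(put, [0], 16, mmx)
 * expands into function-pointer table assignments. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef void (*op_pixels_func)(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);

typedef struct DemoHpelCtx {
    /* first index: 0 = 16x16 blocks, 1 = 8x8 blocks;
     * second index: 0 = copy, 1 = x2, 2 = y2, 3 = xy2 half-pel case */
    op_pixels_func put_pixels_tab[2][4];
} DemoHpelCtx;

/* Dummy stand-ins for the real MMX kernels; they only report being called. */
#define DEMO_KERNEL(name)                                               \
    static void name(uint8_t *block, const uint8_t *pixels,             \
                     ptrdiff_t line_size, int h)                        \
    { (void)block; (void)pixels; (void)line_size;                       \
      printf("%s called, h=%d\n", #name, h); }

DEMO_KERNEL(put_pixels16_mmx)
DEMO_KERNEL(put_pixels16_x2_mmx)
DEMO_KERNEL(put_pixels16_y2_mmx)
DEMO_KERNEL(put_pixels16_xy2_mmx)

/* Same token-pasting scheme as SET_HPEL_FUNCS_EXT/SET_HPEL_FUNCS03/12 in the
 * listing, collapsed into one macro. */
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
    do {                                                                        \
        c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _     ## CPU; \
        c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU; \
        c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU; \
        c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
    } while (0)

int main(void)
{
    DemoHpelCtx ctx, *c = &ctx;
    uint8_t dst[16 * 16];
    uint8_t src[17 * 17] = { 0 };

    SET_HPEL_FUNCS(put, [0], 16, mmx);   /* fills c->put_pixels_tab[0][0..3] */

    /* A caller would later dispatch through the table, e.g. the xy2 slot: */
    c->put_pixels_tab[0][3](dst, src, 17, 16);
    return 0;
}
```

Running the sketch prints `put_pixels16_xy2_mmx called, h=16`, which mirrors how code using the real context calls through `put_pixels_tab` after `ff_hpeldsp_init_x86()` has installed the CPU-specific kernels.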