Line | Branch | Exec | Source |
---|---|---|---|
1 | /* | ||
2 | * SIMD-optimized halfpel functions | ||
3 | * Copyright (c) 2000, 2001 Fabrice Bellard | ||
4 | * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> | ||
5 | * | ||
6 | * This file is part of FFmpeg. | ||
7 | * | ||
8 | * FFmpeg is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU Lesser General Public | ||
10 | * License as published by the Free Software Foundation; either | ||
11 | * version 2.1 of the License, or (at your option) any later version. | ||
12 | * | ||
13 | * FFmpeg is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
16 | * Lesser General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU Lesser General Public | ||
19 | * License along with FFmpeg; if not, write to the Free Software | ||
20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
21 | * | ||
22 | * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | ||
23 | */ | ||
24 | |||
25 | #include <stddef.h> | ||
26 | #include <stdint.h> | ||
27 | |||
28 | #include "libavutil/attributes.h" | ||
29 | #include "libavutil/cpu.h" | ||
30 | #include "libavutil/x86/cpu.h" | ||
31 | #include "libavcodec/avcodec.h" | ||
32 | #include "libavcodec/hpeldsp.h" | ||
33 | #include "libavcodec/pixels.h" | ||
34 | #include "fpel.h" | ||
35 | #include "hpeldsp.h" | ||
36 | #include "inline_asm.h" | ||
37 | |||
38 | void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, | ||
39 | ptrdiff_t line_size, int h); | ||
40 | void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels, | ||
41 | ptrdiff_t line_size, int h); | ||
42 | void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels, | ||
43 | ptrdiff_t line_size, int h); | ||
44 | void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, | ||
45 | ptrdiff_t line_size, int h); | ||
46 | void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, | ||
47 | ptrdiff_t line_size, int h); | ||
48 | void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, | ||
49 | ptrdiff_t line_size, int h); | ||
50 | void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block, | ||
51 | const uint8_t *pixels, | ||
52 | ptrdiff_t line_size, int h); | ||
53 | void ff_put_no_rnd_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels, | ||
54 | ptrdiff_t line_size, int h); | ||
55 | void ff_avg_no_rnd_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels, | ||
56 | ptrdiff_t line_size, int h); | ||
57 | void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, | ||
58 | ptrdiff_t line_size, int h); | ||
59 | void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, | ||
60 | ptrdiff_t line_size, int h); | ||
61 | void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block, | ||
62 | const uint8_t *pixels, | ||
63 | ptrdiff_t line_size, int h); | ||
64 | void ff_put_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, | ||
65 | ptrdiff_t line_size, int h); | ||
66 | void ff_avg_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, | ||
67 | ptrdiff_t line_size, int h); | ||
68 | void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, | ||
69 | ptrdiff_t line_size, int h); | ||
70 | void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, | ||
71 | ptrdiff_t line_size, int h); | ||
72 | |||
73 | #if HAVE_INLINE_ASM | ||
74 | |||
75 | /***********************************/ | ||
76 | /* MMX no rounding */ | ||
77 | |||
78 | // put_pixels | ||
79 | 72222 | static void put_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels, | |
80 | ptrdiff_t line_size, int h) | ||
81 | { | ||
82 | 72222 | MOVQ_ZERO(mm7); | |
83 | 72222 | MOVQ_WONE(mm6); // =1 for no_rnd version | |
84 | 72222 | __asm__ volatile( | |
85 | "movq (%1), %%mm0 \n\t" | ||
86 | "movq 1(%1), %%mm4 \n\t" | ||
87 | "movq %%mm0, %%mm1 \n\t" | ||
88 | "movq %%mm4, %%mm5 \n\t" | ||
89 | "punpcklbw %%mm7, %%mm0 \n\t" | ||
90 | "punpcklbw %%mm7, %%mm4 \n\t" | ||
91 | "punpckhbw %%mm7, %%mm1 \n\t" | ||
92 | "punpckhbw %%mm7, %%mm5 \n\t" | ||
93 | "paddusw %%mm0, %%mm4 \n\t" | ||
94 | "paddusw %%mm1, %%mm5 \n\t" | ||
95 | "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" | ||
96 | "add %3, %1 \n\t" | ||
97 | ".p2align 3 \n\t" | ||
98 | "1: \n\t" | ||
99 | "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" | ||
100 | "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t" | ||
101 | "movq %%mm0, %%mm1 \n\t" | ||
102 | "movq %%mm2, %%mm3 \n\t" | ||
103 | "punpcklbw %%mm7, %%mm0 \n\t" | ||
104 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
105 | "punpckhbw %%mm7, %%mm1 \n\t" | ||
106 | "punpckhbw %%mm7, %%mm3 \n\t" | ||
107 | "paddusw %%mm2, %%mm0 \n\t" | ||
108 | "paddusw %%mm3, %%mm1 \n\t" | ||
109 | "paddusw %%mm6, %%mm4 \n\t" | ||
110 | "paddusw %%mm6, %%mm5 \n\t" | ||
111 | "paddusw %%mm0, %%mm4 \n\t" | ||
112 | "paddusw %%mm1, %%mm5 \n\t" | ||
113 | "psrlw $2, %%mm4 \n\t" | ||
114 | "psrlw $2, %%mm5 \n\t" | ||
115 | "packuswb %%mm5, %%mm4 \n\t" | ||
116 | "movq %%mm4, (%2, %%"FF_REG_a") \n\t" | ||
117 | "add %3, %%"FF_REG_a" \n\t" | ||
118 | |||
119 | "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 | ||
120 | "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t" | ||
121 | "movq %%mm2, %%mm3 \n\t" | ||
122 | "movq %%mm4, %%mm5 \n\t" | ||
123 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
124 | "punpcklbw %%mm7, %%mm4 \n\t" | ||
125 | "punpckhbw %%mm7, %%mm3 \n\t" | ||
126 | "punpckhbw %%mm7, %%mm5 \n\t" | ||
127 | "paddusw %%mm2, %%mm4 \n\t" | ||
128 | "paddusw %%mm3, %%mm5 \n\t" | ||
129 | "paddusw %%mm6, %%mm0 \n\t" | ||
130 | "paddusw %%mm6, %%mm1 \n\t" | ||
131 | "paddusw %%mm4, %%mm0 \n\t" | ||
132 | "paddusw %%mm5, %%mm1 \n\t" | ||
133 | "psrlw $2, %%mm0 \n\t" | ||
134 | "psrlw $2, %%mm1 \n\t" | ||
135 | "packuswb %%mm1, %%mm0 \n\t" | ||
136 | "movq %%mm0, (%2, %%"FF_REG_a") \n\t" | ||
137 | "add %3, %%"FF_REG_a" \n\t" | ||
138 | |||
139 | "subl $2, %0 \n\t" | ||
140 | "jnz 1b \n\t" | ||
141 | :"+g"(h), "+S"(pixels) | ||
142 | :"D"(block), "r"((x86_reg)line_size) | ||
143 | :FF_REG_a, "memory"); | ||
144 | 72222 | } | |
145 | |||
146 | // this routine is 'slightly' suboptimal but mostly unused | ||
147 | 2 | static void avg_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels, | |
148 | ptrdiff_t line_size, int h) | ||
149 | { | ||
150 | 2 | MOVQ_ZERO(mm7); | |
151 | 2 | MOVQ_WONE(mm6); // =2 for rnd and =1 for no_rnd version | |
152 | 2 | __asm__ volatile( | |
153 | "movq (%1), %%mm0 \n\t" | ||
154 | "movq 1(%1), %%mm4 \n\t" | ||
155 | "movq %%mm0, %%mm1 \n\t" | ||
156 | "movq %%mm4, %%mm5 \n\t" | ||
157 | "punpcklbw %%mm7, %%mm0 \n\t" | ||
158 | "punpcklbw %%mm7, %%mm4 \n\t" | ||
159 | "punpckhbw %%mm7, %%mm1 \n\t" | ||
160 | "punpckhbw %%mm7, %%mm5 \n\t" | ||
161 | "paddusw %%mm0, %%mm4 \n\t" | ||
162 | "paddusw %%mm1, %%mm5 \n\t" | ||
163 | "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" | ||
164 | "add %3, %1 \n\t" | ||
165 | ".p2align 3 \n\t" | ||
166 | "1: \n\t" | ||
167 | "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" | ||
168 | "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t" | ||
169 | "movq %%mm0, %%mm1 \n\t" | ||
170 | "movq %%mm2, %%mm3 \n\t" | ||
171 | "punpcklbw %%mm7, %%mm0 \n\t" | ||
172 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
173 | "punpckhbw %%mm7, %%mm1 \n\t" | ||
174 | "punpckhbw %%mm7, %%mm3 \n\t" | ||
175 | "paddusw %%mm2, %%mm0 \n\t" | ||
176 | "paddusw %%mm3, %%mm1 \n\t" | ||
177 | "paddusw %%mm6, %%mm4 \n\t" | ||
178 | "paddusw %%mm6, %%mm5 \n\t" | ||
179 | "paddusw %%mm0, %%mm4 \n\t" | ||
180 | "paddusw %%mm1, %%mm5 \n\t" | ||
181 | "psrlw $2, %%mm4 \n\t" | ||
182 | "psrlw $2, %%mm5 \n\t" | ||
183 | "movq (%2, %%"FF_REG_a"), %%mm3 \n\t" | ||
184 | "packuswb %%mm5, %%mm4 \n\t" | ||
185 | "pcmpeqd %%mm2, %%mm2 \n\t" | ||
186 | "paddb %%mm2, %%mm2 \n\t" | ||
187 | PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2) | ||
188 | "movq %%mm5, (%2, %%"FF_REG_a") \n\t" | ||
189 | "add %3, %%"FF_REG_a" \n\t" | ||
190 | |||
191 | "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 | ||
192 | "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t" | ||
193 | "movq %%mm2, %%mm3 \n\t" | ||
194 | "movq %%mm4, %%mm5 \n\t" | ||
195 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
196 | "punpcklbw %%mm7, %%mm4 \n\t" | ||
197 | "punpckhbw %%mm7, %%mm3 \n\t" | ||
198 | "punpckhbw %%mm7, %%mm5 \n\t" | ||
199 | "paddusw %%mm2, %%mm4 \n\t" | ||
200 | "paddusw %%mm3, %%mm5 \n\t" | ||
201 | "paddusw %%mm6, %%mm0 \n\t" | ||
202 | "paddusw %%mm6, %%mm1 \n\t" | ||
203 | "paddusw %%mm4, %%mm0 \n\t" | ||
204 | "paddusw %%mm5, %%mm1 \n\t" | ||
205 | "psrlw $2, %%mm0 \n\t" | ||
206 | "psrlw $2, %%mm1 \n\t" | ||
207 | "movq (%2, %%"FF_REG_a"), %%mm3 \n\t" | ||
208 | "packuswb %%mm1, %%mm0 \n\t" | ||
209 | "pcmpeqd %%mm2, %%mm2 \n\t" | ||
210 | "paddb %%mm2, %%mm2 \n\t" | ||
211 | PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2) | ||
212 | "movq %%mm1, (%2, %%"FF_REG_a") \n\t" | ||
213 | "add %3, %%"FF_REG_a" \n\t" | ||
214 | |||
215 | "subl $2, %0 \n\t" | ||
216 | "jnz 1b \n\t" | ||
217 | :"+g"(h), "+S"(pixels) | ||
218 | :"D"(block), "r"((x86_reg)line_size) | ||
219 | :FF_REG_a, "memory"); | ||
220 | 2 | } | |
221 | |||
222 | #if HAVE_MMX | ||
223 | 1 | CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8) | |
224 | 10119 | CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8) | |
225 | #endif | ||
226 | #endif /* HAVE_INLINE_ASM */ | ||
227 | |||
228 | 171 | static void hpeldsp_init_mmx(HpelDSPContext *c, int flags) | |
229 | { | ||
230 | #if HAVE_MMX_INLINE | ||
231 | 171 | c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx; | |
232 | 171 | c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx; | |
233 | 171 | c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_mmx; | |
234 | #endif | ||
235 | #if HAVE_MMX_EXTERNAL | ||
236 | 171 | c->put_no_rnd_pixels_tab[1][0] = | |
237 | 171 | c->put_pixels_tab[1][0] = ff_put_pixels8_mmx; | |
238 | #endif | ||
239 | 171 | } | |
240 | |||
241 | 170 | static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags) | |
242 | { | ||
243 | #if HAVE_MMXEXT_EXTERNAL | ||
244 | 170 | c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext; | |
245 | 170 | c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext; | |
246 | |||
247 | 170 | c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext; | |
248 | 170 | c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext; | |
249 | 170 | c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext; | |
250 | |||
251 | 170 | c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext; | |
252 | 170 | c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext; | |
253 | |||
254 | 2/2 | 170 | if (!(flags & AV_CODEC_FLAG_BITEXACT)) { |
255 | 142 | c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext; | |
256 | 142 | c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext; | |
257 | } | ||
258 | #endif /* HAVE_MMXEXT_EXTERNAL */ | ||
259 | 170 | } | |
260 | |||
261 | 168 | static void hpeldsp_init_sse2(HpelDSPContext *c, int flags) | |
262 | { | ||
263 | #if HAVE_SSE2_EXTERNAL | ||
264 | 168 | c->put_pixels_tab[0][0] = ff_put_pixels16_sse2; | |
265 | 168 | c->put_pixels_tab[0][1] = ff_put_pixels16_x2_sse2; | |
266 | 168 | c->put_pixels_tab[0][2] = ff_put_pixels16_y2_sse2; | |
267 | 168 | c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_sse2; | |
268 | |||
269 | 168 | c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2; | |
270 | 168 | c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_sse2; | |
271 | 168 | c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_sse2; | |
272 | |||
273 | 168 | c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2; | |
274 | 168 | c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2; | |
275 | 168 | c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2; | |
276 | 168 | c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_sse2; | |
277 | |||
278 | 168 | c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_sse2; | |
279 | 168 | c->avg_no_rnd_pixels_tab[1] = ff_avg_no_rnd_pixels16_x2_sse2; | |
280 | 168 | c->avg_no_rnd_pixels_tab[2] = ff_avg_no_rnd_pixels16_y2_sse2; | |
281 | #endif /* HAVE_SSE2_EXTERNAL */ | ||
282 | 168 | } | |
283 | |||
284 | 166 | static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags) | |
285 | { | ||
286 | #if HAVE_SSSE3_EXTERNAL | ||
287 | 166 | c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_ssse3; | |
288 | 166 | c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_ssse3; | |
289 | 166 | c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_ssse3; | |
290 | 166 | c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_ssse3; | |
291 | #endif | ||
292 | 166 | } | |
293 | |||
294 | 941 | av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags) | |
295 | { | ||
296 | 941 | int cpu_flags = av_get_cpu_flags(); | |
297 | |||
298 | 2/2 | 941 | if (INLINE_MMX(cpu_flags)) |
299 | 171 | hpeldsp_init_mmx(c, flags); | |
300 | |||
301 | 2/2 | 941 | if (EXTERNAL_MMXEXT(cpu_flags)) |
302 | 170 | hpeldsp_init_mmxext(c, flags); | |
303 | |||
304 | 2/2 | 941 | if (EXTERNAL_SSE2(cpu_flags)) |
305 | 168 | hpeldsp_init_sse2(c, flags); | |
306 | |||
307 | 2/2 | 941 | if (EXTERNAL_SSSE3(cpu_flags)) |
308 | 166 | hpeldsp_init_ssse3(c, flags); | |
309 | 941 | } | |
310 |
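
For readers following the inline asm, here is a minimal scalar sketch of what put_no_rnd_pixels8_xy2_mmx computes: the average of each pixel's 2x2 neighbourhood with a rounding constant of 1 (the rounded variant would add 2 before the shift). The helper name is hypothetical; FFmpeg's C reference lives in the generic hpeldsp code, not in this file.

```c
#include <stddef.h>
#include <stdint.h>

/* Hypothetical scalar equivalent of put_no_rnd_pixels8_xy2_mmx:
 * (a + b + c + d + 1) >> 2 over the 2x2 neighbourhood, matching the
 * mm6 = 1 rounding constant loaded by MOVQ_WONE above. */
static void put_no_rnd_pixels8_xy2_sketch(uint8_t *block, const uint8_t *pixels,
                                          ptrdiff_t line_size, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++) {
            int a = pixels[x];
            int b = pixels[x + 1];
            int c = pixels[x + line_size];
            int d = pixels[x + line_size + 1];
            block[x] = (a + b + c + d + 1) >> 2;
        }
        pixels += line_size;
        block  += line_size;
    }
}
```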
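avg_no_rnd_pixels8_xy2_mmx additionally averages the interpolated value with the destination through the PAVGB_MMX macro (defined in the included inline-asm headers, not shown in this file). The pcmpeqd/paddb pair builds a 0xFE mask in every byte, which enables the classic carry-free byte average; assuming the macro follows FFmpeg's no_rnd_avg convention, the per-byte identity is:

```c
#include <stdint.h>

/* Carry-free byte average, rounding down: (a & b) + (((a ^ b) & 0xFE) >> 1)
 * equals (a + b) >> 1. The 0xFE mask clears the bit that would otherwise be
 * shifted across a byte boundary when this is applied to packed bytes in an
 * MMX register. Assumption: PAVGB_MMX implements exactly this identity. */
static inline uint8_t no_rnd_avg_u8(uint8_t a, uint8_t b)
{
    return (uint8_t)((a & b) + (((a ^ b) & 0xFE) >> 1));
}
```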
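The CALL_2X_PIXELS invocations build the 16-pixel-wide wrappers registered in hpeldsp_init_mmx from the 8-pixel kernels. A sketch of what such a wrapper amounts to, assuming the macro (from libavcodec/pixels.h) simply runs the narrow kernel on the left and right 8-byte halves:

```c
/* Sketch of the assumed expansion of
 *   CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
 * i.e. the 8-wide kernel applied twice, offset by 8 bytes. */
static void put_no_rnd_pixels16_xy2_sketch(uint8_t *block, const uint8_t *pixels,
                                           ptrdiff_t line_size, int h)
{
    put_no_rnd_pixels8_xy2_mmx(block,     pixels,     line_size, h);
    put_no_rnd_pixels8_xy2_mmx(block + 8, pixels + 8, line_size, h);
}
```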
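Finally, the init functions fill the HpelDSPContext function-pointer tables that codecs call through. Per the documentation in libavcodec/hpeldsp.h, the first index selects the block width (0 = 16 pixels, 1 = 8 pixels) and the second the half-pel position dxy = xhalfpel + 2*yhalfpel. A usage sketch, where put_block8 is a hypothetical caller-side helper rather than FFmpeg API:

```c
#include <stddef.h>
#include <stdint.h>
#include "libavcodec/hpeldsp.h"

/* Hypothetical helper: pick the 8-pixel-wide "put" function for a given
 * half-pel position. dxy = 0 full-pel copy, 1 = x half-pel,
 * 2 = y half-pel, 3 = x+y half-pel. */
static void put_block8(const HpelDSPContext *c, uint8_t *dst,
                       const uint8_t *src, ptrdiff_t stride, int dxy, int h)
{
    c->put_pixels_tab[1][dxy](dst, src, stride, h);
}
```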