1 |
|
|
/* |
2 |
|
|
* SIMD-optimized halfpel functions |
3 |
|
|
* Copyright (c) 2000, 2001 Fabrice Bellard |
4 |
|
|
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
5 |
|
|
* |
6 |
|
|
* This file is part of FFmpeg. |
7 |
|
|
* |
8 |
|
|
* FFmpeg is free software; you can redistribute it and/or |
9 |
|
|
* modify it under the terms of the GNU Lesser General Public |
10 |
|
|
* License as published by the Free Software Foundation; either |
11 |
|
|
* version 2.1 of the License, or (at your option) any later version. |
12 |
|
|
* |
13 |
|
|
* FFmpeg is distributed in the hope that it will be useful, |
14 |
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 |
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
16 |
|
|
* Lesser General Public License for more details. |
17 |
|
|
* |
18 |
|
|
* You should have received a copy of the GNU Lesser General Public |
19 |
|
|
* License along with FFmpeg; if not, write to the Free Software |
20 |
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
21 |
|
|
* |
22 |
|
|
* MMX optimization by Nick Kurshev <nickols_k@mail.ru> |
23 |
|
|
*/ |
24 |
|
|
|
25 |
|
|
#include "libavutil/attributes.h" |
26 |
|
|
#include "libavutil/cpu.h" |
27 |
|
|
#include "libavutil/x86/cpu.h" |
28 |
|
|
#include "libavcodec/avcodec.h" |
29 |
|
|
#include "libavcodec/hpeldsp.h" |
30 |
|
|
#include "libavcodec/pixels.h" |
31 |
|
|
#include "fpel.h" |
32 |
|
|
#include "hpeldsp.h" |
33 |
|
|
|
34 |
|
|
/*
 * Prototypes of half-pel put/avg routines that are referenced below but not
 * defined in this file. Every routine has the same signature:
 * (block, pixels, line_size, h).
 */

/* SSE2: 16-pixel-wide variants */
void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);

/* MMXEXT */
void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h);

/* 3DNow! */
void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_approx_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
78 |
|
|
|
79 |
|
|
/*
 * Unprefixed aliases for the ff_-prefixed MMX routines, so that names pasted
 * together by SET_HPEL_FUNCS() (PFX ## _pixels ## SIZE ## ... ## CPU)
 * resolve to the exported symbols.
 */
#define put_pixels8_mmx         ff_put_pixels8_mmx
#define put_pixels16_mmx        ff_put_pixels16_mmx
#define put_pixels8_xy2_mmx     ff_put_pixels8_xy2_mmx
#define put_pixels16_xy2_mmx    ff_put_pixels16_xy2_mmx

#define avg_pixels8_mmx         ff_avg_pixels8_mmx
#define avg_pixels16_mmx        ff_avg_pixels16_mmx
#define avg_pixels8_x2_mmx      ff_avg_pixels8_x2_mmx
#define avg_pixels8_xy2_mmx     ff_avg_pixels8_xy2_mmx
#define avg_pixels16_xy2_mmx    ff_avg_pixels16_xy2_mmx

/* The full-pel "no_rnd" names map onto the same routines as the rounding ones. */
#define put_no_rnd_pixels8_mmx  ff_put_pixels8_mmx
#define put_no_rnd_pixels16_mmx ff_put_pixels16_mmx
#define avg_no_rnd_pixels16_mmx ff_avg_pixels16_mmx
91 |
|
|
|
92 |
|
|
#if HAVE_INLINE_ASM

/*
 * The two template files are included twice, each time with a different set
 * of helper macros:
 *   DEF(x, y)     - builds the name of each generated function
 *   SET_RND       - rounding-constant setup (MOVQ_WONE / MOVQ_WTWO)
 *   PAVGBP/PAVGB  - byte-average primitives
 *   STATIC        - storage-class prefix used with rnd_template.c
 * The order of #define / #include / #undef below is significant.
 */

/***********************************/
/* MMX no rounding */
#define STATIC static
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND  MOVQ_WONE
#define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)

#include "rnd_template.c"
#include "hpeldsp_rnd_template.c"

#undef STATIC
#undef PAVGB
#undef PAVGBP
#undef SET_RND
#undef DEF

#if HAVE_MMX
/* 16-pixel no_rnd variants, built from the 8-pixel ones via CALL_2X_PIXELS */
CALL_2X_PIXELS(put_no_rnd_pixels16_y2_mmx, put_no_rnd_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(avg_no_rnd_pixels16_y2_mmx, avg_no_rnd_pixels8_y2_mmx, 8)

CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8)
#endif

/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND  MOVQ_WTWO
#define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)

#include "hpeldsp_rnd_template.c"

/*
 * For rnd_template.c the generated names get an ff_ prefix and STATIC is
 * empty (presumably so these functions have external linkage).
 * STATIC is left defined afterwards, as in the original.
 */
#undef DEF
#define DEF(x, y) ff_ ## x ## _ ## y ## _mmx
#define STATIC

#include "rnd_template.c"

#undef PAVGB
#undef PAVGBP
#undef SET_RND
#undef DEF

#if HAVE_MMX
/* 16-pixel rounding variants, built from the 8-pixel ones */
CALL_2X_PIXELS(put_pixels16_y2_mmx, put_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(avg_pixels16_y2_mmx, avg_pixels8_y2_mmx, 8)

CALL_2X_PIXELS_EXPORT(ff_put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS_EXPORT(ff_avg_pixels16_xy2_mmx, ff_avg_pixels8_xy2_mmx, 8)
#endif

#endif /* HAVE_INLINE_ASM */
149 |
|
|
|
150 |
|
|
|
151 |
|
|
#if HAVE_X86ASM

/*
 * Instantiate, for one CPU-extension suffix, the 16-pixel wrappers that are
 * built from the corresponding ff_*pixels8* routines via CALL_2X_PIXELS.
 * CPUEXT includes the leading underscore (e.g. _mmxext).
 */
#define HPELDSP_AVG_PIXELS16(CPUEXT)                                                          \
    CALL_2X_PIXELS(put_pixels16_y2          ## CPUEXT, ff_put_pixels8_y2          ## CPUEXT, 8) \
    CALL_2X_PIXELS(put_no_rnd_pixels16_x2   ## CPUEXT, ff_put_no_rnd_pixels8_x2   ## CPUEXT, 8) \
    CALL_2X_PIXELS(put_no_rnd_pixels16_y2   ## CPUEXT, ff_put_no_rnd_pixels8_y2   ## CPUEXT, 8) \
    CALL_2X_PIXELS(avg_pixels16             ## CPUEXT, ff_avg_pixels8             ## CPUEXT, 8) \
    CALL_2X_PIXELS(avg_pixels16_x2          ## CPUEXT, ff_avg_pixels8_x2          ## CPUEXT, 8) \
    CALL_2X_PIXELS(avg_pixels16_y2          ## CPUEXT, ff_avg_pixels8_y2          ## CPUEXT, 8) \
    CALL_2X_PIXELS(avg_pixels16_xy2         ## CPUEXT, ff_avg_pixels8_xy2         ## CPUEXT, 8) \
    CALL_2X_PIXELS(avg_approx_pixels16_xy2  ## CPUEXT, ff_avg_approx_pixels8_xy2  ## CPUEXT, 8)

HPELDSP_AVG_PIXELS16(_mmxext)
HPELDSP_AVG_PIXELS16(_3dnow)

#endif /* HAVE_X86ASM */
167 |
|
|
|
168 |
|
|
/*
 * SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)
 *
 * When HAVE_MMX_EXTERNAL is set, store PFX_pixels<SIZE>_<CPU> into column 0
 * of c->PFX_pixels_tab IDX. IDX is either a row subscript such as [0] or
 * empty for a one-dimensional table. A variable named `c` must be in scope
 * at the point of use.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and must be followed by a semicolon. The previous
 * form (a bare `if (...) stmt;`) carried its own semicolon and could capture
 * a following `else` or break an enclosing if/else.
 */
#define SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                                 \
    do {                                                                        \
        if (HAVE_MMX_EXTERNAL)                                                  \
            c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
    } while (0)

/*
 * SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)
 *
 * Fill one row of a half-pel table: column 0 through SET_HPEL_FUNCS_EXT and,
 * only when HAVE_MMX_INLINE is set, columns 1..3 with the _x2_, _y2_ and
 * _xy2_ variants.
 */
#if HAVE_MMX_INLINE
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
    do {                                                                        \
        SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU);                                \
        c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU; \
        c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU; \
        c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
    } while (0)
#else
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
    SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)
#endif
186 |
|
|
|
187 |
|
143 |
/**
 * Install the plain-MMX routines into the half-pel tables of @p c.
 *
 * Row [0] of each table receives the 16-pixel variants, row [1] the 8-pixel
 * ones. Which columns are actually written depends on the build: column 0
 * (and the avg 8-pixel x2 entry) only with HAVE_MMX_EXTERNAL, the remaining
 * columns only with HAVE_MMX_INLINE.
 *
 * @param c     context whose function tables are filled in
 * @param flags not used by this function
 */
static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
{
    /* 8-pixel rows */
    SET_HPEL_FUNCS(put,        [1],  8, mmx);
    SET_HPEL_FUNCS(put_no_rnd, [1],  8, mmx);

    /* The 8-pixel avg row is filled entry by entry. */
#if HAVE_MMX_INLINE
    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmx;
    c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
#endif
    if (HAVE_MMX_EXTERNAL) {
        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmx;
        c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmx;
    }

    /* 16-pixel rows; avg_no_rnd_pixels_tab has no row index */
    SET_HPEL_FUNCS(put,        [0], 16, mmx);
    SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
    SET_HPEL_FUNCS(avg,        [0], 16, mmx);
    SET_HPEL_FUNCS(avg_no_rnd,    , 16, mmx);
}
204 |
|
|
|
205 |
|
143 |
/**
 * Install the MMXEXT routines into the half-pel tables of @p c.
 *
 * Does nothing unless the build has HAVE_MMXEXT_EXTERNAL. When
 * AV_CODEC_FLAG_BITEXACT is not set in @p flags, the no_rnd x2/y2 entries
 * are installed as well and the avg xy2 entries use the "approx" variants.
 *
 * @param c     context whose function tables are filled in
 * @param flags codec flags; only AV_CODEC_FLAG_BITEXACT is examined
 */
static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
{
#if HAVE_MMXEXT_EXTERNAL
    const int bitexact = flags & AV_CODEC_FLAG_BITEXACT;

    /* put: x2 / y2 for both block widths */
    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
    c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;
    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;

    /* avg: full-pel, x2 and y2 for both block widths */
    c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
    c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
    c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
    c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;

    if (bitexact) {
        /* avg xy2: the non-approx variants */
        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
    } else {
        /* avg xy2: the approx variants */
        c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_mmxext;
        c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_mmxext;

        /* no_rnd x2 / y2 are installed only on this path */
        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
    }
#endif /* HAVE_MMXEXT_EXTERNAL */
}
235 |
|
|
|
236 |
|
|
/**
 * Install the 3DNow! routines into the half-pel tables of @p c.
 *
 * Does nothing unless the build has HAVE_AMD3DNOW_EXTERNAL. When
 * AV_CODEC_FLAG_BITEXACT is not set in @p flags, the no_rnd x2/y2 entries
 * are installed as well and the avg xy2 entries use the "approx" variants.
 *
 * @param c     context whose function tables are filled in
 * @param flags codec flags; only AV_CODEC_FLAG_BITEXACT is examined
 */
static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags)
{
#if HAVE_AMD3DNOW_EXTERNAL
    const int bitexact = flags & AV_CODEC_FLAG_BITEXACT;

    /* put: x2 / y2 for both block widths */
    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
    c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;

    /* avg: full-pel, x2 and y2 for both block widths */
    c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
    c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
    c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
    c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;

    if (bitexact) {
        /* avg xy2: the non-approx variants */
        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
    } else {
        /* avg xy2: the approx variants */
        c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_3dnow;
        c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_3dnow;

        /* no_rnd x2 / y2 are installed only on this path */
        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
    }
#endif /* HAVE_AMD3DNOW_EXTERNAL */
}
266 |
|
|
|
267 |
|
143 |
/**
 * Install the SSE2 routines for the 16-pixel row ([0]) of the put and avg
 * tables, plus the full-pel entry of the put_no_rnd table.
 *
 * Does nothing unless the build has HAVE_SSE2_EXTERNAL.
 *
 * @param c     context whose function tables are filled in
 * @param flags not used by this function
 */
static void hpeldsp_init_sse2_fast(HpelDSPContext *c, int flags)
{
#if HAVE_SSE2_EXTERNAL
    /* full-pel: put and put_no_rnd share the same routine */
    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
    c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
    c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;

    /* x2 */
    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_sse2;
    c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2;

    /* y2 */
    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_sse2;
    c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2;

    /* xy2 */
    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_sse2;
    c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}
281 |
|
|
|
282 |
|
143 |
/**
 * Install the SSSE3 xy2 routines (column 3) for both the 16-pixel and the
 * 8-pixel rows of the put and avg tables.
 *
 * Does nothing unless the build has HAVE_SSSE3_EXTERNAL.
 *
 * @param c     context whose function tables are filled in
 * @param flags not used by this function
 */
static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags)
{
#if HAVE_SSSE3_EXTERNAL
    /* put */
    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_ssse3;
    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_ssse3;

    /* avg */
    c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_ssse3;
    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_ssse3;
#endif /* HAVE_SSSE3_EXTERNAL */
}
291 |
|
|
|
292 |
|
997 |
/**
 * x86 entry point: query the CPU flags once and run every per-extension
 * initializer whose capability check passes.
 *
 * The call order is significant: the initializers write overlapping table
 * slots, so an initializer that runs later replaces entries installed by an
 * earlier one.
 *
 * @param c     context whose function tables are filled in
 * @param flags codec flags, forwarded unchanged to the initializers
 */
av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
{
    const int cpu_caps = av_get_cpu_flags();

    if (INLINE_MMX(cpu_caps)) {
        hpeldsp_init_mmx(c, flags);
    }

    /* 3DNow! runs before MMXEXT, so MMXEXT entries take precedence. */
    if (EXTERNAL_AMD3DNOW(cpu_caps)) {
        hpeldsp_init_3dnow(c, flags);
    }

    if (EXTERNAL_MMXEXT(cpu_caps)) {
        hpeldsp_init_mmxext(c, flags);
    }

    if (EXTERNAL_SSE2_FAST(cpu_caps)) {
        hpeldsp_init_sse2_fast(c, flags);
    }

    if (EXTERNAL_SSSE3(cpu_caps)) {
        hpeldsp_init_ssse3(c, flags);
    }

    /* The VP3-specific initializer also receives the CPU flags. */
    if (CONFIG_VP3_DECODER) {
        ff_hpeldsp_vp3_init_x86(c, cpu_caps, flags);
    }
}