/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "../swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/mem_internal.h"

#define RET 0xC3 // near return opcode for x86
#define PREFETCH "prefetchnta"

#if HAVE_INLINE_ASM
av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
                                   int16_t *filter, int32_t *filterPos,
                                   int numSplits)
{
    uint8_t *fragmentA;
    x86_reg imm8OfPShufW1A;
    x86_reg imm8OfPShufW2A;
    x86_reg fragmentLengthA;
    uint8_t *fragmentB;
    x86_reg imm8OfPShufW1B;
    x86_reg imm8OfPShufW2B;
    x86_reg fragmentLengthB;
    int fragmentPos;

    int xpos, i;

    // create an optimized horizontal scaling routine
    /* This scaler is made of runtime-generated MMXEXT code using specially tuned
     * pshufw instructions. For every four output pixels, if four input pixels
     * are enough for the fast bilinear scaling, then a chunk of fragmentB is
     * used. If five input pixels are needed, then a chunk of fragmentA is used.
     */

    // code fragment

    __asm__ volatile (
        "jmp 9f                                     \n\t"
        // Begin
        "0:                                         \n\t"
        "movq (%%"FF_REG_d", %%"FF_REG_a"), %%mm3   \n\t"
        "movd (%%"FF_REG_c", %%"FF_REG_S"), %%mm0   \n\t"
        "movd 1(%%"FF_REG_c", %%"FF_REG_S"), %%mm1  \n\t"
        "punpcklbw %%mm7, %%mm1                     \n\t"
        "punpcklbw %%mm7, %%mm0                     \n\t"
        "pshufw $0xFF, %%mm1, %%mm1                 \n\t"
        "1:                                         \n\t"
        "pshufw $0xFF, %%mm0, %%mm0                 \n\t"
        "2:                                         \n\t"
        "psubw %%mm1, %%mm0                         \n\t"
        "movl 8(%%"FF_REG_b", %%"FF_REG_a"), %%esi  \n\t"
        "pmullw %%mm3, %%mm0                        \n\t"
        "psllw $7, %%mm1                            \n\t"
        "paddw %%mm1, %%mm0                         \n\t"

        "movq %%mm0, (%%"FF_REG_D", %%"FF_REG_a")   \n\t"

        "add $8, %%"FF_REG_a"                       \n\t"
        // End
        "9:                                         \n\t"
        "lea " LOCAL_MANGLE(0b) ", %0               \n\t"
        "lea " LOCAL_MANGLE(1b) ", %1               \n\t"
        "lea " LOCAL_MANGLE(2b) ", %2               \n\t"
        "dec %1                                     \n\t"
        "dec %2                                     \n\t"
        "sub %0, %1                                 \n\t"
        "sub %0, %2                                 \n\t"
        "lea " LOCAL_MANGLE(9b) ", %3               \n\t"
        "sub %0, %3                                 \n\t"


        : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
          "=r" (fragmentLengthA)
    );
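
    /* Note (added explanation): the asm block above is never executed as part
     * of scaling. At run time it jumps straight to label 9 and only captures,
     * via lea/sub on the local labels, the fragment's start address, the
     * offsets of the two pshufw immediate bytes inside it (each dec steps back
     * onto the immediate byte), and the fragment's total length. The nearly
     * identical block below does the same for fragmentB, the variant that
     * needs only four input pixels per four output pixels and therefore
     * derives both interpolation operands from a single movd. */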

    __asm__ volatile (
        "jmp 9f                                     \n\t"
        // Begin
        "0:                                         \n\t"
        "movq (%%"FF_REG_d", %%"FF_REG_a"), %%mm3   \n\t"
        "movd (%%"FF_REG_c", %%"FF_REG_S"), %%mm0   \n\t"
        "punpcklbw %%mm7, %%mm0                     \n\t"
        "pshufw $0xFF, %%mm0, %%mm1                 \n\t"
        "1:                                         \n\t"
        "pshufw $0xFF, %%mm0, %%mm0                 \n\t"
        "2:                                         \n\t"
        "psubw %%mm1, %%mm0                         \n\t"
        "movl 8(%%"FF_REG_b", %%"FF_REG_a"), %%esi  \n\t"
        "pmullw %%mm3, %%mm0                        \n\t"
        "psllw $7, %%mm1                            \n\t"
        "paddw %%mm1, %%mm0                         \n\t"

        "movq %%mm0, (%%"FF_REG_D", %%"FF_REG_a")   \n\t"

        "add $8, %%"FF_REG_a"                       \n\t"
        // End
        "9:                                         \n\t"
        "lea " LOCAL_MANGLE(0b) ", %0               \n\t"
        "lea " LOCAL_MANGLE(1b) ", %1               \n\t"
        "lea " LOCAL_MANGLE(2b) ", %2               \n\t"
        "dec %1                                     \n\t"
        "dec %2                                     \n\t"
        "sub %0, %1                                 \n\t"
        "sub %0, %2                                 \n\t"
        "lea " LOCAL_MANGLE(9b) ", %3               \n\t"
        "sub %0, %3                                 \n\t"


        : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
          "=r" (fragmentLengthB)
    );

    xpos        = 0; // lumXInc/2 - 0x8000; // difference between pixel centers
    fragmentPos = 0;

    for (i = 0; i < dstW / numSplits; i++) {
        int xx = xpos >> 16;

        if ((i & 3) == 0) {
            int a                  = 0;
            int b                  = ((xpos + xInc) >> 16) - xx;
            int c                  = ((xpos + xInc * 2) >> 16) - xx;
            int d                  = ((xpos + xInc * 3) >> 16) - xx;
            int inc                = (d + 1 < 4);
            uint8_t *fragment      = inc ? fragmentB : fragmentA;
            x86_reg imm8OfPShufW1  = inc ? imm8OfPShufW1B : imm8OfPShufW1A;
            x86_reg imm8OfPShufW2  = inc ? imm8OfPShufW2B : imm8OfPShufW2A;
            x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA;
            int maxShift           = 3 - (d + inc);
            int shift              = 0;

            if (filterCode) {
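                /* Added explanation: each filter entry is a 7-bit bilinear
                 * weight taken from the 16.16 fixed-point position. The low 16
                 * bits of xpos are the fractional part, XOR with 0xFFFF inverts
                 * it (roughly 1 - frac) and >> 9 maps it to 0..127. For
                 * example, a fractional part of 0x4000 (0.25) gives
                 * (0x4000 ^ 0xFFFF) >> 9 = 95, i.e. close to 128 * 0.75. */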
                filter[i]     = ((xpos              & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 1] = (((xpos + xInc)     & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 2] = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 3] = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9;
                filterPos[i / 2] = xx;

                memcpy(filterCode + fragmentPos, fragment, fragmentLength);
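                /* Added explanation: patch the two pshufw immediates inside the
                 * fragment that was just copied. Each immediate packs four
                 * 2-bit word selectors, a | (b << 2) | (c << 4) | (d << 6), so
                 * every one of the four output pixels picks the pair of input
                 * words it interpolates between. */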

                filterCode[fragmentPos + imm8OfPShufW1] = (a + inc)        |
                                                          ((b + inc) << 2) |
                                                          ((c + inc) << 4) |
                                                          ((d + inc) << 6);
                filterCode[fragmentPos + imm8OfPShufW2] = a | (b << 2) |
                                                              (c << 4) |
                                                              (d << 6);

                if (i + 4 - inc >= dstW)
                    shift = maxShift;               // avoid overread
                else if ((filterPos[i / 2] & 3) <= maxShift)
                    shift = filterPos[i / 2] & 3;   // align

                if (shift && i >= shift) {
                    filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift;
                    filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift;
                    filterPos[i / 2]                        -= shift;
                }
            }

            fragmentPos += fragmentLength;

            if (filterCode)
                filterCode[fragmentPos] = RET;
        }
        xpos += xInc;
    }
    if (filterCode)
        filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part

    return fragmentPos + 1;
}
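
/* Usage sketch (an illustration based on how the code above is structured, not
 * code taken from this file): ff_init_hscaler_mmxext() is written to be called
 * twice. A first call with filterCode == NULL only measures how many bytes of
 * generated code this dstW/xInc combination needs; the caller then provides an
 * executable buffer of that size and calls again to emit the patched fragments
 * together with the filter tables. Roughly (alloc_executable() and the
 * numSplits value of 8 are assumptions made for the example):
 *
 *     int size = ff_init_hscaler_mmxext(dstW, xInc, NULL, NULL, NULL, 8);
 *     uint8_t *code = alloc_executable(size);
 *     ff_init_hscaler_mmxext(dstW, xInc, code, hFilter, hFilterPos, 8);
 *
 * The buffer must be mapped executable before ff_hyscale_fast_mmxext() or
 * ff_hcscale_fast_mmxext() jumps into it via "call *%4". */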

void ff_hyscale_fast_mmxext(SwsInternal *c, int16_t *dst,
                            int dstWidth, const uint8_t *src,
                            int srcW, int xInc)
{
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter = c->hLumFilter;
    void *mmxextFilterCode = c->lumMmxextFilterCode;
    int i;
#if ARCH_X86_64
    uint64_t retsave;
#else
#if !HAVE_EBX_AVAILABLE
    uint64_t ebxsave;
#endif
#endif

    __asm__ volatile(
#if ARCH_X86_64
        "mov -8(%%rsp), %%"FF_REG_a"                \n\t"
        "mov %%"FF_REG_a", %5                       \n\t" // retsave
#else
#if !HAVE_EBX_AVAILABLE
        "mov %%"FF_REG_b", %5                       \n\t" // ebxsave
#endif
#endif
        "pxor %%mm7, %%mm7                          \n\t"
        "mov %0, %%"FF_REG_c"                       \n\t"
        "mov %1, %%"FF_REG_D"                       \n\t"
        "mov %2, %%"FF_REG_d"                       \n\t"
        "mov %3, %%"FF_REG_b"                       \n\t"
        "xor %%"FF_REG_a", %%"FF_REG_a"             \n\t" // i
        PREFETCH" (%%"FF_REG_c")                    \n\t"
        PREFETCH" 32(%%"FF_REG_c")                  \n\t"
        PREFETCH" 64(%%"FF_REG_c")                  \n\t"

#if ARCH_X86_64
#define CALL_MMXEXT_FILTER_CODE \
    "movl (%%"FF_REG_b"), %%esi                     \n\t"\
    "call *%4                                       \n\t"\
    "movl (%%"FF_REG_b", %%"FF_REG_a"), %%esi       \n\t"\
    "add %%"FF_REG_S", %%"FF_REG_c"                 \n\t"\
    "add %%"FF_REG_a", %%"FF_REG_D"                 \n\t"\
    "xor %%"FF_REG_a", %%"FF_REG_a"                 \n\t"\

#else
#define CALL_MMXEXT_FILTER_CODE \
    "movl (%%"FF_REG_b"), %%esi                     \n\t"\
    "call *%4                                       \n\t"\
    "addl (%%"FF_REG_b", %%"FF_REG_a"), %%"FF_REG_c"\n\t"\
    "add %%"FF_REG_a", %%"FF_REG_D"                 \n\t"\
    "xor %%"FF_REG_a", %%"FF_REG_a"                 \n\t"\

#endif /* ARCH_X86_64 */
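
        /* Added explanation: each expansion below runs the runtime-generated
         * code once. %4 holds the pointer to that code, which
         * ff_init_hscaler_mmxext() terminated with a RET byte, so it is simply
         * entered with "call *%4". esi is seeded from the filterPos table
         * before the call; afterwards the source pointer is advanced by the
         * filterPos entry just past the processed chunk, the destination
         * pointer by the bytes written (FF_REG_a), and FF_REG_a is cleared
         * again for the next chunk. */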

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

#if ARCH_X86_64
        "mov %5, %%"FF_REG_a"                       \n\t"
        "mov %%"FF_REG_a", -8(%%rsp)                \n\t"
#else
#if !HAVE_EBX_AVAILABLE
        "mov %5, %%"FF_REG_b"                       \n\t"
#endif
#endif
        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode)
#if ARCH_X86_64
          ,"m"(retsave)
#else
#if !HAVE_EBX_AVAILABLE
          ,"m" (ebxsave)
#endif
#endif
        : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D
#if ARCH_X86_64 || HAVE_EBX_AVAILABLE
         ,"%"FF_REG_b
#endif
    );

    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--)
        dst[i] = src[srcW - 1] * 128;
}
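
/* Note (added explanation): the outputs written by ff_hyscale_fast_mmxext()
 * (and by the chroma variant below) are the source samples scaled by 128,
 * matching the 7-bit weights and the "psllw $7" in the generated fragments;
 * that is why the tail loop above stores src[srcW - 1] * 128 for destination
 * pixels whose source position falls on or beyond the last input sample. */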

void ff_hcscale_fast_mmxext(SwsInternal *c, int16_t *dst1, int16_t *dst2,
                            int dstWidth, const uint8_t *src1,
                            const uint8_t *src2, int srcW, int xInc)
{
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter = c->hChrFilter;
    void *mmxextFilterCode = c->chrMmxextFilterCode;
    int i;
#if ARCH_X86_64
    DECLARE_ALIGNED(8, uint64_t, retsave);
#else
#if !HAVE_EBX_AVAILABLE
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
#endif
    __asm__ volatile(
#if ARCH_X86_64
        "mov -8(%%rsp), %%"FF_REG_a"                \n\t"
        "mov %%"FF_REG_a", %7                       \n\t" // retsave
#else
#if !HAVE_EBX_AVAILABLE
        "mov %%"FF_REG_b", %7                       \n\t" // ebxsave
#endif
#endif
        "pxor %%mm7, %%mm7                          \n\t"
        "mov %0, %%"FF_REG_c"                       \n\t"
        "mov %1, %%"FF_REG_D"                       \n\t"
        "mov %2, %%"FF_REG_d"                       \n\t"
        "mov %3, %%"FF_REG_b"                       \n\t"
        "xor %%"FF_REG_a", %%"FF_REG_a"             \n\t" // i
        PREFETCH" (%%"FF_REG_c")                    \n\t"
        PREFETCH" 32(%%"FF_REG_c")                  \n\t"
        PREFETCH" 64(%%"FF_REG_c")                  \n\t"

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        "xor %%"FF_REG_a", %%"FF_REG_a"             \n\t" // i
        "mov %5, %%"FF_REG_c"                       \n\t" // src2
        "mov %6, %%"FF_REG_D"                       \n\t" // dst2
        PREFETCH" (%%"FF_REG_c")                    \n\t"
        PREFETCH" 32(%%"FF_REG_c")                  \n\t"
        PREFETCH" 64(%%"FF_REG_c")                  \n\t"

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

#if ARCH_X86_64
        "mov %7, %%"FF_REG_a"                       \n\t"
        "mov %%"FF_REG_a", -8(%%rsp)                \n\t"
#else
#if !HAVE_EBX_AVAILABLE
        "mov %7, %%"FF_REG_b"                       \n\t"
#endif
#endif
        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode), "m" (src2), "m"(dst2)
#if ARCH_X86_64
          ,"m"(retsave)
#else
#if !HAVE_EBX_AVAILABLE
          ,"m" (ebxsave)
#endif
#endif
        : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D
#if ARCH_X86_64 || HAVE_EBX_AVAILABLE
         ,"%"FF_REG_b
#endif
    );

    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--) {
        dst1[i] = src1[srcW - 1] * 128;
        dst2[i] = src2[srcW - 1] * 128;
    }
}
#endif //HAVE_INLINE_ASM