Line | Branch | Exec | Source |
---|---|---|---|
1 | /* | ||
2 | * software RGB to RGB converter | ||
3 | * software PAL8 (palette) to RGB converter | ||
4 | * software YUV to YUV converter | ||
5 | * software YUV to RGB converter | ||
6 | * Written by Nick Kurshev. | ||
7 | * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) | ||
8 | * | ||
9 | * This file is part of FFmpeg. | ||
10 | * | ||
11 | * FFmpeg is free software; you can redistribute it and/or | ||
12 | * modify it under the terms of the GNU Lesser General Public | ||
13 | * License as published by the Free Software Foundation; either | ||
14 | * version 2.1 of the License, or (at your option) any later version. | ||
15 | * | ||
16 | * FFmpeg is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
19 | * Lesser General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU Lesser General Public | ||
22 | * License along with FFmpeg; if not, write to the Free Software | ||
23 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
24 | */ | ||
25 | |||
26 | #include <stdint.h> | ||
27 | |||
28 | #include "config.h" | ||
29 | #include "libavutil/attributes.h" | ||
30 | #include "libavutil/x86/cpu.h" | ||
31 | #include "libavutil/cpu.h" | ||
32 | #include "libavutil/bswap.h" | ||
33 | #include "libavutil/mem_internal.h" | ||
34 | |||
35 | #include "libswscale/rgb2rgb.h" | ||
36 | #include "libswscale/swscale.h" | ||
37 | #include "libswscale/swscale_internal.h" | ||
38 | |||
39 | #if HAVE_INLINE_ASM | ||
40 | #include "libavutil/x86/asm.h" | ||
41 | |||
42 | DECLARE_ASM_CONST(8, uint64_t, mmx_ff) = 0x00000000000000FFULL; | ||
43 | DECLARE_ASM_CONST(8, uint64_t, mmx_null) = 0x0000000000000000ULL; | ||
44 | DECLARE_ASM_CONST(8, uint64_t, mask32a) = 0xFF000000FF000000ULL; | ||
45 | DECLARE_ASM_CONST(8, uint64_t, mask3216br) = 0x00F800F800F800F8ULL; | ||
46 | DECLARE_ASM_CONST(8, uint64_t, mask3216g) = 0x0000FC000000FC00ULL; | ||
47 | DECLARE_ASM_CONST(8, uint64_t, mask3215g) = 0x0000F8000000F800ULL; | ||
48 | DECLARE_ASM_CONST(8, uint64_t, mul3216) = 0x2000000420000004ULL; | ||
49 | DECLARE_ASM_CONST(8, uint64_t, mul3215) = 0x2000000820000008ULL; | ||
50 | DECLARE_ASM_CONST(8, uint64_t, mask24b) = 0x00FF0000FF0000FFULL; | ||
51 | DECLARE_ASM_CONST(8, uint64_t, mask24g) = 0xFF0000FF0000FF00ULL; | ||
52 | DECLARE_ASM_CONST(8, uint64_t, mask24r) = 0x0000FF0000FF0000ULL; | ||
53 | DECLARE_ASM_CONST(8, uint64_t, mask24l) = 0x0000000000FFFFFFULL; | ||
54 | DECLARE_ASM_CONST(8, uint64_t, mask24h) = 0x0000FFFFFF000000ULL; | ||
55 | DECLARE_ASM_CONST(8, uint64_t, mask15b) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */ | ||
56 | DECLARE_ASM_CONST(8, uint64_t, mask15rg) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */ | ||
57 | DECLARE_ASM_CONST(8, uint64_t, mask15s) = 0xFFE0FFE0FFE0FFE0ULL; | ||
58 | DECLARE_ASM_CONST(8, uint64_t, mask15g) = 0x03E003E003E003E0ULL; | ||
59 | DECLARE_ASM_CONST(8, uint64_t, mask15r) = 0x7C007C007C007C00ULL; | ||
60 | #define mask16b mask15b | ||
61 | DECLARE_ASM_CONST(8, uint64_t, mask16g) = 0x07E007E007E007E0ULL; | ||
62 | DECLARE_ASM_CONST(8, uint64_t, mask16r) = 0xF800F800F800F800ULL; | ||
63 | DECLARE_ASM_CONST(8, uint64_t, red_16mask) = 0x0000f8000000f800ULL; | ||
64 | DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL; | ||
65 | DECLARE_ASM_CONST(8, uint64_t, blue_16mask) = 0x0000001f0000001fULL; | ||
66 | DECLARE_ASM_CONST(8, uint64_t, red_15mask) = 0x00007c0000007c00ULL; | ||
67 | DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL; | ||
68 | DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL; | ||
69 | DECLARE_ASM_CONST(8, uint64_t, mul15_mid) = 0x4200420042004200ULL; | ||
70 | DECLARE_ASM_CONST(8, uint64_t, mul15_hi) = 0x0210021002100210ULL; | ||
71 | DECLARE_ASM_CONST(8, uint64_t, mul16_mid) = 0x2080208020802080ULL; | ||
72 | |||
73 | DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2YOffset); | ||
74 | DECLARE_ALIGNED(8, extern const uint64_t, ff_w1111); | ||
75 | DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset); | ||
76 | |||
77 | #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5)) | ||
78 | #define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5)) | ||
79 | #define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5)) | ||
80 | #define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5)) | ||
81 | #define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5)) | ||
82 | #define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5)) | ||
83 | #define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5)) | ||
84 | #define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5)) | ||
85 | #define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5)) | ||
86 | |||
87 | // MMXEXT versions | ||
88 | #define PREFETCH "prefetchnta" | ||
89 | #define PAVGB "pavgb" | ||
90 | #define MOVNTQ "movntq" | ||
91 | #define SFENCE "sfence" | ||
92 | |||
93 | #define EMMS "emms" | ||
94 | |||
95 | ✗ | static inline void rgb24tobgr32_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
96 | { | ||
97 | ✗ | uint8_t *dest = dst; | |
98 | ✗ | const uint8_t *s = src; | |
99 | const uint8_t *end; | ||
100 | const uint8_t *mm_end; | ||
101 | ✗ | end = s + src_size; | |
102 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); | |
103 | ✗ | mm_end = end - 23; | |
104 | ✗ | __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory"); | |
105 | ✗ | while (s < mm_end) { | |
106 | ✗ | __asm__ volatile( | |
107 | PREFETCH" 32(%1) \n\t" | ||
108 | "movd (%1), %%mm0 \n\t" | ||
109 | "punpckldq 3(%1), %%mm0 \n\t" | ||
110 | "movd 6(%1), %%mm1 \n\t" | ||
111 | "punpckldq 9(%1), %%mm1 \n\t" | ||
112 | "movd 12(%1), %%mm2 \n\t" | ||
113 | "punpckldq 15(%1), %%mm2 \n\t" | ||
114 | "movd 18(%1), %%mm3 \n\t" | ||
115 | "punpckldq 21(%1), %%mm3 \n\t" | ||
116 | "por %%mm7, %%mm0 \n\t" | ||
117 | "por %%mm7, %%mm1 \n\t" | ||
118 | "por %%mm7, %%mm2 \n\t" | ||
119 | "por %%mm7, %%mm3 \n\t" | ||
120 | MOVNTQ" %%mm0, (%0) \n\t" | ||
121 | MOVNTQ" %%mm1, 8(%0) \n\t" | ||
122 | MOVNTQ" %%mm2, 16(%0) \n\t" | ||
123 | MOVNTQ" %%mm3, 24(%0)" | ||
124 | :: "r"(dest), "r"(s) | ||
125 | :"memory"); | ||
126 | ✗ | dest += 32; | |
127 | ✗ | s += 24; | |
128 | } | ||
129 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
130 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
131 | ✗ | while (s < end) { | |
132 | ✗ | *dest++ = *s++; | |
133 | ✗ | *dest++ = *s++; | |
134 | ✗ | *dest++ = *s++; | |
135 | ✗ | *dest++ = 255; | |
136 | } | ||
137 | ✗ | } | |
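
The loop above assembles four 3-byte pixels per iteration (`movd` plus `punpckldq`), forces the alpha byte on with `mask32a`, and streams 32 bytes out through `movntq`; the scalar tail defines the byte-level contract. A plain-C reference of the same conversion (the function name is an illustrative assumption, not part of this file):

```c
#include <stdint.h>

/* Hypothetical scalar reference: widen packed 24-bit pixels to 32-bit,
 * copying the three colour bytes unchanged and appending opaque alpha. */
static void rgb24tobgr32_c_ref(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *end = src + src_size;
    while (src < end) {
        *dst++ = *src++;
        *dst++ = *src++;
        *dst++ = *src++;
        *dst++ = 255;  /* same effect as OR-ing mask32a per pixel */
    }
}
```
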
138 | |||
139 | #define STORE_BGR24_MMX \ | ||
140 | "psrlq $8, %%mm2 \n\t" \ | ||
141 | "psrlq $8, %%mm3 \n\t" \ | ||
142 | "psrlq $8, %%mm6 \n\t" \ | ||
143 | "psrlq $8, %%mm7 \n\t" \ | ||
144 | "pand "MANGLE(mask24l)", %%mm0\n\t" \ | ||
145 | "pand "MANGLE(mask24l)", %%mm1\n\t" \ | ||
146 | "pand "MANGLE(mask24l)", %%mm4\n\t" \ | ||
147 | "pand "MANGLE(mask24l)", %%mm5\n\t" \ | ||
148 | "pand "MANGLE(mask24h)", %%mm2\n\t" \ | ||
149 | "pand "MANGLE(mask24h)", %%mm3\n\t" \ | ||
150 | "pand "MANGLE(mask24h)", %%mm6\n\t" \ | ||
151 | "pand "MANGLE(mask24h)", %%mm7\n\t" \ | ||
152 | "por %%mm2, %%mm0 \n\t" \ | ||
153 | "por %%mm3, %%mm1 \n\t" \ | ||
154 | "por %%mm6, %%mm4 \n\t" \ | ||
155 | "por %%mm7, %%mm5 \n\t" \ | ||
156 | \ | ||
157 | "movq %%mm1, %%mm2 \n\t" \ | ||
158 | "movq %%mm4, %%mm3 \n\t" \ | ||
159 | "psllq $48, %%mm2 \n\t" \ | ||
160 | "psllq $32, %%mm3 \n\t" \ | ||
161 | "por %%mm2, %%mm0 \n\t" \ | ||
162 | "psrlq $16, %%mm1 \n\t" \ | ||
163 | "psrlq $32, %%mm4 \n\t" \ | ||
164 | "psllq $16, %%mm5 \n\t" \ | ||
165 | "por %%mm3, %%mm1 \n\t" \ | ||
166 | "por %%mm5, %%mm4 \n\t" \ | ||
167 | \ | ||
168 | MOVNTQ" %%mm0, (%0) \n\t" \ | ||
169 | MOVNTQ" %%mm1, 8(%0) \n\t" \ | ||
170 | MOVNTQ" %%mm4, 16(%0)" | ||
171 | |||
172 | |||
173 | ✗ | static inline void rgb32tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
174 | { | ||
175 | ✗ | uint8_t *dest = dst; | |
176 | ✗ | const uint8_t *s = src; | |
177 | const uint8_t *end; | ||
178 | const uint8_t *mm_end; | ||
179 | ✗ | end = s + src_size; | |
180 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); | |
181 | ✗ | mm_end = end - 31; | |
182 | ✗ | while (s < mm_end) { | |
183 | ✗ | __asm__ volatile( | |
184 | PREFETCH" 32(%1) \n\t" | ||
185 | "movq (%1), %%mm0 \n\t" | ||
186 | "movq 8(%1), %%mm1 \n\t" | ||
187 | "movq 16(%1), %%mm4 \n\t" | ||
188 | "movq 24(%1), %%mm5 \n\t" | ||
189 | "movq %%mm0, %%mm2 \n\t" | ||
190 | "movq %%mm1, %%mm3 \n\t" | ||
191 | "movq %%mm4, %%mm6 \n\t" | ||
192 | "movq %%mm5, %%mm7 \n\t" | ||
193 | STORE_BGR24_MMX | ||
194 | :: "r"(dest), "r"(s) | ||
195 | NAMED_CONSTRAINTS_ADD(mask24l,mask24h) | ||
196 | :"memory"); | ||
197 | ✗ | dest += 24; | |
198 | ✗ | s += 32; | |
199 | } | ||
200 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
201 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
202 | ✗ | while (s < end) { | |
203 | ✗ | *dest++ = *s++; | |
204 | ✗ | *dest++ = *s++; | |
205 | ✗ | *dest++ = *s++; | |
206 | ✗ | s++; | |
207 | } | ||
208 | ✗ | } | |
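
`STORE_BGR24_MMX` drops the X byte from pairs of BGRX pixels before repacking: each source qword keeps bytes 0-2 via `mask24l`, while a `psrlq $8` plus `mask24h` moves the high pixel's BGR down to bytes 3-5; the four resulting 6-byte groups are then shifted and OR'd into three 8-byte stores. A sketch of the per-qword squeeze, written by analogy with the masks above:

```c
#include <stdint.h>

/* Squeeze one qword holding two BGRX pixels down to 6 packed BGR bytes:
 * the low pixel stays at bytes 0-2, the high pixel's BGR moves to 3-5. */
static uint64_t squeeze_bgrx_pair(uint64_t q)
{
    return (q & 0x0000000000FFFFFFULL)           /* mask24l */
         | ((q >> 8) & 0x0000FFFFFF000000ULL);   /* psrlq $8, then mask24h */
}
```
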
209 | |||
210 | /* | ||
211 | original by Strepto/Astral | ||
212 | ported to gcc & bugfixed: A'rpi | ||
213 | MMXEXT, 3DNOW optimization by Nick Kurshev | ||
214 | 32-bit C version, and the and&add trick by Michael Niedermayer | ||
215 | */ | ||
216 | ✗ | static inline void rgb15to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
217 | { | ||
218 | ✗ | register const uint8_t* s=src; | |
219 | ✗ | register uint8_t* d=dst; | |
220 | register const uint8_t *end; | ||
221 | const uint8_t *mm_end; | ||
222 | ✗ | end = s + src_size; | |
223 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*s)); | |
224 | ✗ | __asm__ volatile("movq %0, %%mm4"::"m"(mask15s)); | |
225 | ✗ | mm_end = end - 15; | |
226 | ✗ | while (s<mm_end) { | |
227 | ✗ | __asm__ volatile( | |
228 | PREFETCH" 32(%1) \n\t" | ||
229 | "movq (%1), %%mm0 \n\t" | ||
230 | "movq 8(%1), %%mm2 \n\t" | ||
231 | "movq %%mm0, %%mm1 \n\t" | ||
232 | "movq %%mm2, %%mm3 \n\t" | ||
233 | "pand %%mm4, %%mm0 \n\t" | ||
234 | "pand %%mm4, %%mm2 \n\t" | ||
235 | "paddw %%mm1, %%mm0 \n\t" | ||
236 | "paddw %%mm3, %%mm2 \n\t" | ||
237 | MOVNTQ" %%mm0, (%0) \n\t" | ||
238 | MOVNTQ" %%mm2, 8(%0)" | ||
239 | :: "r"(d), "r"(s) | ||
240 | ); | ||
241 | ✗ | d+=16; | |
242 | ✗ | s+=16; | |
243 | } | ||
244 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
245 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
246 | ✗ | mm_end = end - 3; | |
247 | ✗ | while (s < mm_end) { | |
248 | ✗ | register unsigned x= *((const uint32_t *)s); | |
249 | ✗ | *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0); | |
250 | ✗ | d+=4; | |
251 | ✗ | s+=4; | |
252 | } | ||
253 | ✗ | if (s < end) { | |
254 | ✗ | register unsigned short x= *((const uint16_t *)s); | |
255 | ✗ | *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0); | |
256 | } | ||
257 | ✗ | } | |
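
The "and&add trick" credited in the comment is visible in the 32-bit tail: adding `x & 0x7FE0` to `x` doubles exactly the red+green field, which is a 1-bit left shift of those bits, while blue (`0x001F`) is left alone, so `0RRRRRGGGGGBBBBB` becomes `RRRRRGGGGGGBBBBB` with no shift instruction at all. A per-pixel sketch:

```c
#include <stdint.h>

/* RGB555 -> RGB565: red and green move up one bit (green's new LSB is 0),
 * blue stays put.  x + (x & 0x7FE0) doubles just the red+green field. */
static uint16_t rgb15to16_px(uint16_t x)
{
    return (uint16_t)((x & 0x7FFF) + (x & 0x7FE0));
}
```

Masking with `0x7FFF` first (the loop uses `0x7FFF7FFF` on dwords) keeps a stray top bit from carrying into the neighbouring pixel when two words are processed at once.
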
258 | |||
259 | ✗ | static inline void rgb16to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
260 | { | ||
261 | ✗ | register const uint8_t* s=src; | |
262 | ✗ | register uint8_t* d=dst; | |
263 | register const uint8_t *end; | ||
264 | const uint8_t *mm_end; | ||
265 | ✗ | end = s + src_size; | |
266 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*s)); | |
267 | ✗ | __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg)); | |
268 | ✗ | __asm__ volatile("movq %0, %%mm6"::"m"(mask15b)); | |
269 | ✗ | mm_end = end - 15; | |
270 | ✗ | while (s<mm_end) { | |
271 | ✗ | __asm__ volatile( | |
272 | PREFETCH" 32(%1) \n\t" | ||
273 | "movq (%1), %%mm0 \n\t" | ||
274 | "movq 8(%1), %%mm2 \n\t" | ||
275 | "movq %%mm0, %%mm1 \n\t" | ||
276 | "movq %%mm2, %%mm3 \n\t" | ||
277 | "psrlq $1, %%mm0 \n\t" | ||
278 | "psrlq $1, %%mm2 \n\t" | ||
279 | "pand %%mm7, %%mm0 \n\t" | ||
280 | "pand %%mm7, %%mm2 \n\t" | ||
281 | "pand %%mm6, %%mm1 \n\t" | ||
282 | "pand %%mm6, %%mm3 \n\t" | ||
283 | "por %%mm1, %%mm0 \n\t" | ||
284 | "por %%mm3, %%mm2 \n\t" | ||
285 | MOVNTQ" %%mm0, (%0) \n\t" | ||
286 | MOVNTQ" %%mm2, 8(%0)" | ||
287 | :: "r"(d), "r"(s) | ||
288 | ); | ||
289 | ✗ | d+=16; | |
290 | ✗ | s+=16; | |
291 | } | ||
292 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
293 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
294 | ✗ | mm_end = end - 3; | |
295 | ✗ | while (s < mm_end) { | |
296 | ✗ | register uint32_t x= *((const uint32_t*)s); | |
297 | ✗ | *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F); | |
298 | ✗ | s+=4; | |
299 | ✗ | d+=4; | |
300 | } | ||
301 | ✗ | if (s < end) { | |
302 | ✗ | register uint16_t x= *((const uint16_t*)s); | |
303 | ✗ | *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F); | |
304 | } | ||
305 | ✗ | } | |
306 | |||
307 | ✗ | static inline void rgb32to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
308 | { | ||
309 | ✗ | const uint8_t *s = src; | |
310 | const uint8_t *end; | ||
311 | const uint8_t *mm_end; | ||
312 | ✗ | uint16_t *d = (uint16_t *)dst; | |
313 | ✗ | end = s + src_size; | |
314 | ✗ | mm_end = end - 15; | |
315 | ✗ | __asm__ volatile( | |
316 | "movq %3, %%mm5 \n\t" | ||
317 | "movq %4, %%mm6 \n\t" | ||
318 | "movq %5, %%mm7 \n\t" | ||
319 | "jmp 2f \n\t" | ||
320 | ".p2align 4 \n\t" | ||
321 | "1: \n\t" | ||
322 | PREFETCH" 32(%1) \n\t" | ||
323 | "movd (%1), %%mm0 \n\t" | ||
324 | "movd 4(%1), %%mm3 \n\t" | ||
325 | "punpckldq 8(%1), %%mm0 \n\t" | ||
326 | "punpckldq 12(%1), %%mm3 \n\t" | ||
327 | "movq %%mm0, %%mm1 \n\t" | ||
328 | "movq %%mm3, %%mm4 \n\t" | ||
329 | "pand %%mm6, %%mm0 \n\t" | ||
330 | "pand %%mm6, %%mm3 \n\t" | ||
331 | "pmaddwd %%mm7, %%mm0 \n\t" | ||
332 | "pmaddwd %%mm7, %%mm3 \n\t" | ||
333 | "pand %%mm5, %%mm1 \n\t" | ||
334 | "pand %%mm5, %%mm4 \n\t" | ||
335 | "por %%mm1, %%mm0 \n\t" | ||
336 | "por %%mm4, %%mm3 \n\t" | ||
337 | "psrld $5, %%mm0 \n\t" | ||
338 | "pslld $11, %%mm3 \n\t" | ||
339 | "por %%mm3, %%mm0 \n\t" | ||
340 | MOVNTQ" %%mm0, (%0) \n\t" | ||
341 | "add $16, %1 \n\t" | ||
342 | "add $8, %0 \n\t" | ||
343 | "2: \n\t" | ||
344 | "cmp %2, %1 \n\t" | ||
345 | " jb 1b \n\t" | ||
346 | : "+r" (d), "+r"(s) | ||
347 | : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) | ||
348 | ); | ||
349 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
350 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
351 | ✗ | while (s < end) { | |
352 | ✗ | register int rgb = *(const uint32_t*)s; s += 4; | |
353 | ✗ | *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8); | |
354 | } | ||
355 | ✗ | } | |
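
The `pmaddwd` here packs blue and red with one multiply-accumulate: `mask3216br` isolates `B & 0xF8` and `R & 0xF8` as the two 16-bit lanes of each pixel, and multiplying by `mul3216` (4 and `0x2000`) lands both at positions where a single shift right by 5, after OR-ing in the green bits, finishes the RGB565 word. The scalar arithmetic spelled out (the helper name is illustrative):

```c
#include <stdint.h>

/* Same math as the pmaddwd path, one pixel at a time:
 * B*4 + R*0x2000, OR in green, then one shift right by 5. */
static uint16_t rgb32to16_px(uint32_t rgb)
{
    uint32_t br = (rgb & 0xF8) * 4 + ((rgb >> 16) & 0xF8) * 0x2000;
    return (uint16_t)((br | (rgb & 0xFC00)) >> 5);
}
```

This matches the tail loop's `((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8)` bit for bit.
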
356 | |||
357 | ✗ | static inline void rgb32tobgr16_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
358 | { | ||
359 | ✗ | const uint8_t *s = src; | |
360 | const uint8_t *end; | ||
361 | const uint8_t *mm_end; | ||
362 | ✗ | uint16_t *d = (uint16_t *)dst; | |
363 | ✗ | end = s + src_size; | |
364 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); | |
365 | ✗ | __asm__ volatile( | |
366 | "movq %0, %%mm7 \n\t" | ||
367 | "movq %1, %%mm6 \n\t" | ||
368 | ::"m"(red_16mask),"m"(green_16mask)); | ||
369 | ✗ | mm_end = end - 15; | |
370 | ✗ | while (s < mm_end) { | |
371 | ✗ | __asm__ volatile( | |
372 | PREFETCH" 32(%1) \n\t" | ||
373 | "movd (%1), %%mm0 \n\t" | ||
374 | "movd 4(%1), %%mm3 \n\t" | ||
375 | "punpckldq 8(%1), %%mm0 \n\t" | ||
376 | "punpckldq 12(%1), %%mm3 \n\t" | ||
377 | "movq %%mm0, %%mm1 \n\t" | ||
378 | "movq %%mm0, %%mm2 \n\t" | ||
379 | "movq %%mm3, %%mm4 \n\t" | ||
380 | "movq %%mm3, %%mm5 \n\t" | ||
381 | "psllq $8, %%mm0 \n\t" | ||
382 | "psllq $8, %%mm3 \n\t" | ||
383 | "pand %%mm7, %%mm0 \n\t" | ||
384 | "pand %%mm7, %%mm3 \n\t" | ||
385 | "psrlq $5, %%mm1 \n\t" | ||
386 | "psrlq $5, %%mm4 \n\t" | ||
387 | "pand %%mm6, %%mm1 \n\t" | ||
388 | "pand %%mm6, %%mm4 \n\t" | ||
389 | "psrlq $19, %%mm2 \n\t" | ||
390 | "psrlq $19, %%mm5 \n\t" | ||
391 | "pand %2, %%mm2 \n\t" | ||
392 | "pand %2, %%mm5 \n\t" | ||
393 | "por %%mm1, %%mm0 \n\t" | ||
394 | "por %%mm4, %%mm3 \n\t" | ||
395 | "por %%mm2, %%mm0 \n\t" | ||
396 | "por %%mm5, %%mm3 \n\t" | ||
397 | "psllq $16, %%mm3 \n\t" | ||
398 | "por %%mm3, %%mm0 \n\t" | ||
399 | MOVNTQ" %%mm0, (%0) \n\t" | ||
400 | :: "r"(d),"r"(s),"m"(blue_16mask):"memory"); | ||
401 | ✗ | d += 4; | |
402 | ✗ | s += 16; | |
403 | } | ||
404 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
405 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
406 | ✗ | while (s < end) { | |
407 | ✗ | register int rgb = *(const uint32_t*)s; s += 4; | |
408 | ✗ | *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19); | |
409 | } | ||
410 | ✗ | } | |
411 | |||
412 | ✗ | static inline void rgb32to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
413 | { | ||
414 | ✗ | const uint8_t *s = src; | |
415 | const uint8_t *end; | ||
416 | const uint8_t *mm_end; | ||
417 | ✗ | uint16_t *d = (uint16_t *)dst; | |
418 | ✗ | end = s + src_size; | |
419 | ✗ | mm_end = end - 15; | |
420 | ✗ | __asm__ volatile( | |
421 | "movq %3, %%mm5 \n\t" | ||
422 | "movq %4, %%mm6 \n\t" | ||
423 | "movq %5, %%mm7 \n\t" | ||
424 | "jmp 2f \n\t" | ||
425 | ".p2align 4 \n\t" | ||
426 | "1: \n\t" | ||
427 | PREFETCH" 32(%1) \n\t" | ||
428 | "movd (%1), %%mm0 \n\t" | ||
429 | "movd 4(%1), %%mm3 \n\t" | ||
430 | "punpckldq 8(%1), %%mm0 \n\t" | ||
431 | "punpckldq 12(%1), %%mm3 \n\t" | ||
432 | "movq %%mm0, %%mm1 \n\t" | ||
433 | "movq %%mm3, %%mm4 \n\t" | ||
434 | "pand %%mm6, %%mm0 \n\t" | ||
435 | "pand %%mm6, %%mm3 \n\t" | ||
436 | "pmaddwd %%mm7, %%mm0 \n\t" | ||
437 | "pmaddwd %%mm7, %%mm3 \n\t" | ||
438 | "pand %%mm5, %%mm1 \n\t" | ||
439 | "pand %%mm5, %%mm4 \n\t" | ||
440 | "por %%mm1, %%mm0 \n\t" | ||
441 | "por %%mm4, %%mm3 \n\t" | ||
442 | "psrld $6, %%mm0 \n\t" | ||
443 | "pslld $10, %%mm3 \n\t" | ||
444 | "por %%mm3, %%mm0 \n\t" | ||
445 | MOVNTQ" %%mm0, (%0) \n\t" | ||
446 | "add $16, %1 \n\t" | ||
447 | "add $8, %0 \n\t" | ||
448 | "2: \n\t" | ||
449 | "cmp %2, %1 \n\t" | ||
450 | " jb 1b \n\t" | ||
451 | : "+r" (d), "+r"(s) | ||
452 | : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) | ||
453 | ); | ||
454 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
455 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
456 | ✗ | while (s < end) { | |
457 | ✗ | register int rgb = *(const uint32_t*)s; s += 4; | |
458 | ✗ | *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9); | |
459 | } | ||
460 | ✗ | } | |
461 | |||
462 | ✗ | static inline void rgb32tobgr15_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
463 | { | ||
464 | ✗ | const uint8_t *s = src; | |
465 | const uint8_t *end; | ||
466 | const uint8_t *mm_end; | ||
467 | ✗ | uint16_t *d = (uint16_t *)dst; | |
468 | ✗ | end = s + src_size; | |
469 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); | |
470 | ✗ | __asm__ volatile( | |
471 | "movq %0, %%mm7 \n\t" | ||
472 | "movq %1, %%mm6 \n\t" | ||
473 | ::"m"(red_15mask),"m"(green_15mask)); | ||
474 | ✗ | mm_end = end - 15; | |
475 | ✗ | while (s < mm_end) { | |
476 | ✗ | __asm__ volatile( | |
477 | PREFETCH" 32(%1) \n\t" | ||
478 | "movd (%1), %%mm0 \n\t" | ||
479 | "movd 4(%1), %%mm3 \n\t" | ||
480 | "punpckldq 8(%1), %%mm0 \n\t" | ||
481 | "punpckldq 12(%1), %%mm3 \n\t" | ||
482 | "movq %%mm0, %%mm1 \n\t" | ||
483 | "movq %%mm0, %%mm2 \n\t" | ||
484 | "movq %%mm3, %%mm4 \n\t" | ||
485 | "movq %%mm3, %%mm5 \n\t" | ||
486 | "psllq $7, %%mm0 \n\t" | ||
487 | "psllq $7, %%mm3 \n\t" | ||
488 | "pand %%mm7, %%mm0 \n\t" | ||
489 | "pand %%mm7, %%mm3 \n\t" | ||
490 | "psrlq $6, %%mm1 \n\t" | ||
491 | "psrlq $6, %%mm4 \n\t" | ||
492 | "pand %%mm6, %%mm1 \n\t" | ||
493 | "pand %%mm6, %%mm4 \n\t" | ||
494 | "psrlq $19, %%mm2 \n\t" | ||
495 | "psrlq $19, %%mm5 \n\t" | ||
496 | "pand %2, %%mm2 \n\t" | ||
497 | "pand %2, %%mm5 \n\t" | ||
498 | "por %%mm1, %%mm0 \n\t" | ||
499 | "por %%mm4, %%mm3 \n\t" | ||
500 | "por %%mm2, %%mm0 \n\t" | ||
501 | "por %%mm5, %%mm3 \n\t" | ||
502 | "psllq $16, %%mm3 \n\t" | ||
503 | "por %%mm3, %%mm0 \n\t" | ||
504 | MOVNTQ" %%mm0, (%0) \n\t" | ||
505 | ::"r"(d),"r"(s),"m"(blue_15mask):"memory"); | ||
506 | ✗ | d += 4; | |
507 | ✗ | s += 16; | |
508 | } | ||
509 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
510 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
511 | ✗ | while (s < end) { | |
512 | ✗ | register int rgb = *(const uint32_t*)s; s += 4; | |
513 | ✗ | *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19); | |
514 | } | ||
515 | ✗ | } | |
516 | |||
517 | ✗ | static inline void rgb24tobgr16_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
518 | { | ||
519 | ✗ | const uint8_t *s = src; | |
520 | const uint8_t *end; | ||
521 | const uint8_t *mm_end; | ||
522 | ✗ | uint16_t *d = (uint16_t *)dst; | |
523 | ✗ | end = s + src_size; | |
524 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); | |
525 | ✗ | __asm__ volatile( | |
526 | "movq %0, %%mm7 \n\t" | ||
527 | "movq %1, %%mm6 \n\t" | ||
528 | ::"m"(red_16mask),"m"(green_16mask)); | ||
529 | ✗ | mm_end = end - 11; | |
530 | ✗ | while (s < mm_end) { | |
531 | ✗ | __asm__ volatile( | |
532 | PREFETCH" 32(%1) \n\t" | ||
533 | "movd (%1), %%mm0 \n\t" | ||
534 | "movd 3(%1), %%mm3 \n\t" | ||
535 | "punpckldq 6(%1), %%mm0 \n\t" | ||
536 | "punpckldq 9(%1), %%mm3 \n\t" | ||
537 | "movq %%mm0, %%mm1 \n\t" | ||
538 | "movq %%mm0, %%mm2 \n\t" | ||
539 | "movq %%mm3, %%mm4 \n\t" | ||
540 | "movq %%mm3, %%mm5 \n\t" | ||
541 | "psrlq $3, %%mm0 \n\t" | ||
542 | "psrlq $3, %%mm3 \n\t" | ||
543 | "pand %2, %%mm0 \n\t" | ||
544 | "pand %2, %%mm3 \n\t" | ||
545 | "psrlq $5, %%mm1 \n\t" | ||
546 | "psrlq $5, %%mm4 \n\t" | ||
547 | "pand %%mm6, %%mm1 \n\t" | ||
548 | "pand %%mm6, %%mm4 \n\t" | ||
549 | "psrlq $8, %%mm2 \n\t" | ||
550 | "psrlq $8, %%mm5 \n\t" | ||
551 | "pand %%mm7, %%mm2 \n\t" | ||
552 | "pand %%mm7, %%mm5 \n\t" | ||
553 | "por %%mm1, %%mm0 \n\t" | ||
554 | "por %%mm4, %%mm3 \n\t" | ||
555 | "por %%mm2, %%mm0 \n\t" | ||
556 | "por %%mm5, %%mm3 \n\t" | ||
557 | "psllq $16, %%mm3 \n\t" | ||
558 | "por %%mm3, %%mm0 \n\t" | ||
559 | MOVNTQ" %%mm0, (%0) \n\t" | ||
560 | ::"r"(d),"r"(s),"m"(blue_16mask):"memory"); | ||
561 | ✗ | d += 4; | |
562 | ✗ | s += 12; | |
563 | } | ||
564 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
565 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
566 | ✗ | while (s < end) { | |
567 | ✗ | const int b = *s++; | |
568 | ✗ | const int g = *s++; | |
569 | ✗ | const int r = *s++; | |
570 | ✗ | *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); | |
571 | } | ||
572 | ✗ | } | |
573 | |||
574 | ✗ | static inline void rgb24to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
575 | { | ||
576 | ✗ | const uint8_t *s = src; | |
577 | const uint8_t *end; | ||
578 | const uint8_t *mm_end; | ||
579 | ✗ | uint16_t *d = (uint16_t *)dst; | |
580 | ✗ | end = s + src_size; | |
581 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); | |
582 | ✗ | __asm__ volatile( | |
583 | "movq %0, %%mm7 \n\t" | ||
584 | "movq %1, %%mm6 \n\t" | ||
585 | ::"m"(red_16mask),"m"(green_16mask)); | ||
586 | ✗ | mm_end = end - 15; | |
587 | ✗ | while (s < mm_end) { | |
588 | ✗ | __asm__ volatile( | |
589 | PREFETCH" 32(%1) \n\t" | ||
590 | "movd (%1), %%mm0 \n\t" | ||
591 | "movd 3(%1), %%mm3 \n\t" | ||
592 | "punpckldq 6(%1), %%mm0 \n\t" | ||
593 | "punpckldq 9(%1), %%mm3 \n\t" | ||
594 | "movq %%mm0, %%mm1 \n\t" | ||
595 | "movq %%mm0, %%mm2 \n\t" | ||
596 | "movq %%mm3, %%mm4 \n\t" | ||
597 | "movq %%mm3, %%mm5 \n\t" | ||
598 | "psllq $8, %%mm0 \n\t" | ||
599 | "psllq $8, %%mm3 \n\t" | ||
600 | "pand %%mm7, %%mm0 \n\t" | ||
601 | "pand %%mm7, %%mm3 \n\t" | ||
602 | "psrlq $5, %%mm1 \n\t" | ||
603 | "psrlq $5, %%mm4 \n\t" | ||
604 | "pand %%mm6, %%mm1 \n\t" | ||
605 | "pand %%mm6, %%mm4 \n\t" | ||
606 | "psrlq $19, %%mm2 \n\t" | ||
607 | "psrlq $19, %%mm5 \n\t" | ||
608 | "pand %2, %%mm2 \n\t" | ||
609 | "pand %2, %%mm5 \n\t" | ||
610 | "por %%mm1, %%mm0 \n\t" | ||
611 | "por %%mm4, %%mm3 \n\t" | ||
612 | "por %%mm2, %%mm0 \n\t" | ||
613 | "por %%mm5, %%mm3 \n\t" | ||
614 | "psllq $16, %%mm3 \n\t" | ||
615 | "por %%mm3, %%mm0 \n\t" | ||
616 | MOVNTQ" %%mm0, (%0) \n\t" | ||
617 | ::"r"(d),"r"(s),"m"(blue_16mask):"memory"); | ||
618 | ✗ | d += 4; | |
619 | ✗ | s += 12; | |
620 | } | ||
621 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
622 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
623 | ✗ | while (s < end) { | |
624 | ✗ | const int r = *s++; | |
625 | ✗ | const int g = *s++; | |
626 | ✗ | const int b = *s++; | |
627 | ✗ | *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); | |
628 | } | ||
629 | ✗ | } | |
630 | |||
631 | ✗ | static inline void rgb24tobgr15_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
632 | { | ||
633 | ✗ | const uint8_t *s = src; | |
634 | const uint8_t *end; | ||
635 | const uint8_t *mm_end; | ||
636 | ✗ | uint16_t *d = (uint16_t *)dst; | |
637 | ✗ | end = s + src_size; | |
638 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); | |
639 | ✗ | __asm__ volatile( | |
640 | "movq %0, %%mm7 \n\t" | ||
641 | "movq %1, %%mm6 \n\t" | ||
642 | ::"m"(red_15mask),"m"(green_15mask)); | ||
643 | ✗ | mm_end = end - 11; | |
644 | ✗ | while (s < mm_end) { | |
645 | ✗ | __asm__ volatile( | |
646 | PREFETCH" 32(%1) \n\t" | ||
647 | "movd (%1), %%mm0 \n\t" | ||
648 | "movd 3(%1), %%mm3 \n\t" | ||
649 | "punpckldq 6(%1), %%mm0 \n\t" | ||
650 | "punpckldq 9(%1), %%mm3 \n\t" | ||
651 | "movq %%mm0, %%mm1 \n\t" | ||
652 | "movq %%mm0, %%mm2 \n\t" | ||
653 | "movq %%mm3, %%mm4 \n\t" | ||
654 | "movq %%mm3, %%mm5 \n\t" | ||
655 | "psrlq $3, %%mm0 \n\t" | ||
656 | "psrlq $3, %%mm3 \n\t" | ||
657 | "pand %2, %%mm0 \n\t" | ||
658 | "pand %2, %%mm3 \n\t" | ||
659 | "psrlq $6, %%mm1 \n\t" | ||
660 | "psrlq $6, %%mm4 \n\t" | ||
661 | "pand %%mm6, %%mm1 \n\t" | ||
662 | "pand %%mm6, %%mm4 \n\t" | ||
663 | "psrlq $9, %%mm2 \n\t" | ||
664 | "psrlq $9, %%mm5 \n\t" | ||
665 | "pand %%mm7, %%mm2 \n\t" | ||
666 | "pand %%mm7, %%mm5 \n\t" | ||
667 | "por %%mm1, %%mm0 \n\t" | ||
668 | "por %%mm4, %%mm3 \n\t" | ||
669 | "por %%mm2, %%mm0 \n\t" | ||
670 | "por %%mm5, %%mm3 \n\t" | ||
671 | "psllq $16, %%mm3 \n\t" | ||
672 | "por %%mm3, %%mm0 \n\t" | ||
673 | MOVNTQ" %%mm0, (%0) \n\t" | ||
674 | ::"r"(d),"r"(s),"m"(blue_15mask):"memory"); | ||
675 | ✗ | d += 4; | |
676 | ✗ | s += 12; | |
677 | } | ||
678 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
679 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
680 | ✗ | while (s < end) { | |
681 | ✗ | const int b = *s++; | |
682 | ✗ | const int g = *s++; | |
683 | ✗ | const int r = *s++; | |
684 | ✗ | *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); | |
685 | } | ||
686 | ✗ | } | |
687 | |||
688 | ✗ | static inline void rgb24to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
689 | { | ||
690 | ✗ | const uint8_t *s = src; | |
691 | const uint8_t *end; | ||
692 | const uint8_t *mm_end; | ||
693 | ✗ | uint16_t *d = (uint16_t *)dst; | |
694 | ✗ | end = s + src_size; | |
695 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); | |
696 | ✗ | __asm__ volatile( | |
697 | "movq %0, %%mm7 \n\t" | ||
698 | "movq %1, %%mm6 \n\t" | ||
699 | ::"m"(red_15mask),"m"(green_15mask)); | ||
700 | ✗ | mm_end = end - 15; | |
701 | ✗ | while (s < mm_end) { | |
702 | ✗ | __asm__ volatile( | |
703 | PREFETCH" 32(%1) \n\t" | ||
704 | "movd (%1), %%mm0 \n\t" | ||
705 | "movd 3(%1), %%mm3 \n\t" | ||
706 | "punpckldq 6(%1), %%mm0 \n\t" | ||
707 | "punpckldq 9(%1), %%mm3 \n\t" | ||
708 | "movq %%mm0, %%mm1 \n\t" | ||
709 | "movq %%mm0, %%mm2 \n\t" | ||
710 | "movq %%mm3, %%mm4 \n\t" | ||
711 | "movq %%mm3, %%mm5 \n\t" | ||
712 | "psllq $7, %%mm0 \n\t" | ||
713 | "psllq $7, %%mm3 \n\t" | ||
714 | "pand %%mm7, %%mm0 \n\t" | ||
715 | "pand %%mm7, %%mm3 \n\t" | ||
716 | "psrlq $6, %%mm1 \n\t" | ||
717 | "psrlq $6, %%mm4 \n\t" | ||
718 | "pand %%mm6, %%mm1 \n\t" | ||
719 | "pand %%mm6, %%mm4 \n\t" | ||
720 | "psrlq $19, %%mm2 \n\t" | ||
721 | "psrlq $19, %%mm5 \n\t" | ||
722 | "pand %2, %%mm2 \n\t" | ||
723 | "pand %2, %%mm5 \n\t" | ||
724 | "por %%mm1, %%mm0 \n\t" | ||
725 | "por %%mm4, %%mm3 \n\t" | ||
726 | "por %%mm2, %%mm0 \n\t" | ||
727 | "por %%mm5, %%mm3 \n\t" | ||
728 | "psllq $16, %%mm3 \n\t" | ||
729 | "por %%mm3, %%mm0 \n\t" | ||
730 | MOVNTQ" %%mm0, (%0) \n\t" | ||
731 | ::"r"(d),"r"(s),"m"(blue_15mask):"memory"); | ||
732 | ✗ | d += 4; | |
733 | ✗ | s += 12; | |
734 | } | ||
735 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
736 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
737 | ✗ | while (s < end) { | |
738 | ✗ | const int r = *s++; | |
739 | ✗ | const int g = *s++; | |
740 | ✗ | const int b = *s++; | |
741 | ✗ | *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); | |
742 | } | ||
743 | ✗ | } | |
744 | |||
745 | ✗ | static inline void rgb15tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
746 | { | ||
747 | const uint16_t *end; | ||
748 | const uint16_t *mm_end; | ||
749 | ✗ | uint8_t *d = dst; | |
750 | ✗ | const uint16_t *s = (const uint16_t*)src; | |
751 | ✗ | end = s + src_size/2; | |
752 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); | |
753 | ✗ | mm_end = end - 7; | |
754 | ✗ | while (s < mm_end) { | |
755 | ✗ | __asm__ volatile( | |
756 | PREFETCH" 32(%1) \n\t" | ||
757 | "movq (%1), %%mm0 \n\t" | ||
758 | "movq (%1), %%mm1 \n\t" | ||
759 | "movq (%1), %%mm2 \n\t" | ||
760 | "pand %2, %%mm0 \n\t" | ||
761 | "pand %3, %%mm1 \n\t" | ||
762 | "pand %4, %%mm2 \n\t" | ||
763 | "psllq $5, %%mm0 \n\t" | ||
764 | "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" | ||
765 | "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t" | ||
766 | "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" | ||
767 | "movq %%mm0, %%mm3 \n\t" | ||
768 | "movq %%mm1, %%mm4 \n\t" | ||
769 | "movq %%mm2, %%mm5 \n\t" | ||
770 | "punpcklwd %5, %%mm0 \n\t" | ||
771 | "punpcklwd %5, %%mm1 \n\t" | ||
772 | "punpcklwd %5, %%mm2 \n\t" | ||
773 | "punpckhwd %5, %%mm3 \n\t" | ||
774 | "punpckhwd %5, %%mm4 \n\t" | ||
775 | "punpckhwd %5, %%mm5 \n\t" | ||
776 | "psllq $8, %%mm1 \n\t" | ||
777 | "psllq $16, %%mm2 \n\t" | ||
778 | "por %%mm1, %%mm0 \n\t" | ||
779 | "por %%mm2, %%mm0 \n\t" | ||
780 | "psllq $8, %%mm4 \n\t" | ||
781 | "psllq $16, %%mm5 \n\t" | ||
782 | "por %%mm4, %%mm3 \n\t" | ||
783 | "por %%mm5, %%mm3 \n\t" | ||
784 | |||
785 | "movq %%mm0, %%mm6 \n\t" | ||
786 | "movq %%mm3, %%mm7 \n\t" | ||
787 | |||
788 | "movq 8(%1), %%mm0 \n\t" | ||
789 | "movq 8(%1), %%mm1 \n\t" | ||
790 | "movq 8(%1), %%mm2 \n\t" | ||
791 | "pand %2, %%mm0 \n\t" | ||
792 | "pand %3, %%mm1 \n\t" | ||
793 | "pand %4, %%mm2 \n\t" | ||
794 | "psllq $5, %%mm0 \n\t" | ||
795 | "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" | ||
796 | "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t" | ||
797 | "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" | ||
798 | "movq %%mm0, %%mm3 \n\t" | ||
799 | "movq %%mm1, %%mm4 \n\t" | ||
800 | "movq %%mm2, %%mm5 \n\t" | ||
801 | "punpcklwd %5, %%mm0 \n\t" | ||
802 | "punpcklwd %5, %%mm1 \n\t" | ||
803 | "punpcklwd %5, %%mm2 \n\t" | ||
804 | "punpckhwd %5, %%mm3 \n\t" | ||
805 | "punpckhwd %5, %%mm4 \n\t" | ||
806 | "punpckhwd %5, %%mm5 \n\t" | ||
807 | "psllq $8, %%mm1 \n\t" | ||
808 | "psllq $16, %%mm2 \n\t" | ||
809 | "por %%mm1, %%mm0 \n\t" | ||
810 | "por %%mm2, %%mm0 \n\t" | ||
811 | "psllq $8, %%mm4 \n\t" | ||
812 | "psllq $16, %%mm5 \n\t" | ||
813 | "por %%mm4, %%mm3 \n\t" | ||
814 | "por %%mm5, %%mm3 \n\t" | ||
815 | |||
816 | :"=m"(*d) | ||
817 | :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null) | ||
818 | NAMED_CONSTRAINTS_ADD(mul15_mid,mul15_hi) | ||
819 | :"memory"); | ||
820 | /* reuse the 32-to-24 repacking (STORE_BGR24_MMX) */ | ||
821 | ✗ | __asm__ volatile( | |
822 | "movq %%mm0, %%mm4 \n\t" | ||
823 | "movq %%mm3, %%mm5 \n\t" | ||
824 | "movq %%mm6, %%mm0 \n\t" | ||
825 | "movq %%mm7, %%mm1 \n\t" | ||
826 | |||
827 | "movq %%mm4, %%mm6 \n\t" | ||
828 | "movq %%mm5, %%mm7 \n\t" | ||
829 | "movq %%mm0, %%mm2 \n\t" | ||
830 | "movq %%mm1, %%mm3 \n\t" | ||
831 | |||
832 | STORE_BGR24_MMX | ||
833 | |||
834 | :: "r"(d), "m"(*s) | ||
835 | NAMED_CONSTRAINTS_ADD(mask24l,mask24h) | ||
836 | :"memory"); | ||
837 | ✗ | d += 24; | |
838 | ✗ | s += 8; | |
839 | } | ||
840 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
841 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
842 | ✗ | while (s < end) { | |
843 | register uint16_t bgr; | ||
844 | ✗ | bgr = *s++; | |
845 | ✗ | *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); | |
846 | ✗ | *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7); | |
847 | ✗ | *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12); | |
848 | } | ||
849 | ✗ | } | |
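
The scalar tail widens each 5-bit channel by bit replication, `(v << 3) | (v >> 2)`, so 0 maps to 0 and 31 maps to 255. The `pmulhw` constants implement the same thing as a fixed-point multiply by 8.25 (`mul15_mid` = `0x4200`, `mul15_hi` = `0x0210`); the 6-bit green analogue used in the 565 paths is `(v << 2) | (v >> 4)` via `mul16_mid` = `0x2080`. A one-function sketch:

```c
#include <stdint.h>

/* Widen a 5-bit channel to 8 bits by bit replication: equals
 * floor(v * 8.25), which is what the pmulhw constants compute. */
static uint8_t expand5(uint8_t v)
{
    return (uint8_t)((v << 3) | (v >> 2));
}
```
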
850 | |||
851 | ✗ | static inline void rgb16tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
852 | { | ||
853 | const uint16_t *end; | ||
854 | const uint16_t *mm_end; | ||
855 | ✗ | uint8_t *d = (uint8_t *)dst; | |
856 | ✗ | const uint16_t *s = (const uint16_t *)src; | |
857 | ✗ | end = s + src_size/2; | |
858 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); | |
859 | ✗ | mm_end = end - 7; | |
860 | ✗ | while (s < mm_end) { | |
861 | ✗ | __asm__ volatile( | |
862 | PREFETCH" 32(%1) \n\t" | ||
863 | "movq (%1), %%mm0 \n\t" | ||
864 | "movq (%1), %%mm1 \n\t" | ||
865 | "movq (%1), %%mm2 \n\t" | ||
866 | "pand %2, %%mm0 \n\t" | ||
867 | "pand %3, %%mm1 \n\t" | ||
868 | "pand %4, %%mm2 \n\t" | ||
869 | "psllq $5, %%mm0 \n\t" | ||
870 | "psrlq $1, %%mm2 \n\t" | ||
871 | "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" | ||
872 | "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t" | ||
873 | "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" | ||
874 | "movq %%mm0, %%mm3 \n\t" | ||
875 | "movq %%mm1, %%mm4 \n\t" | ||
876 | "movq %%mm2, %%mm5 \n\t" | ||
877 | "punpcklwd %5, %%mm0 \n\t" | ||
878 | "punpcklwd %5, %%mm1 \n\t" | ||
879 | "punpcklwd %5, %%mm2 \n\t" | ||
880 | "punpckhwd %5, %%mm3 \n\t" | ||
881 | "punpckhwd %5, %%mm4 \n\t" | ||
882 | "punpckhwd %5, %%mm5 \n\t" | ||
883 | "psllq $8, %%mm1 \n\t" | ||
884 | "psllq $16, %%mm2 \n\t" | ||
885 | "por %%mm1, %%mm0 \n\t" | ||
886 | "por %%mm2, %%mm0 \n\t" | ||
887 | "psllq $8, %%mm4 \n\t" | ||
888 | "psllq $16, %%mm5 \n\t" | ||
889 | "por %%mm4, %%mm3 \n\t" | ||
890 | "por %%mm5, %%mm3 \n\t" | ||
891 | |||
892 | "movq %%mm0, %%mm6 \n\t" | ||
893 | "movq %%mm3, %%mm7 \n\t" | ||
894 | |||
895 | "movq 8(%1), %%mm0 \n\t" | ||
896 | "movq 8(%1), %%mm1 \n\t" | ||
897 | "movq 8(%1), %%mm2 \n\t" | ||
898 | "pand %2, %%mm0 \n\t" | ||
899 | "pand %3, %%mm1 \n\t" | ||
900 | "pand %4, %%mm2 \n\t" | ||
901 | "psllq $5, %%mm0 \n\t" | ||
902 | "psrlq $1, %%mm2 \n\t" | ||
903 | "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" | ||
904 | "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t" | ||
905 | "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" | ||
906 | "movq %%mm0, %%mm3 \n\t" | ||
907 | "movq %%mm1, %%mm4 \n\t" | ||
908 | "movq %%mm2, %%mm5 \n\t" | ||
909 | "punpcklwd %5, %%mm0 \n\t" | ||
910 | "punpcklwd %5, %%mm1 \n\t" | ||
911 | "punpcklwd %5, %%mm2 \n\t" | ||
912 | "punpckhwd %5, %%mm3 \n\t" | ||
913 | "punpckhwd %5, %%mm4 \n\t" | ||
914 | "punpckhwd %5, %%mm5 \n\t" | ||
915 | "psllq $8, %%mm1 \n\t" | ||
916 | "psllq $16, %%mm2 \n\t" | ||
917 | "por %%mm1, %%mm0 \n\t" | ||
918 | "por %%mm2, %%mm0 \n\t" | ||
919 | "psllq $8, %%mm4 \n\t" | ||
920 | "psllq $16, %%mm5 \n\t" | ||
921 | "por %%mm4, %%mm3 \n\t" | ||
922 | "por %%mm5, %%mm3 \n\t" | ||
923 | :"=m"(*d) | ||
924 | :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null) | ||
925 | NAMED_CONSTRAINTS_ADD(mul15_mid,mul16_mid,mul15_hi) | ||
926 | :"memory"); | ||
927 | /* reuse the 32-to-24 repacking (STORE_BGR24_MMX) */ | ||
928 | ✗ | __asm__ volatile( | |
929 | "movq %%mm0, %%mm4 \n\t" | ||
930 | "movq %%mm3, %%mm5 \n\t" | ||
931 | "movq %%mm6, %%mm0 \n\t" | ||
932 | "movq %%mm7, %%mm1 \n\t" | ||
933 | |||
934 | "movq %%mm4, %%mm6 \n\t" | ||
935 | "movq %%mm5, %%mm7 \n\t" | ||
936 | "movq %%mm0, %%mm2 \n\t" | ||
937 | "movq %%mm1, %%mm3 \n\t" | ||
938 | |||
939 | STORE_BGR24_MMX | ||
940 | |||
941 | :: "r"(d), "m"(*s) | ||
942 | NAMED_CONSTRAINTS_ADD(mask24l,mask24h) | ||
943 | :"memory"); | ||
944 | ✗ | d += 24; | |
945 | ✗ | s += 8; | |
946 | } | ||
947 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
948 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
949 | ✗ | while (s < end) { | |
950 | register uint16_t bgr; | ||
951 | ✗ | bgr = *s++; | |
952 | ✗ | *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); | |
953 | ✗ | *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9); | |
954 | ✗ | *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13); | |
955 | } | ||
956 | ✗ | } | |
957 | |||
958 | /* | ||
959 | * mm0 = 00 B3 00 B2 00 B1 00 B0 | ||
960 | * mm1 = 00 G3 00 G2 00 G1 00 G0 | ||
961 | * mm2 = 00 R3 00 R2 00 R1 00 R0 | ||
962 | * mm6 = FF FF FF FF FF FF FF FF | ||
963 | * mm7 = 00 00 00 00 00 00 00 00 | ||
964 | */ | ||
965 | #define PACK_RGB32 \ | ||
966 | "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \ | ||
967 | "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \ | ||
968 | "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \ | ||
969 | "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \ | ||
970 | "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \ | ||
971 | "movq %%mm0, %%mm3 \n\t" \ | ||
972 | "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \ | ||
973 | "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \ | ||
974 | MOVNTQ" %%mm0, (%0) \n\t" \ | ||
975 | MOVNTQ" %%mm3, 8(%0) \n\t" \ | ||
976 | |||
977 | ✗ | static inline void rgb15to32_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
978 | { | ||
979 | const uint16_t *end; | ||
980 | const uint16_t *mm_end; | ||
981 | ✗ | uint8_t *d = dst; | |
982 | ✗ | const uint16_t *s = (const uint16_t *)src; | |
983 | ✗ | end = s + src_size/2; | |
984 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); | |
985 | ✗ | __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); | |
986 | ✗ | __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); | |
987 | ✗ | mm_end = end - 3; | |
988 | ✗ | while (s < mm_end) { | |
989 | ✗ | __asm__ volatile( | |
990 | PREFETCH" 32(%1) \n\t" | ||
991 | "movq (%1), %%mm0 \n\t" | ||
992 | "movq (%1), %%mm1 \n\t" | ||
993 | "movq (%1), %%mm2 \n\t" | ||
994 | "pand %2, %%mm0 \n\t" | ||
995 | "pand %3, %%mm1 \n\t" | ||
996 | "pand %4, %%mm2 \n\t" | ||
997 | "psllq $5, %%mm0 \n\t" | ||
998 | "pmulhw %5, %%mm0 \n\t" | ||
999 | "pmulhw %5, %%mm1 \n\t" | ||
1000 | "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" | ||
1001 | PACK_RGB32 | ||
1002 | ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid) | ||
1003 | NAMED_CONSTRAINTS_ADD(mul15_hi) | ||
1004 | :"memory"); | ||
1005 | ✗ | d += 16; | |
1006 | ✗ | s += 4; | |
1007 | } | ||
1008 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
1009 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
1010 | ✗ | while (s < end) { | |
1011 | register uint16_t bgr; | ||
1012 | ✗ | bgr = *s++; | |
1013 | ✗ | *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); | |
1014 | ✗ | *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7); | |
1015 | ✗ | *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12); | |
1016 | ✗ | *d++ = 255; | |
1017 | } | ||
1018 | ✗ | } | |
1019 | |||
1020 | ✗ | static inline void rgb16to32_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
1021 | { | ||
1022 | const uint16_t *end; | ||
1023 | const uint16_t *mm_end; | ||
1024 | ✗ | uint8_t *d = dst; | |
1025 | ✗ | const uint16_t *s = (const uint16_t*)src; | |
1026 | ✗ | end = s + src_size/2; | |
1027 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); | |
1028 | ✗ | __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); | |
1029 | ✗ | __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); | |
1030 | ✗ | mm_end = end - 3; | |
1031 | ✗ | while (s < mm_end) { | |
1032 | ✗ | __asm__ volatile( | |
1033 | PREFETCH" 32(%1) \n\t" | ||
1034 | "movq (%1), %%mm0 \n\t" | ||
1035 | "movq (%1), %%mm1 \n\t" | ||
1036 | "movq (%1), %%mm2 \n\t" | ||
1037 | "pand %2, %%mm0 \n\t" | ||
1038 | "pand %3, %%mm1 \n\t" | ||
1039 | "pand %4, %%mm2 \n\t" | ||
1040 | "psllq $5, %%mm0 \n\t" | ||
1041 | "psrlq $1, %%mm2 \n\t" | ||
1042 | "pmulhw %5, %%mm0 \n\t" | ||
1043 | "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t" | ||
1044 | "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" | ||
1045 | PACK_RGB32 | ||
1046 | ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid) | ||
1047 | NAMED_CONSTRAINTS_ADD(mul16_mid,mul15_hi) | ||
1048 | :"memory"); | ||
1049 | ✗ | d += 16; | |
1050 | ✗ | s += 4; | |
1051 | } | ||
1052 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
1053 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
1054 | ✗ | while (s < end) { | |
1055 | register uint16_t bgr; | ||
1056 | ✗ | bgr = *s++; | |
1057 | ✗ | *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); | |
1058 | ✗ | *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9); | |
1059 | ✗ | *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13); | |
1060 | ✗ | *d++ = 255; | |
1061 | } | ||
1062 | ✗ | } | |
1063 | |||
1064 | ✗ | static inline void rgb24tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
1065 | { | ||
1066 | ✗ | x86_reg mmx_size= 23 - src_size; | |
1067 | ✗ | __asm__ volatile ( | |
1068 | "test %%"FF_REG_a", %%"FF_REG_a" \n\t" | ||
1069 | "jns 2f \n\t" | ||
1070 | "movq "MANGLE(mask24r)", %%mm5 \n\t" | ||
1071 | "movq "MANGLE(mask24g)", %%mm6 \n\t" | ||
1072 | "movq "MANGLE(mask24b)", %%mm7 \n\t" | ||
1073 | ".p2align 4 \n\t" | ||
1074 | "1: \n\t" | ||
1075 | PREFETCH" 32(%1, %%"FF_REG_a") \n\t" | ||
1076 | "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" // BGR BGR BG | ||
1077 | "movq (%1, %%"FF_REG_a"), %%mm1 \n\t" // BGR BGR BG | ||
1078 | "movq 2(%1, %%"FF_REG_a"), %%mm2 \n\t" // R BGR BGR B | ||
1079 | "psllq $16, %%mm0 \n\t" // 00 BGR BGR | ||
1080 | "pand %%mm5, %%mm0 \n\t" | ||
1081 | "pand %%mm6, %%mm1 \n\t" | ||
1082 | "pand %%mm7, %%mm2 \n\t" | ||
1083 | "por %%mm0, %%mm1 \n\t" | ||
1084 | "por %%mm2, %%mm1 \n\t" | ||
1085 | "movq 6(%1, %%"FF_REG_a"), %%mm0 \n\t" // BGR BGR BG | ||
1086 | MOVNTQ" %%mm1,(%2, %%"FF_REG_a") \n\t" // RGB RGB RG | ||
1087 | "movq 8(%1, %%"FF_REG_a"), %%mm1 \n\t" // R BGR BGR B | ||
1088 | "movq 10(%1, %%"FF_REG_a"), %%mm2 \n\t" // GR BGR BGR | ||
1089 | "pand %%mm7, %%mm0 \n\t" | ||
1090 | "pand %%mm5, %%mm1 \n\t" | ||
1091 | "pand %%mm6, %%mm2 \n\t" | ||
1092 | "por %%mm0, %%mm1 \n\t" | ||
1093 | "por %%mm2, %%mm1 \n\t" | ||
1094 | "movq 14(%1, %%"FF_REG_a"), %%mm0 \n\t" // R BGR BGR B | ||
1095 | MOVNTQ" %%mm1, 8(%2, %%"FF_REG_a")\n\t" // B RGB RGB R | ||
1096 | "movq 16(%1, %%"FF_REG_a"), %%mm1 \n\t" // GR BGR BGR | ||
1097 | "movq 18(%1, %%"FF_REG_a"), %%mm2 \n\t" // BGR BGR BG | ||
1098 | "pand %%mm6, %%mm0 \n\t" | ||
1099 | "pand %%mm7, %%mm1 \n\t" | ||
1100 | "pand %%mm5, %%mm2 \n\t" | ||
1101 | "por %%mm0, %%mm1 \n\t" | ||
1102 | "por %%mm2, %%mm1 \n\t" | ||
1103 | MOVNTQ" %%mm1, 16(%2, %%"FF_REG_a") \n\t" | ||
1104 | "add $24, %%"FF_REG_a" \n\t" | ||
1105 | " js 1b \n\t" | ||
1106 | "2: \n\t" | ||
1107 | : "+a" (mmx_size) | ||
1108 | ✗ | : "r" (src-mmx_size), "r"(dst-mmx_size) | |
1109 | NAMED_CONSTRAINTS_ADD(mask24r,mask24g,mask24b) | ||
1110 | ); | ||
1111 | |||
1112 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
1113 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
1114 | |||
1115 | ✗ | if (mmx_size==23) return; //finished, was multiple of 8 | |
1116 | |||
1117 | ✗ | src+= src_size; | |
1118 | ✗ | dst+= src_size; | |
1119 | ✗ | src_size= 23-mmx_size; | |
1120 | ✗ | src-= src_size; | |
1121 | ✗ | dst-= src_size; | |
1122 | ✗ | for (unsigned i = 0; i < src_size; i +=3) { | |
1123 | register uint8_t x; | ||
1124 | ✗ | x = src[i + 2]; | |
1125 | ✗ | dst[i + 1] = src[i + 1]; | |
1126 | ✗ | dst[i + 2] = src[i + 0]; | |
1127 | ✗ | dst[i + 0] = x; | |
1128 | } | ||
1129 | } | ||
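
`rgb24tobgr24_mmxext` drives its main loop with the negative-index idiom: `mmx_size = 23 - src_size` is negative for any sizable buffer, both pointers are biased by `-mmx_size`, and the counter climbs from that negative value toward zero so a single `add $24` / `js 1b` pair serves as both increment and loop test. A C sketch of the same control flow (the names and the inner byte copy are illustrative; assumes `size` is a multiple of 24):

```c
#include <stddef.h>

/* Negative-index loop idiom: bias the base pointers past the region and
 * count an index up from -size to 0, so the sign test ends the loop. */
static void copy_24byte_blocks(const unsigned char *src, unsigned char *dst,
                               ptrdiff_t size)
{
    const unsigned char *s_end = src + size;  /* biased bases */
    unsigned char       *d_end = dst + size;
    for (ptrdiff_t i = -size; i < 0; i += 24)  /* "add $24; js 1b" */
        for (int k = 0; k < 24; k++)           /* one 24-byte chunk */
            d_end[i + k] = s_end[i + k];
}
```

The payoff on x86 is one register saved and one flag-setting instruction per iteration instead of a separate compare.
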
1130 | |||
1131 | ✗ | static inline void yuvPlanartoyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
1132 | int width, int height, | ||
1133 | int lumStride, int chromStride, int dstStride, int vertLumPerChroma) | ||
1134 | { | ||
1135 | ✗ | const x86_reg chromWidth= width>>1; | |
1136 | ✗ | for (int y = 0; y < height; y++) { | |
1137 | //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway) | ||
1138 | ✗ | __asm__ volatile( | |
1139 | "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" | ||
1140 | ".p2align 4 \n\t" | ||
1141 | "1: \n\t" | ||
1142 | PREFETCH" 32(%1, %%"FF_REG_a", 2) \n\t" | ||
1143 | PREFETCH" 32(%2, %%"FF_REG_a") \n\t" | ||
1144 | PREFETCH" 32(%3, %%"FF_REG_a") \n\t" | ||
1145 | "movq (%2, %%"FF_REG_a"), %%mm0 \n\t" // U(0) | ||
1146 | "movq %%mm0, %%mm2 \n\t" // U(0) | ||
1147 | "movq (%3, %%"FF_REG_a"), %%mm1 \n\t" // V(0) | ||
1148 | "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | ||
1149 | "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) | ||
1150 | |||
1151 | "movq (%1, %%"FF_REG_a",2), %%mm3 \n\t" // Y(0) | ||
1152 | "movq 8(%1, %%"FF_REG_a",2), %%mm5 \n\t" // Y(8) | ||
1153 | "movq %%mm3, %%mm4 \n\t" // Y(0) | ||
1154 | "movq %%mm5, %%mm6 \n\t" // Y(8) | ||
1155 | "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0) | ||
1156 | "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4) | ||
1157 | "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8) | ||
1158 | "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12) | ||
1159 | |||
1160 | MOVNTQ" %%mm3, (%0, %%"FF_REG_a", 4) \n\t" | ||
1161 | MOVNTQ" %%mm4, 8(%0, %%"FF_REG_a", 4) \n\t" | ||
1162 | MOVNTQ" %%mm5, 16(%0, %%"FF_REG_a", 4) \n\t" | ||
1163 | MOVNTQ" %%mm6, 24(%0, %%"FF_REG_a", 4) \n\t" | ||
1164 | |||
1165 | "add $8, %%"FF_REG_a" \n\t" | ||
1166 | "cmp %4, %%"FF_REG_a" \n\t" | ||
1167 | " jb 1b \n\t" | ||
1168 | ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) | ||
1169 | : "%"FF_REG_a | ||
1170 | ); | ||
1171 | ✗ | if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) { | |
1172 | ✗ | usrc += chromStride; | |
1173 | ✗ | vsrc += chromStride; | |
1174 | } | ||
1175 | ✗ | ysrc += lumStride; | |
1176 | ✗ | dst += dstStride; | |
1177 | } | ||
1178 | ✗ | __asm__(EMMS" \n\t" | |
1179 | SFENCE" \n\t" | ||
1180 | :::"memory"); | ||
1181 | ✗ | } | |
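
Each iteration of the loop above interleaves 16 luma bytes with 8 U and 8 V bytes into YUYV order: `punpcklbw`/`punpckhbw` first zip U with V, then zip luma with the UV stream. A scalar reference for one line, with the name and loop shape as illustrative assumptions:

```c
#include <stdint.h>

/* Hypothetical scalar reference: pack one planar line into YUY2 (YUYV);
 * each pair of luma samples shares one U and one V sample. */
static void pack_yuyv_line(const uint8_t *y, const uint8_t *u,
                           const uint8_t *v, uint8_t *dst, int width)
{
    for (int x = 0; x < width / 2; x++) {
        dst[4*x + 0] = y[2*x];
        dst[4*x + 1] = u[x];
        dst[4*x + 2] = y[2*x + 1];
        dst[4*x + 3] = v[x];
    }
}
```

`vertLumPerChroma` then controls how many consecutive luma lines reuse the same chroma line: 2 for 4:2:0 input (yv12toyuy2), 1 for 4:2:2 (yuv422ptoyuy2).
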
1182 | |||
1183 | /** | ||
1184 | * Height should be a multiple of 2 and width should be a multiple of 16. | ||
1185 | * (If this is a problem for anyone then tell me, and I will fix it.) | ||
1186 | */ | ||
1187 | ✗ | static inline void yv12toyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
1188 | int width, int height, | ||
1189 | int lumStride, int chromStride, int dstStride) | ||
1190 | { | ||
1191 | //FIXME interpolate chroma | ||
1192 | ✗ | yuvPlanartoyuy2_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); | |
1193 | ✗ | } | |
1194 | |||
1195 | ✗ | static inline void yuvPlanartouyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
1196 | int width, int height, | ||
1197 | int lumStride, int chromStride, int dstStride, int vertLumPerChroma) | ||
1198 | { | ||
1199 | ✗ | const x86_reg chromWidth= width>>1; | |
1200 | ✗ | for (int y = 0; y < height; y++) { | |
1201 | //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway) | ||
1202 | ✗ | __asm__ volatile( | |
1203 | "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" | ||
1204 | ".p2align 4 \n\t" | ||
1205 | "1: \n\t" | ||
1206 | PREFETCH" 32(%1, %%"FF_REG_a", 2) \n\t" | ||
1207 | PREFETCH" 32(%2, %%"FF_REG_a") \n\t" | ||
1208 | PREFETCH" 32(%3, %%"FF_REG_a") \n\t" | ||
1209 | "movq (%2, %%"FF_REG_a"), %%mm0 \n\t" // U(0) | ||
1210 | "movq %%mm0, %%mm2 \n\t" // U(0) | ||
1211 | "movq (%3, %%"FF_REG_a"), %%mm1 \n\t" // V(0) | ||
1212 | "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | ||
1213 | "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) | ||
1214 | |||
1215 | "movq (%1, %%"FF_REG_a",2), %%mm3 \n\t" // Y(0) | ||
1216 | "movq 8(%1, %%"FF_REG_a",2), %%mm5 \n\t" // Y(8) | ||
1217 | "movq %%mm0, %%mm4 \n\t" // Y(0) | ||
1218 | "movq %%mm2, %%mm6 \n\t" // Y(8) | ||
1219 | "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0) | ||
1220 | "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4) | ||
1221 | "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8) | ||
1222 | "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12) | ||
1223 | |||
1224 | MOVNTQ" %%mm0, (%0, %%"FF_REG_a", 4) \n\t" | ||
1225 | MOVNTQ" %%mm4, 8(%0, %%"FF_REG_a", 4) \n\t" | ||
1226 | MOVNTQ" %%mm2, 16(%0, %%"FF_REG_a", 4) \n\t" | ||
1227 | MOVNTQ" %%mm6, 24(%0, %%"FF_REG_a", 4) \n\t" | ||
1228 | |||
1229 | "add $8, %%"FF_REG_a" \n\t" | ||
1230 | "cmp %4, %%"FF_REG_a" \n\t" | ||
1231 | " jb 1b \n\t" | ||
1232 | ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) | ||
1233 | : "%"FF_REG_a | ||
1234 | ); | ||
1235 | ✗ | if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) { | |
1236 | ✗ | usrc += chromStride; | |
1237 | ✗ | vsrc += chromStride; | |
1238 | } | ||
1239 | ✗ | ysrc += lumStride; | |
1240 | ✗ | dst += dstStride; | |
1241 | } | ||
1242 | ✗ | __asm__(EMMS" \n\t" | |
1243 | SFENCE" \n\t" | ||
1244 | :::"memory"); | ||
1245 | ✗ | } | |
1246 | |||
1247 | /** | ||
1248 | * Height should be a multiple of 2 and width should be a multiple of 16. | ||
1249 | * (If this is a problem for anyone then tell me, and I will fix it.) | ||
1250 | */ | ||
1251 | ✗ | static inline void yv12touyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
1252 | int width, int height, | ||
1253 | int lumStride, int chromStride, int dstStride) | ||
1254 | { | ||
1255 | //FIXME interpolate chroma | ||
1256 | ✗ | yuvPlanartouyvy_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); | |
1257 | ✗ | } | |
1258 | |||
1259 | /** | ||
1260 | * Width should be a multiple of 16. | ||
1261 | */ | ||
1262 | ✗ | static inline void yuv422ptouyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
1263 | int width, int height, | ||
1264 | int lumStride, int chromStride, int dstStride) | ||
1265 | { | ||
1266 | ✗ | yuvPlanartouyvy_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1); | |
1267 | ✗ | } | |
1268 | |||
1269 | /** | ||
1270 | * Width should be a multiple of 16. | ||
1271 | */ | ||
1272 | ✗ | static inline void yuv422ptoyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
1273 | int width, int height, | ||
1274 | int lumStride, int chromStride, int dstStride) | ||
1275 | { | ||
1276 | ✗ | yuvPlanartoyuy2_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1); | |
1277 | ✗ | } | |
1278 | |||
1279 | /** | ||
1280 | * Height should be a multiple of 2 and width should be a multiple of 16. | ||
1281 | * (If this is a problem for anyone then tell me, and I will fix it.) | ||
1282 | */ | ||
1283 | ✗ | static inline void yuy2toyv12_mmxext(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | |
1284 | int width, int height, | ||
1285 | int lumStride, int chromStride, int srcStride) | ||
1286 | { | ||
1287 | ✗ | const x86_reg chromWidth= width>>1; | |
1288 | ✗ | for (int y = 0; y < height; y += 2) { | |
1289 | ✗ | __asm__ volatile( | |
1290 | "xor %%"FF_REG_a", %%"FF_REG_a"\n\t" | ||
1291 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
1292 | "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... | ||
1293 | ".p2align 4 \n\t" | ||
1294 | "1: \n\t" | ||
1295 | PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t" | ||
1296 | "movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) | ||
1297 | "movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) | ||
1298 | "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0) | ||
1299 | "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4) | ||
1300 | "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0) | ||
1301 | "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4) | ||
1302 | "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0) | ||
1303 | "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4) | ||
1304 | "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | ||
1305 | "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) | ||
1306 | |||
1307 | MOVNTQ" %%mm2, (%1, %%"FF_REG_a", 2) \n\t" | ||
1308 | |||
1309 | "movq 16(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8) | ||
1310 | "movq 24(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12) | ||
1311 | "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8) | ||
1312 | "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12) | ||
1313 | "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8) | ||
1314 | "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12) | ||
1315 | "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8) | ||
1316 | "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12) | ||
1317 | "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) | ||
1318 | "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) | ||
1319 | |||
1320 | MOVNTQ" %%mm3, 8(%1, %%"FF_REG_a", 2) \n\t" | ||
1321 | |||
1322 | "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) | ||
1323 | "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) | ||
1324 | "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) | ||
1325 | "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) | ||
1326 | "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) | ||
1327 | "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) | ||
1328 | "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) | ||
1329 | "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) | ||
1330 | |||
1331 | MOVNTQ" %%mm0, (%3, %%"FF_REG_a") \n\t" | ||
1332 | MOVNTQ" %%mm2, (%2, %%"FF_REG_a") \n\t" | ||
1333 | |||
1334 | "add $8, %%"FF_REG_a" \n\t" | ||
1335 | "cmp %4, %%"FF_REG_a" \n\t" | ||
1336 | " jb 1b \n\t" | ||
1337 | ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | ||
1338 | : "memory", "%"FF_REG_a | ||
1339 | ); | ||
1340 | |||
1341 | ✗ | ydst += lumStride; | |
1342 | ✗ | src += srcStride; | |
1343 | |||
1344 | ✗ | __asm__ volatile( | |
1345 | "xor %%"FF_REG_a", %%"FF_REG_a"\n\t" | ||
1346 | ".p2align 4 \n\t" | ||
1347 | "1: \n\t" | ||
1348 | PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t" | ||
1349 | "movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) | ||
1350 | "movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) | ||
1351 | "movq 16(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8) | ||
1352 | "movq 24(%0, %%"FF_REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12) | ||
1353 | "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0) | ||
1354 | "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4) | ||
1355 | "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8) | ||
1356 | "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12) | ||
1357 | "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) | ||
1358 | "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) | ||
1359 | |||
1360 | MOVNTQ" %%mm0, (%1, %%"FF_REG_a", 2) \n\t" | ||
1361 | MOVNTQ" %%mm2, 8(%1, %%"FF_REG_a", 2) \n\t" | ||
1362 | |||
1363 | "add $8, %%"FF_REG_a"\n\t" | ||
1364 | "cmp %4, %%"FF_REG_a"\n\t" | ||
1365 | " jb 1b \n\t" | ||
1366 | |||
1367 | ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | ||
1368 | : "memory", "%"FF_REG_a | ||
1369 | ); | ||
1370 | ✗ | udst += chromStride; | |
1371 | ✗ | vdst += chromStride; | |
1372 | ✗ | ydst += lumStride; | |
1373 | ✗ | src += srcStride; | |
1374 | } | ||
1375 | ✗ | __asm__ volatile(EMMS" \n\t" | |
1376 | SFENCE" \n\t" | ||
1377 | :::"memory"); | ||
1378 | ✗ | } | |
1379 | |||
1380 | ✗ | static inline void planar2x_mmxext(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride) | |
1381 | { | ||
1382 | ✗ | dst[0]= src[0]; | |
1383 | |||
1384 | // first line | ||
1385 | ✗ | for (int x = 0; x < srcWidth - 1; x++) { | |
1386 | ✗ | dst[2*x+1]= (3*src[x] + src[x+1])>>2; | |
1387 | ✗ | dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; | |
1388 | } | ||
1389 | ✗ | dst[2*srcWidth-1]= src[srcWidth-1]; | |
1390 | |||
1391 | ✗ | dst+= dstStride; | |
1392 | |||
1393 | ✗ | for (int y = 1; y < srcHeight; y++) { | |
1394 | ✗ | x86_reg mmxSize= srcWidth&~15; | |
1395 | |||
1396 | ✗ | if (mmxSize) { | |
1397 | ✗ | __asm__ volatile( | |
1398 | "mov %4, %%"FF_REG_a" \n\t" | ||
1399 | "movq "MANGLE(mmx_ff)", %%mm0 \n\t" | ||
1400 | "movq (%0, %%"FF_REG_a"), %%mm4 \n\t" | ||
1401 | "movq %%mm4, %%mm2 \n\t" | ||
1402 | "psllq $8, %%mm4 \n\t" | ||
1403 | "pand %%mm0, %%mm2 \n\t" | ||
1404 | "por %%mm2, %%mm4 \n\t" | ||
1405 | "movq (%1, %%"FF_REG_a"), %%mm5 \n\t" | ||
1406 | "movq %%mm5, %%mm3 \n\t" | ||
1407 | "psllq $8, %%mm5 \n\t" | ||
1408 | "pand %%mm0, %%mm3 \n\t" | ||
1409 | "por %%mm3, %%mm5 \n\t" | ||
1410 | "1: \n\t" | ||
1411 | "movq (%0, %%"FF_REG_a"), %%mm0 \n\t" | ||
1412 | "movq (%1, %%"FF_REG_a"), %%mm1 \n\t" | ||
1413 | "movq 1(%0, %%"FF_REG_a"), %%mm2 \n\t" | ||
1414 | "movq 1(%1, %%"FF_REG_a"), %%mm3 \n\t" | ||
1415 | PAVGB" %%mm0, %%mm5 \n\t" | ||
1416 | PAVGB" %%mm0, %%mm3 \n\t" | ||
1417 | PAVGB" %%mm0, %%mm5 \n\t" | ||
1418 | PAVGB" %%mm0, %%mm3 \n\t" | ||
1419 | PAVGB" %%mm1, %%mm4 \n\t" | ||
1420 | PAVGB" %%mm1, %%mm2 \n\t" | ||
1421 | PAVGB" %%mm1, %%mm4 \n\t" | ||
1422 | PAVGB" %%mm1, %%mm2 \n\t" | ||
1423 | "movq %%mm5, %%mm7 \n\t" | ||
1424 | "movq %%mm4, %%mm6 \n\t" | ||
1425 | "punpcklbw %%mm3, %%mm5 \n\t" | ||
1426 | "punpckhbw %%mm3, %%mm7 \n\t" | ||
1427 | "punpcklbw %%mm2, %%mm4 \n\t" | ||
1428 | "punpckhbw %%mm2, %%mm6 \n\t" | ||
1429 | MOVNTQ" %%mm5, (%2, %%"FF_REG_a", 2) \n\t" | ||
1430 | MOVNTQ" %%mm7, 8(%2, %%"FF_REG_a", 2) \n\t" | ||
1431 | MOVNTQ" %%mm4, (%3, %%"FF_REG_a", 2) \n\t" | ||
1432 | MOVNTQ" %%mm6, 8(%3, %%"FF_REG_a", 2) \n\t" | ||
1433 | "add $8, %%"FF_REG_a" \n\t" | ||
1434 | "movq -1(%0, %%"FF_REG_a"), %%mm4 \n\t" | ||
1435 | "movq -1(%1, %%"FF_REG_a"), %%mm5 \n\t" | ||
1436 | " js 1b \n\t" | ||
1437 | ✗ | :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ), | |
1438 | ✗ | "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2), | |
1439 | ✗ | "g" (-mmxSize) | |
1440 | NAMED_CONSTRAINTS_ADD(mmx_ff) | ||
1441 | : "%"FF_REG_a | ||
1442 | ); | ||
1443 | } else { | ||
1444 | ✗ | mmxSize = 1; | |
1445 | ✗ | dst[0] = (src[0] * 3 + src[srcStride]) >> 2; | |
1446 | ✗ | dst[dstStride] = (src[0] + 3 * src[srcStride]) >> 2; | |
1447 | } | ||
1448 | |||
1449 | ✗ | for (int x = mmxSize - 1; x < srcWidth - 1; x++) { | |
1450 | ✗ | dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2; | |
1451 | ✗ | dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2; | |
1452 | ✗ | dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2; | |
1453 | ✗ | dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2; | |
1454 | } | ||
1455 | ✗ | dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2; | |
1456 | ✗ | dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2; | |
1457 | |||
1458 | ✗ | dst+=dstStride*2; | |
1459 | ✗ | src+=srcStride; | |
1460 | } | ||
1461 | |||
1462 | // last line | ||
1463 | ✗ | dst[0]= src[0]; | |
1464 | |||
1465 | ✗ | for (int x = 0; x < srcWidth - 1; x++) { | |
1466 | ✗ | dst[2*x+1]= (3*src[x] + src[x+1])>>2; | |
1467 | ✗ | dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; | |
1468 | } | ||
1469 | ✗ | dst[2*srcWidth-1]= src[srcWidth-1]; | |
1470 | |||
1471 | ✗ | __asm__ volatile(EMMS" \n\t" | |
1472 | SFENCE" \n\t" | ||
1473 | :::"memory"); | ||
1474 | ✗ | } | |
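/*
 * The doubled PAVGB chains above implement the same 3:1 weighting as the
 * scalar fallback, except that pavgb rounds up ((a + b + 1) >> 1), so the
 * result is biased slightly high versus the C code's (a + 3*b) >> 2.
 * A minimal sketch of what two chained pavgb ops against the same second
 * operand compute (illustrative helper, not part of the dispatch):
 */
static inline uint8_t pavgb_twice_sketch(uint8_t a, uint8_t b)
{
    uint8_t t = (a + b + 1) >> 1; /* pavgb: average, rounding up */
    return (t + b + 1) >> 1;      /* roughly (a + 3*b + 3) >> 2 */
}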
1475 | |||
1476 | /** | ||
1477 | * Height and width should each be a multiple of 2. | ||
1478 | * (If this is a problem for anyone then tell me, and I will fix it.) | ||
1479 | * Chrominance data is taken only from every second line; | ||
1480 | * the other lines are ignored in the C version. | ||
1481 | * FIXME: Write HQ version. | ||
1482 | */ | ||
1483 | #if ARCH_X86_32 && HAVE_7REGS | ||
1484 | static inline void rgb24toyv12_mmxext(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | ||
1485 | int width, int height, | ||
1486 | int lumStride, int chromStride, int srcStride, | ||
1487 | const int32_t *rgb2yuv) | ||
1488 | { | ||
1489 | #define BGR2Y_IDX "16*4+16*32" | ||
1490 | #define BGR2U_IDX "16*4+16*33" | ||
1491 | #define BGR2V_IDX "16*4+16*34" | ||
1492 | int y; | ||
1493 | const x86_reg chromWidth= width>>1; | ||
1494 | |||
1495 | if (height > 2) { | ||
1496 | ff_rgb24toyv12_c(src, ydst, udst, vdst, width, 2, lumStride, chromStride, srcStride, rgb2yuv); | ||
1497 | src += 2*srcStride; | ||
1498 | ydst += 2*lumStride; | ||
1499 | udst += chromStride; | ||
1500 | vdst += chromStride; | ||
1501 | height -= 2; | ||
1502 | } | ||
1503 | |||
1504 | for (y = 0; y < height - 2; y += 2) { | ||
1505 | for (int i = 0; i < 2; i++) { | ||
1506 | __asm__ volatile( | ||
1507 | "mov %2, %%"FF_REG_a"\n\t" | ||
1508 | "movq "BGR2Y_IDX"(%3), %%mm6 \n\t" | ||
1509 | "movq "MANGLE(ff_w1111)", %%mm5 \n\t" | ||
1510 | "pxor %%mm7, %%mm7 \n\t" | ||
1511 | "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t" | ||
1512 | ".p2align 4 \n\t" | ||
1513 | "1: \n\t" | ||
1514 | PREFETCH" 64(%0, %%"FF_REG_d") \n\t" | ||
1515 | "movd (%0, %%"FF_REG_d"), %%mm0 \n\t" | ||
1516 | "movd 3(%0, %%"FF_REG_d"), %%mm1 \n\t" | ||
1517 | "punpcklbw %%mm7, %%mm0 \n\t" | ||
1518 | "punpcklbw %%mm7, %%mm1 \n\t" | ||
1519 | "movd 6(%0, %%"FF_REG_d"), %%mm2 \n\t" | ||
1520 | "movd 9(%0, %%"FF_REG_d"), %%mm3 \n\t" | ||
1521 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
1522 | "punpcklbw %%mm7, %%mm3 \n\t" | ||
1523 | "pmaddwd %%mm6, %%mm0 \n\t" | ||
1524 | "pmaddwd %%mm6, %%mm1 \n\t" | ||
1525 | "pmaddwd %%mm6, %%mm2 \n\t" | ||
1526 | "pmaddwd %%mm6, %%mm3 \n\t" | ||
1527 | "psrad $8, %%mm0 \n\t" | ||
1528 | "psrad $8, %%mm1 \n\t" | ||
1529 | "psrad $8, %%mm2 \n\t" | ||
1530 | "psrad $8, %%mm3 \n\t" | ||
1531 | "packssdw %%mm1, %%mm0 \n\t" | ||
1532 | "packssdw %%mm3, %%mm2 \n\t" | ||
1533 | "pmaddwd %%mm5, %%mm0 \n\t" | ||
1534 | "pmaddwd %%mm5, %%mm2 \n\t" | ||
1535 | "packssdw %%mm2, %%mm0 \n\t" | ||
1536 | "psraw $7, %%mm0 \n\t" | ||
1537 | |||
1538 | "movd 12(%0, %%"FF_REG_d"), %%mm4 \n\t" | ||
1539 | "movd 15(%0, %%"FF_REG_d"), %%mm1 \n\t" | ||
1540 | "punpcklbw %%mm7, %%mm4 \n\t" | ||
1541 | "punpcklbw %%mm7, %%mm1 \n\t" | ||
1542 | "movd 18(%0, %%"FF_REG_d"), %%mm2 \n\t" | ||
1543 | "movd 21(%0, %%"FF_REG_d"), %%mm3 \n\t" | ||
1544 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
1545 | "punpcklbw %%mm7, %%mm3 \n\t" | ||
1546 | "pmaddwd %%mm6, %%mm4 \n\t" | ||
1547 | "pmaddwd %%mm6, %%mm1 \n\t" | ||
1548 | "pmaddwd %%mm6, %%mm2 \n\t" | ||
1549 | "pmaddwd %%mm6, %%mm3 \n\t" | ||
1550 | "psrad $8, %%mm4 \n\t" | ||
1551 | "psrad $8, %%mm1 \n\t" | ||
1552 | "psrad $8, %%mm2 \n\t" | ||
1553 | "psrad $8, %%mm3 \n\t" | ||
1554 | "packssdw %%mm1, %%mm4 \n\t" | ||
1555 | "packssdw %%mm3, %%mm2 \n\t" | ||
1556 | "pmaddwd %%mm5, %%mm4 \n\t" | ||
1557 | "pmaddwd %%mm5, %%mm2 \n\t" | ||
1558 | "add $24, %%"FF_REG_d"\n\t" | ||
1559 | "packssdw %%mm2, %%mm4 \n\t" | ||
1560 | "psraw $7, %%mm4 \n\t" | ||
1561 | |||
1562 | "packuswb %%mm4, %%mm0 \n\t" | ||
1563 | "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t" | ||
1564 | |||
1565 | MOVNTQ" %%mm0, (%1, %%"FF_REG_a") \n\t" | ||
1566 | "add $8, %%"FF_REG_a" \n\t" | ||
1567 | " js 1b \n\t" | ||
1568 | : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv) | ||
1569 | NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2YOffset) | ||
1570 | : "%"FF_REG_a, "%"FF_REG_d | ||
1571 | ); | ||
1572 | ydst += lumStride; | ||
1573 | src += srcStride; | ||
1574 | } | ||
1575 | src -= srcStride*2; | ||
1576 | __asm__ volatile( | ||
1577 | "mov %4, %%"FF_REG_a"\n\t" | ||
1578 | "movq "MANGLE(ff_w1111)", %%mm5 \n\t" | ||
1579 | "movq "BGR2U_IDX"(%5), %%mm6 \n\t" | ||
1580 | "pxor %%mm7, %%mm7 \n\t" | ||
1581 | "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t" | ||
1582 | "add %%"FF_REG_d", %%"FF_REG_d"\n\t" | ||
1583 | ".p2align 4 \n\t" | ||
1584 | "1: \n\t" | ||
1585 | PREFETCH" 64(%0, %%"FF_REG_d") \n\t" | ||
1586 | PREFETCH" 64(%1, %%"FF_REG_d") \n\t" | ||
1587 | "movq (%0, %%"FF_REG_d"), %%mm0 \n\t" | ||
1588 | "movq (%1, %%"FF_REG_d"), %%mm1 \n\t" | ||
1589 | "movq 6(%0, %%"FF_REG_d"), %%mm2 \n\t" | ||
1590 | "movq 6(%1, %%"FF_REG_d"), %%mm3 \n\t" | ||
1591 | PAVGB" %%mm1, %%mm0 \n\t" | ||
1592 | PAVGB" %%mm3, %%mm2 \n\t" | ||
1593 | "movq %%mm0, %%mm1 \n\t" | ||
1594 | "movq %%mm2, %%mm3 \n\t" | ||
1595 | "psrlq $24, %%mm0 \n\t" | ||
1596 | "psrlq $24, %%mm2 \n\t" | ||
1597 | PAVGB" %%mm1, %%mm0 \n\t" | ||
1598 | PAVGB" %%mm3, %%mm2 \n\t" | ||
1599 | "punpcklbw %%mm7, %%mm0 \n\t" | ||
1600 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
1601 | "movq "BGR2V_IDX"(%5), %%mm1 \n\t" | ||
1602 | "movq "BGR2V_IDX"(%5), %%mm3 \n\t" | ||
1603 | |||
1604 | "pmaddwd %%mm0, %%mm1 \n\t" | ||
1605 | "pmaddwd %%mm2, %%mm3 \n\t" | ||
1606 | "pmaddwd %%mm6, %%mm0 \n\t" | ||
1607 | "pmaddwd %%mm6, %%mm2 \n\t" | ||
1608 | "psrad $8, %%mm0 \n\t" | ||
1609 | "psrad $8, %%mm1 \n\t" | ||
1610 | "psrad $8, %%mm2 \n\t" | ||
1611 | "psrad $8, %%mm3 \n\t" | ||
1612 | "packssdw %%mm2, %%mm0 \n\t" | ||
1613 | "packssdw %%mm3, %%mm1 \n\t" | ||
1614 | "pmaddwd %%mm5, %%mm0 \n\t" | ||
1615 | "pmaddwd %%mm5, %%mm1 \n\t" | ||
1616 | "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 | ||
1617 | "psraw $7, %%mm0 \n\t" | ||
1618 | |||
1619 | "movq 12(%0, %%"FF_REG_d"), %%mm4 \n\t" | ||
1620 | "movq 12(%1, %%"FF_REG_d"), %%mm1 \n\t" | ||
1621 | "movq 18(%0, %%"FF_REG_d"), %%mm2 \n\t" | ||
1622 | "movq 18(%1, %%"FF_REG_d"), %%mm3 \n\t" | ||
1623 | PAVGB" %%mm1, %%mm4 \n\t" | ||
1624 | PAVGB" %%mm3, %%mm2 \n\t" | ||
1625 | "movq %%mm4, %%mm1 \n\t" | ||
1626 | "movq %%mm2, %%mm3 \n\t" | ||
1627 | "psrlq $24, %%mm4 \n\t" | ||
1628 | "psrlq $24, %%mm2 \n\t" | ||
1629 | PAVGB" %%mm1, %%mm4 \n\t" | ||
1630 | PAVGB" %%mm3, %%mm2 \n\t" | ||
1631 | "punpcklbw %%mm7, %%mm4 \n\t" | ||
1632 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
1633 | "movq "BGR2V_IDX"(%5), %%mm1 \n\t" | ||
1634 | "movq "BGR2V_IDX"(%5), %%mm3 \n\t" | ||
1635 | |||
1636 | "pmaddwd %%mm4, %%mm1 \n\t" | ||
1637 | "pmaddwd %%mm2, %%mm3 \n\t" | ||
1638 | "pmaddwd %%mm6, %%mm4 \n\t" | ||
1639 | "pmaddwd %%mm6, %%mm2 \n\t" | ||
1640 | "psrad $8, %%mm4 \n\t" | ||
1641 | "psrad $8, %%mm1 \n\t" | ||
1642 | "psrad $8, %%mm2 \n\t" | ||
1643 | "psrad $8, %%mm3 \n\t" | ||
1644 | "packssdw %%mm2, %%mm4 \n\t" | ||
1645 | "packssdw %%mm3, %%mm1 \n\t" | ||
1646 | "pmaddwd %%mm5, %%mm4 \n\t" | ||
1647 | "pmaddwd %%mm5, %%mm1 \n\t" | ||
1648 | "add $24, %%"FF_REG_d"\n\t" | ||
1649 | "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 | ||
1650 | "psraw $7, %%mm4 \n\t" | ||
1651 | |||
1652 | "movq %%mm0, %%mm1 \n\t" | ||
1653 | "punpckldq %%mm4, %%mm0 \n\t" | ||
1654 | "punpckhdq %%mm4, %%mm1 \n\t" | ||
1655 | "packsswb %%mm1, %%mm0 \n\t" | ||
1656 | "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t" | ||
1657 | "movd %%mm0, (%2, %%"FF_REG_a") \n\t" | ||
1658 | "punpckhdq %%mm0, %%mm0 \n\t" | ||
1659 | "movd %%mm0, (%3, %%"FF_REG_a") \n\t" | ||
1660 | "add $4, %%"FF_REG_a" \n\t" | ||
1661 | " js 1b \n\t" | ||
1662 | : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv) | ||
1663 | NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2UVOffset) | ||
1664 | : "%"FF_REG_a, "%"FF_REG_d | ||
1665 | ); | ||
1666 | |||
1667 | udst += chromStride; | ||
1668 | vdst += chromStride; | ||
1669 | src += srcStride*2; | ||
1670 | } | ||
1671 | |||
1672 | __asm__ volatile(EMMS" \n\t" | ||
1673 | SFENCE" \n\t" | ||
1674 | :::"memory"); | ||
1675 | |||
1676 | ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv); | ||
1677 | } | ||
1678 | #endif /* ARCH_X86_32 && HAVE_7REGS */ | ||
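/*
 * A minimal scalar sketch of the per-pixel luma math performed above,
 * using the well-known BT.601 integer approximation. The real code takes
 * its coefficients from the rgb2yuv parameter (BGR2Y_IDX etc.) and derives
 * U/V from pavgb-averaged 2x2 blocks; this helper and its coefficients are
 * illustrative only, and assume R,G,B sample order:
 */
static inline uint8_t rgb_to_y_sketch(uint8_t r, uint8_t g, uint8_t b)
{
    return (uint8_t)(((66 * r + 129 * g + 25 * b + 128) >> 8) + 16);
}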
1679 | |||
1680 | ✗ | static inline void vu9_to_vu12_mmxext(const uint8_t *src1, const uint8_t *src2, | |
1681 | uint8_t *dst1, uint8_t *dst2, | ||
1682 | int width, int height, | ||
1683 | int srcStride1, int srcStride2, | ||
1684 | int dstStride1, int dstStride2) | ||
1685 | { | ||
1686 | int w,h; | ||
1687 | ✗ | w=width/2; h=height/2; | |
1688 | ✗ | __asm__ volatile( | |
1689 | PREFETCH" %0 \n\t" | ||
1690 | PREFETCH" %1 \n\t" | ||
1691 | ✗ | ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory"); | |
1692 | ✗ | for (x86_reg y = 0; y < h; y++) { | |
1693 | ✗ | const uint8_t* s1=src1+srcStride1*(y>>1); | |
1694 | ✗ | uint8_t* d=dst1+dstStride1*y; | |
1695 | ✗ | x86_reg x = 0; | |
1696 | ✗ | for (;x<w-31;x+=32) { | |
1697 | ✗ | __asm__ volatile( | |
1698 | PREFETCH" 32(%1,%2) \n\t" | ||
1699 | "movq (%1,%2), %%mm0 \n\t" | ||
1700 | "movq 8(%1,%2), %%mm2 \n\t" | ||
1701 | "movq 16(%1,%2), %%mm4 \n\t" | ||
1702 | "movq 24(%1,%2), %%mm6 \n\t" | ||
1703 | "movq %%mm0, %%mm1 \n\t" | ||
1704 | "movq %%mm2, %%mm3 \n\t" | ||
1705 | "movq %%mm4, %%mm5 \n\t" | ||
1706 | "movq %%mm6, %%mm7 \n\t" | ||
1707 | "punpcklbw %%mm0, %%mm0 \n\t" | ||
1708 | "punpckhbw %%mm1, %%mm1 \n\t" | ||
1709 | "punpcklbw %%mm2, %%mm2 \n\t" | ||
1710 | "punpckhbw %%mm3, %%mm3 \n\t" | ||
1711 | "punpcklbw %%mm4, %%mm4 \n\t" | ||
1712 | "punpckhbw %%mm5, %%mm5 \n\t" | ||
1713 | "punpcklbw %%mm6, %%mm6 \n\t" | ||
1714 | "punpckhbw %%mm7, %%mm7 \n\t" | ||
1715 | MOVNTQ" %%mm0, (%0,%2,2) \n\t" | ||
1716 | MOVNTQ" %%mm1, 8(%0,%2,2) \n\t" | ||
1717 | MOVNTQ" %%mm2, 16(%0,%2,2) \n\t" | ||
1718 | MOVNTQ" %%mm3, 24(%0,%2,2) \n\t" | ||
1719 | MOVNTQ" %%mm4, 32(%0,%2,2) \n\t" | ||
1720 | MOVNTQ" %%mm5, 40(%0,%2,2) \n\t" | ||
1721 | MOVNTQ" %%mm6, 48(%0,%2,2) \n\t" | ||
1722 | MOVNTQ" %%mm7, 56(%0,%2,2)" | ||
1723 | :: "r"(d), "r"(s1), "r"(x) | ||
1724 | :"memory"); | ||
1725 | } | ||
1726 | ✗ | for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x]; | |
1727 | } | ||
1728 | ✗ | for (x86_reg y = 0; y < h; y++) { | |
1729 | ✗ | const uint8_t* s2=src2+srcStride2*(y>>1); | |
1730 | ✗ | uint8_t* d=dst2+dstStride2*y; | |
1731 | ✗ | x86_reg x = 0; | |
1732 | ✗ | for (;x<w-31;x+=32) { | |
1733 | ✗ | __asm__ volatile( | |
1734 | PREFETCH" 32(%1,%2) \n\t" | ||
1735 | "movq (%1,%2), %%mm0 \n\t" | ||
1736 | "movq 8(%1,%2), %%mm2 \n\t" | ||
1737 | "movq 16(%1,%2), %%mm4 \n\t" | ||
1738 | "movq 24(%1,%2), %%mm6 \n\t" | ||
1739 | "movq %%mm0, %%mm1 \n\t" | ||
1740 | "movq %%mm2, %%mm3 \n\t" | ||
1741 | "movq %%mm4, %%mm5 \n\t" | ||
1742 | "movq %%mm6, %%mm7 \n\t" | ||
1743 | "punpcklbw %%mm0, %%mm0 \n\t" | ||
1744 | "punpckhbw %%mm1, %%mm1 \n\t" | ||
1745 | "punpcklbw %%mm2, %%mm2 \n\t" | ||
1746 | "punpckhbw %%mm3, %%mm3 \n\t" | ||
1747 | "punpcklbw %%mm4, %%mm4 \n\t" | ||
1748 | "punpckhbw %%mm5, %%mm5 \n\t" | ||
1749 | "punpcklbw %%mm6, %%mm6 \n\t" | ||
1750 | "punpckhbw %%mm7, %%mm7 \n\t" | ||
1751 | MOVNTQ" %%mm0, (%0,%2,2) \n\t" | ||
1752 | MOVNTQ" %%mm1, 8(%0,%2,2) \n\t" | ||
1753 | MOVNTQ" %%mm2, 16(%0,%2,2) \n\t" | ||
1754 | MOVNTQ" %%mm3, 24(%0,%2,2) \n\t" | ||
1755 | MOVNTQ" %%mm4, 32(%0,%2,2) \n\t" | ||
1756 | MOVNTQ" %%mm5, 40(%0,%2,2) \n\t" | ||
1757 | MOVNTQ" %%mm6, 48(%0,%2,2) \n\t" | ||
1758 | MOVNTQ" %%mm7, 56(%0,%2,2)" | ||
1759 | :: "r"(d), "r"(s2), "r"(x) | ||
1760 | :"memory"); | ||
1761 | } | ||
1762 | ✗ | for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x]; | |
1763 | } | ||
1764 | ✗ | __asm__( | |
1765 | EMMS" \n\t" | ||
1766 | SFENCE" \n\t" | ||
1767 | ::: "memory" | ||
1768 | ); | ||
1769 | ✗ | } | |
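/*
 * The width doubling above relies on "punpcklbw mm0, mm0" / "punpckhbw"
 * duplicating each byte in place, and the height doubling on re-reading
 * each source row twice (srcStride*(y>>1)). A minimal scalar sketch of one
 * row, the same operation as the tail loop above:
 */
static inline void double_bytes_sketch(const uint8_t *s, uint8_t *d, int w)
{
    for (int x = 0; x < w; x++)
        d[2 * x] = d[2 * x + 1] = s[x];
}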
1770 | |||
1771 | ✗ | static inline void yvu9_to_yuy2_mmxext(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, | |
1772 | uint8_t *dst, | ||
1773 | int width, int height, | ||
1774 | int srcStride1, int srcStride2, | ||
1775 | int srcStride3, int dstStride) | ||
1776 | { | ||
1777 | int w,h; | ||
1778 | ✗ | w=width/2; h=height; | |
1779 | ✗ | for (int y = 0; y < h; y++) { | |
1780 | ✗ | const uint8_t* yp=src1+srcStride1*y; | |
1781 | ✗ | const uint8_t* up=src2+srcStride2*(y>>2); | |
1782 | ✗ | const uint8_t* vp=src3+srcStride3*(y>>2); | |
1783 | ✗ | uint8_t* d=dst+dstStride*y; | |
1784 | ✗ | x86_reg x = 0; | |
1785 | ✗ | for (;x<w-7;x+=8) { | |
1786 | ✗ | __asm__ volatile( | |
1787 | PREFETCH" 32(%1, %0) \n\t" | ||
1788 | PREFETCH" 32(%2, %0) \n\t" | ||
1789 | PREFETCH" 32(%3, %0) \n\t" | ||
1790 | "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ | ||
1791 | "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */ | ||
1792 | "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */ | ||
1793 | "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ | ||
1794 | "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */ | ||
1795 | "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */ | ||
1796 | "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */ | ||
1797 | "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */ | ||
1798 | "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */ | ||
1799 | "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */ | ||
1800 | |||
1801 | "movq %%mm1, %%mm6 \n\t" | ||
1802 | "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/ | ||
1803 | "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/ | ||
1804 | "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/ | ||
1805 | MOVNTQ" %%mm0, (%4, %0, 8) \n\t" | ||
1806 | MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t" | ||
1807 | |||
1808 | "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/ | ||
1809 | "movq 8(%1, %0, 4), %%mm0 \n\t" | ||
1810 | "movq %%mm0, %%mm3 \n\t" | ||
1811 | "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/ | ||
1812 | "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/ | ||
1813 | MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t" | ||
1814 | MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t" | ||
1815 | |||
1816 | "movq %%mm4, %%mm6 \n\t" | ||
1817 | "movq 16(%1, %0, 4), %%mm0 \n\t" | ||
1818 | "movq %%mm0, %%mm3 \n\t" | ||
1819 | "punpcklbw %%mm5, %%mm4 \n\t" | ||
1820 | "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/ | ||
1821 | "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/ | ||
1822 | MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t" | ||
1823 | MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t" | ||
1824 | |||
1825 | "punpckhbw %%mm5, %%mm6 \n\t" | ||
1826 | "movq 24(%1, %0, 4), %%mm0 \n\t" | ||
1827 | "movq %%mm0, %%mm3 \n\t" | ||
1828 | "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/ | ||
1829 | "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/ | ||
1830 | MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t" | ||
1831 | MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t" | ||
1832 | |||
1833 | : "+r" (x) | ||
1834 | : "r"(yp), "r" (up), "r"(vp), "r"(d) | ||
1835 | :"memory"); | ||
1836 | } | ||
1837 | ✗ | for (; x<w; x++) { | |
1838 | ✗ | const int x2 = x<<2; | |
1839 | ✗ | d[8*x+0] = yp[x2]; | |
1840 | ✗ | d[8*x+1] = up[x]; | |
1841 | ✗ | d[8*x+2] = yp[x2+1]; | |
1842 | ✗ | d[8*x+3] = vp[x]; | |
1843 | ✗ | d[8*x+4] = yp[x2+2]; | |
1844 | ✗ | d[8*x+5] = up[x]; | |
1845 | ✗ | d[8*x+6] = yp[x2+3]; | |
1846 | ✗ | d[8*x+7] = vp[x]; | |
1847 | } | ||
1848 | } | ||
1849 | ✗ | __asm__( | |
1850 | EMMS" \n\t" | ||
1851 | SFENCE" \n\t" | ||
1852 | ::: "memory" | ||
1853 | ); | ||
1854 | ✗ | } | |
1855 | |||
1856 | ✗ | static void extract_even_mmxext(const uint8_t *src, uint8_t *dst, x86_reg count) | |
1857 | { | ||
1858 | ✗ | dst += count; | |
1859 | ✗ | src += 2*count; | |
1860 | ✗ | count= - count; | |
1861 | |||
1862 | ✗ | if(count <= -16) { | |
1863 | ✗ | count += 15; | |
1864 | ✗ | __asm__ volatile( | |
1865 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
1866 | "psrlw $8, %%mm7 \n\t" | ||
1867 | "1: \n\t" | ||
1868 | "movq -30(%1, %0, 2), %%mm0 \n\t" | ||
1869 | "movq -22(%1, %0, 2), %%mm1 \n\t" | ||
1870 | "movq -14(%1, %0, 2), %%mm2 \n\t" | ||
1871 | "movq -6(%1, %0, 2), %%mm3 \n\t" | ||
1872 | "pand %%mm7, %%mm0 \n\t" | ||
1873 | "pand %%mm7, %%mm1 \n\t" | ||
1874 | "pand %%mm7, %%mm2 \n\t" | ||
1875 | "pand %%mm7, %%mm3 \n\t" | ||
1876 | "packuswb %%mm1, %%mm0 \n\t" | ||
1877 | "packuswb %%mm3, %%mm2 \n\t" | ||
1878 | MOVNTQ" %%mm0,-15(%2, %0) \n\t" | ||
1879 | MOVNTQ" %%mm2,- 7(%2, %0) \n\t" | ||
1880 | "add $16, %0 \n\t" | ||
1881 | " js 1b \n\t" | ||
1882 | : "+r"(count) | ||
1883 | : "r"(src), "r"(dst) | ||
1884 | ); | ||
1885 | ✗ | count -= 15; | |
1886 | } | ||
1887 | ✗ | while(count<0) { | |
1888 | ✗ | dst[count]= src[2*count]; | |
1889 | ✗ | count++; | |
1890 | } | ||
1891 | ✗ | } | |
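/*
 * All of the extract_* helpers use the same loop idiom: advance src/dst to
 * the end of the range, negate the count, and let "add $.., %0" / "js 1b"
 * run the index from -count up to 0, so the loop needs no separate compare.
 * A scalar sketch of the pattern for a plain copy (illustrative only):
 */
static inline void neg_index_copy_sketch(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    src += count;    /* point at the end of the input */
    dst += count;
    count = -count;  /* index runs from -count up to 0 */
    while (count < 0) {
        dst[count] = src[count];
        count++;     /* "add" updates SF; "js" loops while negative */
    }
}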
1892 | |||
1893 | ✗ | static void extract_odd_mmxext(const uint8_t *src, uint8_t *dst, x86_reg count) | |
1894 | { | ||
1895 | ✗ | src ++; | |
1896 | ✗ | dst += count; | |
1897 | ✗ | src += 2*count; | |
1898 | ✗ | count= - count; | |
1899 | |||
1900 | ✗ | if(count < -16) { | |
1901 | ✗ | count += 16; | |
1902 | ✗ | __asm__ volatile( | |
1903 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
1904 | "psrlw $8, %%mm7 \n\t" | ||
1905 | "1: \n\t" | ||
1906 | "movq -32(%1, %0, 2), %%mm0 \n\t" | ||
1907 | "movq -24(%1, %0, 2), %%mm1 \n\t" | ||
1908 | "movq -16(%1, %0, 2), %%mm2 \n\t" | ||
1909 | "movq -8(%1, %0, 2), %%mm3 \n\t" | ||
1910 | "pand %%mm7, %%mm0 \n\t" | ||
1911 | "pand %%mm7, %%mm1 \n\t" | ||
1912 | "pand %%mm7, %%mm2 \n\t" | ||
1913 | "pand %%mm7, %%mm3 \n\t" | ||
1914 | "packuswb %%mm1, %%mm0 \n\t" | ||
1915 | "packuswb %%mm3, %%mm2 \n\t" | ||
1916 | MOVNTQ" %%mm0,-16(%2, %0) \n\t" | ||
1917 | MOVNTQ" %%mm2,- 8(%2, %0) \n\t" | ||
1918 | "add $16, %0 \n\t" | ||
1919 | " js 1b \n\t" | ||
1920 | : "+r"(count) | ||
1921 | : "r"(src), "r"(dst) | ||
1922 | ); | ||
1923 | ✗ | count -= 16; | |
1924 | } | ||
1925 | ✗ | while(count<0) { | |
1926 | ✗ | dst[count]= src[2*count]; | |
1927 | ✗ | count++; | |
1928 | } | ||
1929 | ✗ | } | |
1930 | |||
1931 | #if ARCH_X86_32 | ||
1932 | static void extract_even2_mmxext(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count) | ||
1933 | { | ||
1934 | dst0+= count; | ||
1935 | dst1+= count; | ||
1936 | src += 4*count; | ||
1937 | count= - count; | ||
1938 | if(count <= -8) { | ||
1939 | count += 7; | ||
1940 | __asm__ volatile( | ||
1941 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
1942 | "psrlw $8, %%mm7 \n\t" | ||
1943 | "1: \n\t" | ||
1944 | "movq -28(%1, %0, 4), %%mm0 \n\t" | ||
1945 | "movq -20(%1, %0, 4), %%mm1 \n\t" | ||
1946 | "movq -12(%1, %0, 4), %%mm2 \n\t" | ||
1947 | "movq -4(%1, %0, 4), %%mm3 \n\t" | ||
1948 | "pand %%mm7, %%mm0 \n\t" | ||
1949 | "pand %%mm7, %%mm1 \n\t" | ||
1950 | "pand %%mm7, %%mm2 \n\t" | ||
1951 | "pand %%mm7, %%mm3 \n\t" | ||
1952 | "packuswb %%mm1, %%mm0 \n\t" | ||
1953 | "packuswb %%mm3, %%mm2 \n\t" | ||
1954 | "movq %%mm0, %%mm1 \n\t" | ||
1955 | "movq %%mm2, %%mm3 \n\t" | ||
1956 | "psrlw $8, %%mm0 \n\t" | ||
1957 | "psrlw $8, %%mm2 \n\t" | ||
1958 | "pand %%mm7, %%mm1 \n\t" | ||
1959 | "pand %%mm7, %%mm3 \n\t" | ||
1960 | "packuswb %%mm2, %%mm0 \n\t" | ||
1961 | "packuswb %%mm3, %%mm1 \n\t" | ||
1962 | MOVNTQ" %%mm0,- 7(%3, %0) \n\t" | ||
1963 | MOVNTQ" %%mm1,- 7(%2, %0) \n\t" | ||
1964 | "add $8, %0 \n\t" | ||
1965 | " js 1b \n\t" | ||
1966 | : "+r"(count) | ||
1967 | : "r"(src), "r"(dst0), "r"(dst1) | ||
1968 | ); | ||
1969 | count -= 7; | ||
1970 | } | ||
1971 | while(count<0) { | ||
1972 | dst0[count]= src[4*count+0]; | ||
1973 | dst1[count]= src[4*count+2]; | ||
1974 | count++; | ||
1975 | } | ||
1976 | } | ||
1977 | #endif /* ARCH_X86_32 */ | ||
1978 | |||
1979 | ✗ | static void extract_even2avg_mmxext(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count) | |
1980 | { | ||
1981 | ✗ | dst0 += count; | |
1982 | ✗ | dst1 += count; | |
1983 | ✗ | src0 += 4*count; | |
1984 | ✗ | src1 += 4*count; | |
1985 | ✗ | count= - count; | |
1986 | #ifdef PAVGB | ||
1987 | ✗ | if(count <= -8) { | |
1988 | ✗ | count += 7; | |
1989 | ✗ | __asm__ volatile( | |
1990 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
1991 | "psrlw $8, %%mm7 \n\t" | ||
1992 | "1: \n\t" | ||
1993 | "movq -28(%1, %0, 4), %%mm0 \n\t" | ||
1994 | "movq -20(%1, %0, 4), %%mm1 \n\t" | ||
1995 | "movq -12(%1, %0, 4), %%mm2 \n\t" | ||
1996 | "movq -4(%1, %0, 4), %%mm3 \n\t" | ||
1997 | PAVGB" -28(%2, %0, 4), %%mm0 \n\t" | ||
1998 | PAVGB" -20(%2, %0, 4), %%mm1 \n\t" | ||
1999 | PAVGB" -12(%2, %0, 4), %%mm2 \n\t" | ||
2000 | PAVGB" - 4(%2, %0, 4), %%mm3 \n\t" | ||
2001 | "pand %%mm7, %%mm0 \n\t" | ||
2002 | "pand %%mm7, %%mm1 \n\t" | ||
2003 | "pand %%mm7, %%mm2 \n\t" | ||
2004 | "pand %%mm7, %%mm3 \n\t" | ||
2005 | "packuswb %%mm1, %%mm0 \n\t" | ||
2006 | "packuswb %%mm3, %%mm2 \n\t" | ||
2007 | "movq %%mm0, %%mm1 \n\t" | ||
2008 | "movq %%mm2, %%mm3 \n\t" | ||
2009 | "psrlw $8, %%mm0 \n\t" | ||
2010 | "psrlw $8, %%mm2 \n\t" | ||
2011 | "pand %%mm7, %%mm1 \n\t" | ||
2012 | "pand %%mm7, %%mm3 \n\t" | ||
2013 | "packuswb %%mm2, %%mm0 \n\t" | ||
2014 | "packuswb %%mm3, %%mm1 \n\t" | ||
2015 | MOVNTQ" %%mm0,- 7(%4, %0) \n\t" | ||
2016 | MOVNTQ" %%mm1,- 7(%3, %0) \n\t" | ||
2017 | "add $8, %0 \n\t" | ||
2018 | " js 1b \n\t" | ||
2019 | : "+r"(count) | ||
2020 | : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1) | ||
2021 | ); | ||
2022 | ✗ | count -= 7; | |
2023 | } | ||
2024 | #endif | ||
2025 | ✗ | while(count<0) { | |
2026 | ✗ | dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1; | |
2027 | ✗ | dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1; | |
2028 | ✗ | count++; | |
2029 | } | ||
2030 | ✗ | } | |
2031 | |||
2032 | ✗ | static void extract_odd2_mmxext(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count) | |
2033 | { | ||
2034 | ✗ | dst0+= count; | |
2035 | ✗ | dst1+= count; | |
2036 | ✗ | src += 4*count; | |
2037 | ✗ | count= - count; | |
2038 | ✗ | if(count <= -8) { | |
2039 | ✗ | count += 7; | |
2040 | ✗ | __asm__ volatile( | |
2041 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
2042 | "psrlw $8, %%mm7 \n\t" | ||
2043 | "1: \n\t" | ||
2044 | "movq -28(%1, %0, 4), %%mm0 \n\t" | ||
2045 | "movq -20(%1, %0, 4), %%mm1 \n\t" | ||
2046 | "movq -12(%1, %0, 4), %%mm2 \n\t" | ||
2047 | "movq -4(%1, %0, 4), %%mm3 \n\t" | ||
2048 | "psrlw $8, %%mm0 \n\t" | ||
2049 | "psrlw $8, %%mm1 \n\t" | ||
2050 | "psrlw $8, %%mm2 \n\t" | ||
2051 | "psrlw $8, %%mm3 \n\t" | ||
2052 | "packuswb %%mm1, %%mm0 \n\t" | ||
2053 | "packuswb %%mm3, %%mm2 \n\t" | ||
2054 | "movq %%mm0, %%mm1 \n\t" | ||
2055 | "movq %%mm2, %%mm3 \n\t" | ||
2056 | "psrlw $8, %%mm0 \n\t" | ||
2057 | "psrlw $8, %%mm2 \n\t" | ||
2058 | "pand %%mm7, %%mm1 \n\t" | ||
2059 | "pand %%mm7, %%mm3 \n\t" | ||
2060 | "packuswb %%mm2, %%mm0 \n\t" | ||
2061 | "packuswb %%mm3, %%mm1 \n\t" | ||
2062 | MOVNTQ" %%mm0,- 7(%3, %0) \n\t" | ||
2063 | MOVNTQ" %%mm1,- 7(%2, %0) \n\t" | ||
2064 | "add $8, %0 \n\t" | ||
2065 | " js 1b \n\t" | ||
2066 | : "+r"(count) | ||
2067 | : "r"(src), "r"(dst0), "r"(dst1) | ||
2068 | ); | ||
2069 | ✗ | count -= 7; | |
2070 | } | ||
2071 | ✗ | src++; | |
2072 | ✗ | while(count<0) { | |
2073 | ✗ | dst0[count]= src[4*count+0]; | |
2074 | ✗ | dst1[count]= src[4*count+2]; | |
2075 | ✗ | count++; | |
2076 | } | ||
2077 | ✗ | } | |
2078 | |||
2079 | ✗ | static void extract_odd2avg_mmxext(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count) | |
2080 | { | ||
2081 | ✗ | dst0 += count; | |
2082 | ✗ | dst1 += count; | |
2083 | ✗ | src0 += 4*count; | |
2084 | ✗ | src1 += 4*count; | |
2085 | ✗ | count= - count; | |
2086 | #ifdef PAVGB | ||
2087 | ✗ | if(count <= -8) { | |
2088 | ✗ | count += 7; | |
2089 | ✗ | __asm__ volatile( | |
2090 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
2091 | "psrlw $8, %%mm7 \n\t" | ||
2092 | "1: \n\t" | ||
2093 | "movq -28(%1, %0, 4), %%mm0 \n\t" | ||
2094 | "movq -20(%1, %0, 4), %%mm1 \n\t" | ||
2095 | "movq -12(%1, %0, 4), %%mm2 \n\t" | ||
2096 | "movq -4(%1, %0, 4), %%mm3 \n\t" | ||
2097 | PAVGB" -28(%2, %0, 4), %%mm0 \n\t" | ||
2098 | PAVGB" -20(%2, %0, 4), %%mm1 \n\t" | ||
2099 | PAVGB" -12(%2, %0, 4), %%mm2 \n\t" | ||
2100 | PAVGB" - 4(%2, %0, 4), %%mm3 \n\t" | ||
2101 | "psrlw $8, %%mm0 \n\t" | ||
2102 | "psrlw $8, %%mm1 \n\t" | ||
2103 | "psrlw $8, %%mm2 \n\t" | ||
2104 | "psrlw $8, %%mm3 \n\t" | ||
2105 | "packuswb %%mm1, %%mm0 \n\t" | ||
2106 | "packuswb %%mm3, %%mm2 \n\t" | ||
2107 | "movq %%mm0, %%mm1 \n\t" | ||
2108 | "movq %%mm2, %%mm3 \n\t" | ||
2109 | "psrlw $8, %%mm0 \n\t" | ||
2110 | "psrlw $8, %%mm2 \n\t" | ||
2111 | "pand %%mm7, %%mm1 \n\t" | ||
2112 | "pand %%mm7, %%mm3 \n\t" | ||
2113 | "packuswb %%mm2, %%mm0 \n\t" | ||
2114 | "packuswb %%mm3, %%mm1 \n\t" | ||
2115 | MOVNTQ" %%mm0,- 7(%4, %0) \n\t" | ||
2116 | MOVNTQ" %%mm1,- 7(%3, %0) \n\t" | ||
2117 | "add $8, %0 \n\t" | ||
2118 | " js 1b \n\t" | ||
2119 | : "+r"(count) | ||
2120 | : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1) | ||
2121 | ); | ||
2122 | ✗ | count -= 7; | |
2123 | } | ||
2124 | #endif | ||
2125 | ✗ | src0++; | |
2126 | ✗ | src1++; | |
2127 | ✗ | while(count<0) { | |
2128 | ✗ | dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1; | |
2129 | ✗ | dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1; | |
2130 | ✗ | count++; | |
2131 | } | ||
2132 | ✗ | } | |
2133 | |||
2134 | ✗ | static void yuyvtoyuv420_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, | |
2135 | int width, int height, | ||
2136 | int lumStride, int chromStride, int srcStride) | ||
2137 | { | ||
2138 | ✗ | const int chromWidth = AV_CEIL_RSHIFT(width, 1); | |
2139 | |||
2140 | ✗ | for (int y = 0; y < height; y++) { | |
2141 | ✗ | extract_even_mmxext(src, ydst, width); | |
2142 | ✗ | if(y&1) { | |
2143 | ✗ | extract_odd2avg_mmxext(src-srcStride, src, udst, vdst, chromWidth); | |
2144 | ✗ | udst+= chromStride; | |
2145 | ✗ | vdst+= chromStride; | |
2146 | } | ||
2147 | |||
2148 | ✗ | src += srcStride; | |
2149 | ✗ | ydst+= lumStride; | |
2150 | } | ||
2151 | ✗ | __asm__( | |
2152 | EMMS" \n\t" | ||
2153 | SFENCE" \n\t" | ||
2154 | ::: "memory" | ||
2155 | ); | ||
2156 | ✗ | } | |
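/*
 * For 4:2:0 output each chroma row is produced once per pair of source
 * rows: on odd rows, extract_odd2avg_mmxext() averages the U/V bytes of
 * the current YUYV row with the row above (src - srcStride). A minimal
 * scalar sketch of the per-sample math, assuming YUYV byte order:
 */
static inline void yuyv_uv_pair_avg_sketch(const uint8_t *row0, const uint8_t *row1,
                                           uint8_t *udst, uint8_t *vdst, int chromWidth)
{
    for (int x = 0; x < chromWidth; x++) {
        udst[x] = (row0[4 * x + 1] + row1[4 * x + 1]) >> 1; /* U: byte 1 of YUYV */
        vdst[x] = (row0[4 * x + 3] + row1[4 * x + 3]) >> 1; /* V: byte 3 of YUYV */
    }
}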
2157 | |||
2158 | ✗ | static void yuyvtoyuv422_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, | |
2159 | int width, int height, | ||
2160 | int lumStride, int chromStride, int srcStride) | ||
2161 | { | ||
2162 | ✗ | const int chromWidth = AV_CEIL_RSHIFT(width, 1); | |
2163 | |||
2164 | ✗ | for (int y = 0; y < height; y++) { | |
2165 | ✗ | extract_even_mmxext(src, ydst, width); | |
2166 | ✗ | extract_odd2_mmxext(src, udst, vdst, chromWidth); | |
2167 | |||
2168 | ✗ | src += srcStride; | |
2169 | ✗ | ydst+= lumStride; | |
2170 | ✗ | udst+= chromStride; | |
2171 | ✗ | vdst+= chromStride; | |
2172 | } | ||
2173 | ✗ | __asm__( | |
2174 | EMMS" \n\t" | ||
2175 | SFENCE" \n\t" | ||
2176 | ::: "memory" | ||
2177 | ); | ||
2178 | ✗ | } | |
2179 | |||
2180 | ✗ | static void uyvytoyuv420_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, | |
2181 | int width, int height, | ||
2182 | int lumStride, int chromStride, int srcStride) | ||
2183 | { | ||
2184 | ✗ | const int chromWidth = AV_CEIL_RSHIFT(width, 1); | |
2185 | |||
2186 | ✗ | for (int y = 0; y < height; y++) { | |
2187 | ✗ | extract_odd_mmxext(src, ydst, width); | |
2188 | ✗ | if(y&1) { | |
2189 | ✗ | extract_even2avg_mmxext(src-srcStride, src, udst, vdst, chromWidth); | |
2190 | ✗ | udst+= chromStride; | |
2191 | ✗ | vdst+= chromStride; | |
2192 | } | ||
2193 | |||
2194 | ✗ | src += srcStride; | |
2195 | ✗ | ydst+= lumStride; | |
2196 | } | ||
2197 | ✗ | __asm__( | |
2198 | EMMS" \n\t" | ||
2199 | SFENCE" \n\t" | ||
2200 | ::: "memory" | ||
2201 | ); | ||
2202 | ✗ | } | |
2203 | |||
2204 | #if ARCH_X86_32 | ||
2205 | static void uyvytoyuv422_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, | ||
2206 | int width, int height, | ||
2207 | int lumStride, int chromStride, int srcStride) | ||
2208 | { | ||
2209 | const int chromWidth = AV_CEIL_RSHIFT(width, 1); | ||
2210 | |||
2211 | for (int y = 0; y < height; y++) { | ||
2212 | extract_odd_mmxext(src, ydst, width); | ||
2213 | extract_even2_mmxext(src, udst, vdst, chromWidth); | ||
2214 | |||
2215 | src += srcStride; | ||
2216 | ydst+= lumStride; | ||
2217 | udst+= chromStride; | ||
2218 | vdst+= chromStride; | ||
2219 | } | ||
2220 | __asm__( | ||
2221 | EMMS" \n\t" | ||
2222 | SFENCE" \n\t" | ||
2223 | ::: "memory" | ||
2224 | ); | ||
2225 | } | ||
2226 | #endif /* ARCH_X86_32 */ | ||
2227 | |||
2228 | 16 | static av_cold void rgb2rgb_init_mmxext(void) | |
2229 | { | ||
2230 | 16 | rgb15to16 = rgb15to16_mmxext; | |
2231 | 16 | rgb15tobgr24 = rgb15tobgr24_mmxext; | |
2232 | 16 | rgb15to32 = rgb15to32_mmxext; | |
2233 | 16 | rgb16tobgr24 = rgb16tobgr24_mmxext; | |
2234 | 16 | rgb16to32 = rgb16to32_mmxext; | |
2235 | 16 | rgb16to15 = rgb16to15_mmxext; | |
2236 | 16 | rgb24tobgr16 = rgb24tobgr16_mmxext; | |
2237 | 16 | rgb24tobgr15 = rgb24tobgr15_mmxext; | |
2238 | 16 | rgb24tobgr32 = rgb24tobgr32_mmxext; | |
2239 | 16 | rgb32to16 = rgb32to16_mmxext; | |
2240 | 16 | rgb32to15 = rgb32to15_mmxext; | |
2241 | 16 | rgb32tobgr24 = rgb32tobgr24_mmxext; | |
2242 | 16 | rgb24to15 = rgb24to15_mmxext; | |
2243 | 16 | rgb24to16 = rgb24to16_mmxext; | |
2244 | 16 | rgb24tobgr24 = rgb24tobgr24_mmxext; | |
2245 | 16 | rgb32tobgr16 = rgb32tobgr16_mmxext; | |
2246 | 16 | rgb32tobgr15 = rgb32tobgr15_mmxext; | |
2247 | 16 | yv12toyuy2 = yv12toyuy2_mmxext; | |
2248 | 16 | yv12touyvy = yv12touyvy_mmxext; | |
2249 | 16 | yuv422ptoyuy2 = yuv422ptoyuy2_mmxext; | |
2250 | 16 | yuv422ptouyvy = yuv422ptouyvy_mmxext; | |
2251 | 16 | yuy2toyv12 = yuy2toyv12_mmxext; | |
2252 | 16 | vu9_to_vu12 = vu9_to_vu12_mmxext; | |
2253 | 16 | yvu9_to_yuy2 = yvu9_to_yuy2_mmxext; | |
2254 | #if ARCH_X86_32 | ||
2255 | uyvytoyuv422 = uyvytoyuv422_mmxext; | ||
2256 | #endif | ||
2257 | 16 | yuyvtoyuv422 = yuyvtoyuv422_mmxext; | |
2258 | |||
2259 | 16 | planar2x = planar2x_mmxext; | |
2260 | #if ARCH_X86_32 && HAVE_7REGS | ||
2261 | ff_rgb24toyv12 = rgb24toyv12_mmxext; | ||
2262 | #endif /* ARCH_X86_32 && HAVE_7REGS */ | ||
2263 | |||
2264 | 16 | yuyvtoyuv420 = yuyvtoyuv420_mmxext; | |
2265 | 16 | uyvytoyuv420 = uyvytoyuv420_mmxext; | |
2266 | 16 | } | |
2267 | |||
2268 | // SSE2 versions | ||
2269 | 17 | static void interleave_bytes_sse2(const uint8_t *src1, const uint8_t *src2, uint8_t *dest, | |
2270 | int width, int height, int src1Stride, | ||
2271 | int src2Stride, int dstStride) | ||
2272 | { | ||
2273 | 2/2 | 1115 | for (int h = 0; h < height; h++) { | |
2274 | 2/2 | 1098 | if (width >= 16) { | |
2275 | 1/2 | 197 | if (!((((intptr_t)src1) | ((intptr_t)src2) | ((intptr_t)dest))&15)) { | |
2276 | ✗ | __asm__( | |
2277 | "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" | ||
2278 | "1: \n\t" | ||
2279 | PREFETCH" 64(%1, %%"FF_REG_a") \n\t" | ||
2280 | PREFETCH" 64(%2, %%"FF_REG_a") \n\t" | ||
2281 | "movdqa (%1, %%"FF_REG_a"), %%xmm0 \n\t" | ||
2282 | "movdqa (%1, %%"FF_REG_a"), %%xmm1 \n\t" | ||
2283 | "movdqa (%2, %%"FF_REG_a"), %%xmm2 \n\t" | ||
2284 | "punpcklbw %%xmm2, %%xmm0 \n\t" | ||
2285 | "punpckhbw %%xmm2, %%xmm1 \n\t" | ||
2286 | "movntdq %%xmm0, (%0, %%"FF_REG_a", 2) \n\t" | ||
2287 | "movntdq %%xmm1, 16(%0, %%"FF_REG_a", 2) \n\t" | ||
2288 | "add $16, %%"FF_REG_a" \n\t" | ||
2289 | "cmp %3, %%"FF_REG_a" \n\t" | ||
2290 | " jb 1b \n\t" | ||
2291 | ✗ | ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) | |
2292 | : "memory", XMM_CLOBBERS("xmm0", "xmm1", "xmm2",) "%"FF_REG_a | ||
2293 | ); | ||
2294 | } else | ||
2295 | 197 | __asm__( | |
2296 | "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" | ||
2297 | "1: \n\t" | ||
2298 | PREFETCH" 64(%1, %%"FF_REG_a") \n\t" | ||
2299 | PREFETCH" 64(%2, %%"FF_REG_a") \n\t" | ||
2300 | "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" | ||
2301 | "movq 8(%1, %%"FF_REG_a"), %%mm2 \n\t" | ||
2302 | "movq %%mm0, %%mm1 \n\t" | ||
2303 | "movq %%mm2, %%mm3 \n\t" | ||
2304 | "movq (%2, %%"FF_REG_a"), %%mm4 \n\t" | ||
2305 | "movq 8(%2, %%"FF_REG_a"), %%mm5 \n\t" | ||
2306 | "punpcklbw %%mm4, %%mm0 \n\t" | ||
2307 | "punpckhbw %%mm4, %%mm1 \n\t" | ||
2308 | "punpcklbw %%mm5, %%mm2 \n\t" | ||
2309 | "punpckhbw %%mm5, %%mm3 \n\t" | ||
2310 | MOVNTQ" %%mm0, (%0, %%"FF_REG_a", 2) \n\t" | ||
2311 | MOVNTQ" %%mm1, 8(%0, %%"FF_REG_a", 2) \n\t" | ||
2312 | MOVNTQ" %%mm2, 16(%0, %%"FF_REG_a", 2) \n\t" | ||
2313 | MOVNTQ" %%mm3, 24(%0, %%"FF_REG_a", 2) \n\t" | ||
2314 | "add $16, %%"FF_REG_a" \n\t" | ||
2315 | "cmp %3, %%"FF_REG_a" \n\t" | ||
2316 | " jb 1b \n\t" | ||
2317 | 197 | ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) | |
2318 | : "memory", "%"FF_REG_a | ||
2319 | ); | ||
2320 | |||
2321 | } | ||
2322 | 2/2 | 9083 | for (int w = (width & (~15)); w < width; w++) { | |
2323 | 7985 | dest[2*w+0] = src1[w]; | |
2324 | 7985 | dest[2*w+1] = src2[w]; | |
2325 | } | ||
2326 | 1098 | dest += dstStride; | |
2327 | 1098 | src1 += src1Stride; | |
2328 | 1098 | src2 += src2Stride; | |
2329 | } | ||
2330 | 17 | __asm__( | |
2331 | EMMS" \n\t" | ||
2332 | SFENCE" \n\t" | ||
2333 | ::: "memory" | ||
2334 | ); | ||
2335 | 17 | } | |
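/*
 * The kernel above is a byte interleave: punpcklbw/punpckhbw with two
 * different sources merge their bytes alternately, and the aligned SSE2
 * path does the same 16 bytes at a time with movdqa/movntdq. The scalar
 * equivalent, as in the tail loop (illustrative helper):
 */
static inline void interleave_row_sketch(const uint8_t *src1, const uint8_t *src2,
                                         uint8_t *dest, int width)
{
    for (int w = 0; w < width; w++) {
        dest[2 * w + 0] = src1[w];
        dest[2 * w + 1] = src2[w];
    }
}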
2336 | |||
2337 | /* | ||
2338 | RGB15->RGB16 original by Strepto/Astral | ||
2339 | ported to gcc & bugfixed : A'rpi | ||
2340 | MMXEXT, 3DNOW optimization by Nick Kurshev | ||
2341 | 32-bit C version, and the and&add trick by Michael Niedermayer | ||
2342 | */ | ||
2343 | |||
2344 | #endif /* HAVE_INLINE_ASM */ | ||
2345 | |||
2346 | void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size); | ||
2347 | void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size); | ||
2348 | void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size); | ||
2349 | void ff_shuffle_bytes_3012_ssse3(const uint8_t *src, uint8_t *dst, int src_size); | ||
2350 | void ff_shuffle_bytes_3210_ssse3(const uint8_t *src, uint8_t *dst, int src_size); | ||
2351 | void ff_shuffle_bytes_3102_ssse3(const uint8_t *src, uint8_t *dst, int src_size); | ||
2352 | void ff_shuffle_bytes_2013_ssse3(const uint8_t *src, uint8_t *dst, int src_size); | ||
2353 | void ff_shuffle_bytes_2130_ssse3(const uint8_t *src, uint8_t *dst, int src_size); | ||
2354 | void ff_shuffle_bytes_1203_ssse3(const uint8_t *src, uint8_t *dst, int src_size); | ||
2355 | |||
2356 | #if ARCH_X86_64 | ||
2357 | void ff_shuffle_bytes_2103_avx2(const uint8_t *src, uint8_t *dst, int src_size); | ||
2358 | void ff_shuffle_bytes_0321_avx2(const uint8_t *src, uint8_t *dst, int src_size); | ||
2359 | void ff_shuffle_bytes_1230_avx2(const uint8_t *src, uint8_t *dst, int src_size); | ||
2360 | void ff_shuffle_bytes_3012_avx2(const uint8_t *src, uint8_t *dst, int src_size); | ||
2361 | void ff_shuffle_bytes_3210_avx2(const uint8_t *src, uint8_t *dst, int src_size); | ||
2362 | void ff_shuffle_bytes_3102_avx2(const uint8_t *src, uint8_t *dst, int src_size); | ||
2363 | void ff_shuffle_bytes_2013_avx2(const uint8_t *src, uint8_t *dst, int src_size); | ||
2364 | void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int src_size); | ||
2365 | void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int src_size); | ||
2366 | |||
2367 | void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | ||
2368 | const uint8_t *src, int width, int height, | ||
2369 | int lumStride, int chromStride, int srcStride); | ||
2370 | void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | ||
2371 | const uint8_t *src, int width, int height, | ||
2372 | int lumStride, int chromStride, int srcStride); | ||
2373 | void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | ||
2374 | const uint8_t *src, int width, int height, | ||
2375 | int lumStride, int chromStride, int srcStride); | ||
2376 | #endif | ||
2377 | |||
2378 | #define DEINTERLEAVE_BYTES(cpuext) \ | ||
2379 | void ff_nv12ToUV_ ## cpuext(uint8_t *dstU, uint8_t *dstV, \ | ||
2380 | const uint8_t *unused, \ | ||
2381 | const uint8_t *src1, \ | ||
2382 | const uint8_t *src2, \ | ||
2383 | int w, \ | ||
2384 | uint32_t *unused2, \ | ||
2385 | void *opq); \ | ||
2386 | static void deinterleave_bytes_ ## cpuext(const uint8_t *src, uint8_t *dst1, uint8_t *dst2, \ | ||
2387 | int width, int height, int srcStride, \ | ||
2388 | int dst1Stride, int dst2Stride) \ | ||
2389 | { \ | ||
2390 | for (int h = 0; h < height; h++) { \ | ||
2391 | if (width >= 16) \ | ||
2392 | ff_nv12ToUV_ ## cpuext(dst1, dst2, NULL, src, NULL, width - 15, NULL, NULL); \ | ||
2393 | for (int w = (width & (~15)); w < width; w++) { \ | ||
2394 | dst1[w] = src[2*w+0]; \ | ||
2395 | dst2[w] = src[2*w+1]; \ | ||
2396 | } \ | ||
2397 | src += srcStride; \ | ||
2398 | dst1 += dst1Stride; \ | ||
2399 | dst2 += dst2Stride; \ | ||
2400 | } \ | ||
2401 | } | ||
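/*
 * The macro above wraps the external ff_nv12ToUV_* kernels row by row: the
 * bulk is handed to the kernel with a width - 15 bound (when width >= 16)
 * and the scalar loop finishes from width & ~15 to width. A minimal scalar
 * sketch of one deinterleaved row (illustrative helper only):
 */
static inline void deinterleave_row_sketch(const uint8_t *src, uint8_t *dst1,
                                           uint8_t *dst2, int width)
{
    for (int w = 0; w < width; w++) {
        dst1[w] = src[2 * w + 0]; /* even bytes -> first plane  */
        dst2[w] = src[2 * w + 1]; /* odd bytes  -> second plane */
    }
}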
2402 | |||
2403 | #if HAVE_SSE2_EXTERNAL | ||
2404 | 6/6 | 17582 | DEINTERLEAVE_BYTES(sse2) | |
2405 | #endif | ||
2406 | #if HAVE_AVX_EXTERNAL | ||
2407 | 6/6 | 8853 | DEINTERLEAVE_BYTES(avx) | |
2408 | #endif | ||
2409 | |||
2410 | 4497 | av_cold void rgb2rgb_init_x86(void) | |
2411 | { | ||
2412 | 4497 | int cpu_flags = av_get_cpu_flags(); | |
2413 | |||
2414 | #if HAVE_INLINE_ASM | ||
2415 | 2/2 | 4497 | if (INLINE_MMXEXT(cpu_flags)) | |
2416 | 16 | rgb2rgb_init_mmxext(); | |
2417 | 2/2 | 4497 | if (INLINE_SSE2(cpu_flags)) | |
2418 | 14 | interleaveBytes = interleave_bytes_sse2; | |
2419 | #endif /* HAVE_INLINE_ASM */ | ||
2420 | |||
2421 | #if HAVE_SSE2_EXTERNAL | ||
2422 | 2/2 | 4497 | if (EXTERNAL_SSE2(cpu_flags)) { | |
2423 | #if ARCH_X86_64 | ||
2424 | 14 | uyvytoyuv422 = ff_uyvytoyuv422_sse2; | |
2425 | #endif | ||
2426 | 14 | deinterleaveBytes = deinterleave_bytes_sse2; | |
2427 | } | ||
2428 | #endif | ||
2429 | 2/2 | 4497 | if (EXTERNAL_SSSE3(cpu_flags)) { | |
2430 | 12 | shuffle_bytes_0321 = ff_shuffle_bytes_0321_ssse3; | |
2431 | 12 | shuffle_bytes_2103 = ff_shuffle_bytes_2103_ssse3; | |
2432 | 12 | shuffle_bytes_1230 = ff_shuffle_bytes_1230_ssse3; | |
2433 | 12 | shuffle_bytes_3012 = ff_shuffle_bytes_3012_ssse3; | |
2434 | 12 | shuffle_bytes_3210 = ff_shuffle_bytes_3210_ssse3; | |
2435 | 12 | shuffle_bytes_3102 = ff_shuffle_bytes_3102_ssse3; | |
2436 | 12 | shuffle_bytes_2013 = ff_shuffle_bytes_2013_ssse3; | |
2437 | 12 | shuffle_bytes_2130 = ff_shuffle_bytes_2130_ssse3; | |
2438 | 12 | shuffle_bytes_1203 = ff_shuffle_bytes_1203_ssse3; | |
2439 | } | ||
2440 | #if HAVE_AVX_EXTERNAL | ||
2441 | 2/2 | 4497 | if (EXTERNAL_AVX(cpu_flags)) { | |
2442 | 8 | deinterleaveBytes = deinterleave_bytes_avx; | |
2443 | #if ARCH_X86_64 | ||
2444 | 8 | uyvytoyuv422 = ff_uyvytoyuv422_avx; | |
2445 | } | ||
2446 | 3/4 | 4497 | if (EXTERNAL_AVX2_FAST(cpu_flags)) { | |
2447 | 6 | shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx2; | |
2448 | 6 | shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx2; | |
2449 | 6 | shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx2; | |
2450 | 6 | shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx2; | |
2451 | 6 | shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx2; | |
2452 | 6 | shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx2; | |
2453 | 6 | shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx2; | |
2454 | 6 | shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2; | |
2455 | 6 | shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2; | |
2456 | } | ||
2457 | 3/4 | 4497 | if (EXTERNAL_AVX2_FAST(cpu_flags)) { | |
2458 | 6 | uyvytoyuv422 = ff_uyvytoyuv422_avx2; | |
2459 | #endif | ||
2460 | } | ||
2461 | #endif | ||
2462 | 4497 | } | |
2463 |