Line | Branch | Exec | Source |
---|---|---|---|
1 | /* | ||
2 | * software RGB to RGB converter | ||
3 | * plus software PAL8 to RGB converter | ||
4 | * software YUV to YUV converter | ||
5 | * software YUV to RGB converter | ||
6 | * Written by Nick Kurshev. | ||
7 | * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) | ||
8 | * | ||
9 | * This file is part of FFmpeg. | ||
10 | * | ||
11 | * FFmpeg is free software; you can redistribute it and/or | ||
12 | * modify it under the terms of the GNU Lesser General Public | ||
13 | * License as published by the Free Software Foundation; either | ||
14 | * version 2.1 of the License, or (at your option) any later version. | ||
15 | * | ||
16 | * FFmpeg is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
19 | * Lesser General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU Lesser General Public | ||
22 | * License along with FFmpeg; if not, write to the Free Software | ||
23 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
24 | */ | ||
25 | |||
26 | #include <stdint.h> | ||
27 | |||
28 | #include "config.h" | ||
29 | #include "libavutil/attributes.h" | ||
30 | #include "libavutil/x86/cpu.h" | ||
31 | #include "libavutil/cpu.h" | ||
32 | #include "libavutil/bswap.h" | ||
33 | #include "libavutil/mem_internal.h" | ||
34 | |||
35 | #include "libswscale/rgb2rgb.h" | ||
36 | #include "libswscale/swscale.h" | ||
37 | #include "libswscale/swscale_internal.h" | ||
38 | |||
39 | #if HAVE_INLINE_ASM | ||
40 | #include "libavutil/x86/asm.h" | ||
41 | |||
42 | DECLARE_ASM_CONST(8, uint64_t, mmx_ff) = 0x00000000000000FFULL; | ||
43 | DECLARE_ASM_CONST(8, uint64_t, mmx_null) = 0x0000000000000000ULL; | ||
44 | DECLARE_ASM_CONST(8, uint64_t, mask32a) = 0xFF000000FF000000ULL; | ||
45 | DECLARE_ASM_CONST(8, uint64_t, mask3216br) = 0x00F800F800F800F8ULL; | ||
46 | DECLARE_ASM_CONST(8, uint64_t, mask3216g) = 0x0000FC000000FC00ULL; | ||
47 | DECLARE_ASM_CONST(8, uint64_t, mask3215g) = 0x0000F8000000F800ULL; | ||
48 | DECLARE_ASM_CONST(8, uint64_t, mul3216) = 0x2000000420000004ULL; | ||
49 | DECLARE_ASM_CONST(8, uint64_t, mul3215) = 0x2000000820000008ULL; | ||
50 | DECLARE_ASM_CONST(8, uint64_t, mask24b) = 0x00FF0000FF0000FFULL; | ||
51 | DECLARE_ASM_CONST(8, uint64_t, mask24g) = 0xFF0000FF0000FF00ULL; | ||
52 | DECLARE_ASM_CONST(8, uint64_t, mask24r) = 0x0000FF0000FF0000ULL; | ||
53 | DECLARE_ASM_CONST(8, uint64_t, mask24l) = 0x0000000000FFFFFFULL; | ||
54 | DECLARE_ASM_CONST(8, uint64_t, mask24h) = 0x0000FFFFFF000000ULL; | ||
55 | DECLARE_ASM_CONST(8, uint64_t, mask15b) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */ | ||
56 | DECLARE_ASM_CONST(8, uint64_t, mask15rg) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */ | ||
57 | DECLARE_ASM_CONST(8, uint64_t, mask15s) = 0xFFE0FFE0FFE0FFE0ULL; | ||
58 | DECLARE_ASM_CONST(8, uint64_t, mask15g) = 0x03E003E003E003E0ULL; | ||
59 | DECLARE_ASM_CONST(8, uint64_t, mask15r) = 0x7C007C007C007C00ULL; | ||
60 | #define mask16b mask15b | ||
61 | DECLARE_ASM_CONST(8, uint64_t, mask16g) = 0x07E007E007E007E0ULL; | ||
62 | DECLARE_ASM_CONST(8, uint64_t, mask16r) = 0xF800F800F800F800ULL; | ||
63 | DECLARE_ASM_CONST(8, uint64_t, red_16mask) = 0x0000f8000000f800ULL; | ||
64 | DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL; | ||
65 | DECLARE_ASM_CONST(8, uint64_t, blue_16mask) = 0x0000001f0000001fULL; | ||
66 | DECLARE_ASM_CONST(8, uint64_t, red_15mask) = 0x00007c0000007c00ULL; | ||
67 | DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL; | ||
68 | DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL; | ||
69 | DECLARE_ASM_CONST(8, uint64_t, mul15_mid) = 0x4200420042004200ULL; | ||
70 | DECLARE_ASM_CONST(8, uint64_t, mul15_hi) = 0x0210021002100210ULL; | ||
71 | DECLARE_ASM_CONST(8, uint64_t, mul16_mid) = 0x2080208020802080ULL; | ||
72 | |||
73 | #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5)) | ||
74 | #define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5)) | ||
75 | #define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5)) | ||
76 | #define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5)) | ||
77 | #define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5)) | ||
78 | #define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5)) | ||
79 | #define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5)) | ||
80 | #define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5)) | ||
81 | #define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5)) | ||
82 | |||
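These macros hold the BT.601 limited-range RGB-to-YUV weights in fixed point, rounded after scaling by 1<<RGB2YUV_SHIFT. As a hedged scalar sketch of how such coefficients produce one luma sample — RGB2YUV_SHIFT comes from the swscale headers, and the helper name is illustrative, not part of this file:

```c
static uint8_t rgb_to_y(uint8_t r, uint8_t g, uint8_t b)
{
    /* round at the fixed-point shift, then bias into the 16..235 range */
    int y = (RY * r + GY * g + BY * b + (1 << (RGB2YUV_SHIFT - 1)))
            >> RGB2YUV_SHIFT;
    return (uint8_t)(y + 16);
}
```

With r = g = b = 255 the three weights sum to about 0.859, giving 219 + 16 = 235, the top of the limited range.
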
83 | // MMXEXT versions | ||
84 | #define PREFETCH "prefetchnta" | ||
85 | #define PAVGB "pavgb" | ||
86 | #define MOVNTQ "movntq" | ||
87 | #define SFENCE "sfence" | ||
88 | |||
89 | #define EMMS "emms" | ||
90 | |||
91 | ✗ | static inline void rgb24tobgr32_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
92 | { | ||
93 | ✗ | uint8_t *dest = dst; | |
94 | ✗ | const uint8_t *s = src; | |
95 | const uint8_t *end; | ||
96 | const uint8_t *mm_end; | ||
97 | ✗ | end = s + src_size; | |
98 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); | |
99 | ✗ | mm_end = end - 23; | |
100 | ✗ | __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory"); | |
101 | ✗ | while (s < mm_end) { | |
102 | ✗ | __asm__ volatile( | |
103 | PREFETCH" 32(%1) \n\t" | ||
104 | "movd (%1), %%mm0 \n\t" | ||
105 | "punpckldq 3(%1), %%mm0 \n\t" | ||
106 | "movd 6(%1), %%mm1 \n\t" | ||
107 | "punpckldq 9(%1), %%mm1 \n\t" | ||
108 | "movd 12(%1), %%mm2 \n\t" | ||
109 | "punpckldq 15(%1), %%mm2 \n\t" | ||
110 | "movd 18(%1), %%mm3 \n\t" | ||
111 | "punpckldq 21(%1), %%mm3 \n\t" | ||
112 | "por %%mm7, %%mm0 \n\t" | ||
113 | "por %%mm7, %%mm1 \n\t" | ||
114 | "por %%mm7, %%mm2 \n\t" | ||
115 | "por %%mm7, %%mm3 \n\t" | ||
116 | MOVNTQ" %%mm0, (%0) \n\t" | ||
117 | MOVNTQ" %%mm1, 8(%0) \n\t" | ||
118 | MOVNTQ" %%mm2, 16(%0) \n\t" | ||
119 | MOVNTQ" %%mm3, 24(%0)" | ||
120 | :: "r"(dest), "r"(s) | ||
121 | :"memory"); | ||
122 | ✗ | dest += 32; | |
123 | ✗ | s += 24; | |
124 | } | ||
125 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
126 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
127 | ✗ | while (s < end) { | |
128 | ✗ | *dest++ = *s++; | |
129 | ✗ | *dest++ = *s++; | |
130 | ✗ | *dest++ = *s++; | |
131 | ✗ | *dest++ = 255; | |
132 | } | ||
133 | ✗ | } | |
134 | |||
135 | #define STORE_BGR24_MMX \ | ||
136 | "psrlq $8, %%mm2 \n\t" \ | ||
137 | "psrlq $8, %%mm3 \n\t" \ | ||
138 | "psrlq $8, %%mm6 \n\t" \ | ||
139 | "psrlq $8, %%mm7 \n\t" \ | ||
140 | "pand "MANGLE(mask24l)", %%mm0\n\t" \ | ||
141 | "pand "MANGLE(mask24l)", %%mm1\n\t" \ | ||
142 | "pand "MANGLE(mask24l)", %%mm4\n\t" \ | ||
143 | "pand "MANGLE(mask24l)", %%mm5\n\t" \ | ||
144 | "pand "MANGLE(mask24h)", %%mm2\n\t" \ | ||
145 | "pand "MANGLE(mask24h)", %%mm3\n\t" \ | ||
146 | "pand "MANGLE(mask24h)", %%mm6\n\t" \ | ||
147 | "pand "MANGLE(mask24h)", %%mm7\n\t" \ | ||
148 | "por %%mm2, %%mm0 \n\t" \ | ||
149 | "por %%mm3, %%mm1 \n\t" \ | ||
150 | "por %%mm6, %%mm4 \n\t" \ | ||
151 | "por %%mm7, %%mm5 \n\t" \ | ||
152 | \ | ||
153 | "movq %%mm1, %%mm2 \n\t" \ | ||
154 | "movq %%mm4, %%mm3 \n\t" \ | ||
155 | "psllq $48, %%mm2 \n\t" \ | ||
156 | "psllq $32, %%mm3 \n\t" \ | ||
157 | "por %%mm2, %%mm0 \n\t" \ | ||
158 | "psrlq $16, %%mm1 \n\t" \ | ||
159 | "psrlq $32, %%mm4 \n\t" \ | ||
160 | "psllq $16, %%mm5 \n\t" \ | ||
161 | "por %%mm3, %%mm1 \n\t" \ | ||
162 | "por %%mm5, %%mm4 \n\t" \ | ||
163 | \ | ||
164 | MOVNTQ" %%mm0, (%0) \n\t" \ | ||
165 | MOVNTQ" %%mm1, 8(%0) \n\t" \ | ||
166 | MOVNTQ" %%mm4, 16(%0)" | ||
167 | |||
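STORE_BGR24_MMX packs the eight 4-byte pixels staged in mm0..mm7 down to 24 output bytes by shifting the filler byte out of the high halves and recombining the pieces. A scalar sketch of the same packing, assuming BGRx input with the filler in the fourth byte:

```c
/* Copy the three colour bytes of each 4-byte pixel, drop the filler. */
static void pack_32to24(const uint8_t *src, uint8_t *dst, int pixels)
{
    for (int i = 0; i < pixels; i++) {
        dst[3 * i + 0] = src[4 * i + 0];
        dst[3 * i + 1] = src[4 * i + 1];
        dst[3 * i + 2] = src[4 * i + 2];
    }
}
```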
168 | |||
169 | ✗ | static inline void rgb32tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
170 | { | ||
171 | ✗ | uint8_t *dest = dst; | |
172 | ✗ | const uint8_t *s = src; | |
173 | const uint8_t *end; | ||
174 | const uint8_t *mm_end; | ||
175 | ✗ | end = s + src_size; | |
176 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); | |
177 | ✗ | mm_end = end - 31; | |
178 | ✗ | while (s < mm_end) { | |
179 | ✗ | __asm__ volatile( | |
180 | PREFETCH" 32(%1) \n\t" | ||
181 | "movq (%1), %%mm0 \n\t" | ||
182 | "movq 8(%1), %%mm1 \n\t" | ||
183 | "movq 16(%1), %%mm4 \n\t" | ||
184 | "movq 24(%1), %%mm5 \n\t" | ||
185 | "movq %%mm0, %%mm2 \n\t" | ||
186 | "movq %%mm1, %%mm3 \n\t" | ||
187 | "movq %%mm4, %%mm6 \n\t" | ||
188 | "movq %%mm5, %%mm7 \n\t" | ||
189 | STORE_BGR24_MMX | ||
190 | :: "r"(dest), "r"(s) | ||
191 | NAMED_CONSTRAINTS_ADD(mask24l,mask24h) | ||
192 | :"memory"); | ||
193 | ✗ | dest += 24; | |
194 | ✗ | s += 32; | |
195 | } | ||
196 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
197 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
198 | ✗ | while (s < end) { | |
199 | ✗ | *dest++ = *s++; | |
200 | ✗ | *dest++ = *s++; | |
201 | ✗ | *dest++ = *s++; | |
202 | ✗ | s++; | |
203 | } | ||
204 | ✗ | } | |
205 | |||
206 | /* | ||
207 | original by Strepto/Astral | ||
208 | ported to gcc & bugfixed: A'rpi | ||
209 | MMXEXT, 3DNOW optimization by Nick Kurshev | ||
210 | 32-bit C version and the and&add trick by Michael Niedermayer | ||
211 | */ | ||
212 | ✗ | static inline void rgb15to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
213 | { | ||
214 | ✗ | register const uint8_t* s=src; | |
215 | ✗ | register uint8_t* d=dst; | |
216 | register const uint8_t *end; | ||
217 | const uint8_t *mm_end; | ||
218 | ✗ | end = s + src_size; | |
219 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*s)); | |
220 | ✗ | __asm__ volatile("movq %0, %%mm4"::"m"(mask15s)); | |
221 | ✗ | mm_end = end - 15; | |
222 | ✗ | while (s<mm_end) { | |
223 | ✗ | __asm__ volatile( | |
224 | PREFETCH" 32(%1) \n\t" | ||
225 | "movq (%1), %%mm0 \n\t" | ||
226 | "movq 8(%1), %%mm2 \n\t" | ||
227 | "movq %%mm0, %%mm1 \n\t" | ||
228 | "movq %%mm2, %%mm3 \n\t" | ||
229 | "pand %%mm4, %%mm0 \n\t" | ||
230 | "pand %%mm4, %%mm2 \n\t" | ||
231 | "paddw %%mm1, %%mm0 \n\t" | ||
232 | "paddw %%mm3, %%mm2 \n\t" | ||
233 | MOVNTQ" %%mm0, (%0) \n\t" | ||
234 | MOVNTQ" %%mm2, 8(%0)" | ||
235 | :: "r"(d), "r"(s) | ||
236 | ); | ||
237 | ✗ | d+=16; | |
238 | ✗ | s+=16; | |
239 | } | ||
240 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
241 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
242 | ✗ | mm_end = end - 3; | |
243 | ✗ | while (s < mm_end) { | |
244 | ✗ | register unsigned x= *((const uint32_t *)s); | |
245 | ✗ | *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0); | |
246 | ✗ | d+=4; | |
247 | ✗ | s+=4; | |
248 | } | ||
249 | ✗ | if (s < end) { | |
250 | ✗ | register unsigned short x= *((const uint16_t *)s); | |
251 | ✗ | *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0); | |
252 | } | ||
253 | ✗ | } | |
254 | |||
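The scalar tail above is the and&add trick credited in the comment: going from xRRRRRGGGGGBBBBB to RRRRRGGGGGGBBBBB moves red and green up one bit while blue stays put, and adding the R|G field (mask 0x7FE0) back onto the pixel doubles exactly those bits. A one-pixel sketch of the same expression:

```c
/* (x & 0x7FFF) keeps the 15 data bits; adding (x & 0x7FE0) shifts
 * red and green left by one.  The new green LSB stays zero, e.g.
 * RGB15 white 0x7FFF becomes 0xFFDF. */
static uint16_t rgb15to16_pixel(uint16_t x)
{
    return (uint16_t)((x & 0x7FFF) + (x & 0x7FE0));
}
```
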
255 | ✗ | static inline void rgb16to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
256 | { | ||
257 | ✗ | register const uint8_t* s=src; | |
258 | ✗ | register uint8_t* d=dst; | |
259 | register const uint8_t *end; | ||
260 | const uint8_t *mm_end; | ||
261 | ✗ | end = s + src_size; | |
262 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*s)); | |
263 | ✗ | __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg)); | |
264 | ✗ | __asm__ volatile("movq %0, %%mm6"::"m"(mask15b)); | |
265 | ✗ | mm_end = end - 15; | |
266 | ✗ | while (s<mm_end) { | |
267 | ✗ | __asm__ volatile( | |
268 | PREFETCH" 32(%1) \n\t" | ||
269 | "movq (%1), %%mm0 \n\t" | ||
270 | "movq 8(%1), %%mm2 \n\t" | ||
271 | "movq %%mm0, %%mm1 \n\t" | ||
272 | "movq %%mm2, %%mm3 \n\t" | ||
273 | "psrlq $1, %%mm0 \n\t" | ||
274 | "psrlq $1, %%mm2 \n\t" | ||
275 | "pand %%mm7, %%mm0 \n\t" | ||
276 | "pand %%mm7, %%mm2 \n\t" | ||
277 | "pand %%mm6, %%mm1 \n\t" | ||
278 | "pand %%mm6, %%mm3 \n\t" | ||
279 | "por %%mm1, %%mm0 \n\t" | ||
280 | "por %%mm3, %%mm2 \n\t" | ||
281 | MOVNTQ" %%mm0, (%0) \n\t" | ||
282 | MOVNTQ" %%mm2, 8(%0)" | ||
283 | :: "r"(d), "r"(s) | ||
284 | ); | ||
285 | ✗ | d+=16; | |
286 | ✗ | s+=16; | |
287 | } | ||
288 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
289 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
290 | ✗ | mm_end = end - 3; | |
291 | ✗ | while (s < mm_end) { | |
292 | ✗ | register uint32_t x= *((const uint32_t*)s); | |
293 | ✗ | *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F); | |
294 | ✗ | s+=4; | |
295 | ✗ | d+=4; | |
296 | } | ||
297 | ✗ | if (s < end) { | |
298 | ✗ | register uint16_t x= *((const uint16_t*)s); | |
299 | ✗ | *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F); | |
300 | } | ||
301 | ✗ | } | |
302 | |||
303 | ✗ | static inline void rgb32to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
304 | { | ||
305 | ✗ | const uint8_t *s = src; | |
306 | const uint8_t *end; | ||
307 | const uint8_t *mm_end; | ||
308 | ✗ | uint16_t *d = (uint16_t *)dst; | |
309 | ✗ | end = s + src_size; | |
310 | ✗ | mm_end = end - 15; | |
311 | ✗ | __asm__ volatile( | |
312 | "movq %3, %%mm5 \n\t" | ||
313 | "movq %4, %%mm6 \n\t" | ||
314 | "movq %5, %%mm7 \n\t" | ||
315 | "jmp 2f \n\t" | ||
316 | ".p2align 4 \n\t" | ||
317 | "1: \n\t" | ||
318 | PREFETCH" 32(%1) \n\t" | ||
319 | "movd (%1), %%mm0 \n\t" | ||
320 | "movd 4(%1), %%mm3 \n\t" | ||
321 | "punpckldq 8(%1), %%mm0 \n\t" | ||
322 | "punpckldq 12(%1), %%mm3 \n\t" | ||
323 | "movq %%mm0, %%mm1 \n\t" | ||
324 | "movq %%mm3, %%mm4 \n\t" | ||
325 | "pand %%mm6, %%mm0 \n\t" | ||
326 | "pand %%mm6, %%mm3 \n\t" | ||
327 | "pmaddwd %%mm7, %%mm0 \n\t" | ||
328 | "pmaddwd %%mm7, %%mm3 \n\t" | ||
329 | "pand %%mm5, %%mm1 \n\t" | ||
330 | "pand %%mm5, %%mm4 \n\t" | ||
331 | "por %%mm1, %%mm0 \n\t" | ||
332 | "por %%mm4, %%mm3 \n\t" | ||
333 | "psrld $5, %%mm0 \n\t" | ||
334 | "pslld $11, %%mm3 \n\t" | ||
335 | "por %%mm3, %%mm0 \n\t" | ||
336 | MOVNTQ" %%mm0, (%0) \n\t" | ||
337 | "add $16, %1 \n\t" | ||
338 | "add $8, %0 \n\t" | ||
339 | "2: \n\t" | ||
340 | "cmp %2, %1 \n\t" | ||
341 | " jb 1b \n\t" | ||
342 | : "+r" (d), "+r"(s) | ||
343 | : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) | ||
344 | ); | ||
345 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
346 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
347 | ✗ | while (s < end) { | |
348 | ✗ | register int rgb = *(const uint32_t*)s; s += 4; | |
349 | ✗ | *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8); | |
350 | } | ||
351 | ✗ | } | |
352 | |||
353 | ✗ | static inline void rgb32tobgr16_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
354 | { | ||
355 | ✗ | const uint8_t *s = src; | |
356 | const uint8_t *end; | ||
357 | const uint8_t *mm_end; | ||
358 | ✗ | uint16_t *d = (uint16_t *)dst; | |
359 | ✗ | end = s + src_size; | |
360 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); | |
361 | ✗ | __asm__ volatile( | |
362 | "movq %0, %%mm7 \n\t" | ||
363 | "movq %1, %%mm6 \n\t" | ||
364 | ::"m"(red_16mask),"m"(green_16mask)); | ||
365 | ✗ | mm_end = end - 15; | |
366 | ✗ | while (s < mm_end) { | |
367 | ✗ | __asm__ volatile( | |
368 | PREFETCH" 32(%1) \n\t" | ||
369 | "movd (%1), %%mm0 \n\t" | ||
370 | "movd 4(%1), %%mm3 \n\t" | ||
371 | "punpckldq 8(%1), %%mm0 \n\t" | ||
372 | "punpckldq 12(%1), %%mm3 \n\t" | ||
373 | "movq %%mm0, %%mm1 \n\t" | ||
374 | "movq %%mm0, %%mm2 \n\t" | ||
375 | "movq %%mm3, %%mm4 \n\t" | ||
376 | "movq %%mm3, %%mm5 \n\t" | ||
377 | "psllq $8, %%mm0 \n\t" | ||
378 | "psllq $8, %%mm3 \n\t" | ||
379 | "pand %%mm7, %%mm0 \n\t" | ||
380 | "pand %%mm7, %%mm3 \n\t" | ||
381 | "psrlq $5, %%mm1 \n\t" | ||
382 | "psrlq $5, %%mm4 \n\t" | ||
383 | "pand %%mm6, %%mm1 \n\t" | ||
384 | "pand %%mm6, %%mm4 \n\t" | ||
385 | "psrlq $19, %%mm2 \n\t" | ||
386 | "psrlq $19, %%mm5 \n\t" | ||
387 | "pand %2, %%mm2 \n\t" | ||
388 | "pand %2, %%mm5 \n\t" | ||
389 | "por %%mm1, %%mm0 \n\t" | ||
390 | "por %%mm4, %%mm3 \n\t" | ||
391 | "por %%mm2, %%mm0 \n\t" | ||
392 | "por %%mm5, %%mm3 \n\t" | ||
393 | "psllq $16, %%mm3 \n\t" | ||
394 | "por %%mm3, %%mm0 \n\t" | ||
395 | MOVNTQ" %%mm0, (%0) \n\t" | ||
396 | :: "r"(d),"r"(s),"m"(blue_16mask):"memory"); | ||
397 | ✗ | d += 4; | |
398 | ✗ | s += 16; | |
399 | } | ||
400 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
401 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
402 | ✗ | while (s < end) { | |
403 | ✗ | register int rgb = *(const uint32_t*)s; s += 4; | |
404 | ✗ | *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19); | |
405 | } | ||
406 | ✗ | } | |
407 | |||
408 | ✗ | static inline void rgb32to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
409 | { | ||
410 | ✗ | const uint8_t *s = src; | |
411 | const uint8_t *end; | ||
412 | const uint8_t *mm_end; | ||
413 | ✗ | uint16_t *d = (uint16_t *)dst; | |
414 | ✗ | end = s + src_size; | |
415 | ✗ | mm_end = end - 15; | |
416 | ✗ | __asm__ volatile( | |
417 | "movq %3, %%mm5 \n\t" | ||
418 | "movq %4, %%mm6 \n\t" | ||
419 | "movq %5, %%mm7 \n\t" | ||
420 | "jmp 2f \n\t" | ||
421 | ".p2align 4 \n\t" | ||
422 | "1: \n\t" | ||
423 | PREFETCH" 32(%1) \n\t" | ||
424 | "movd (%1), %%mm0 \n\t" | ||
425 | "movd 4(%1), %%mm3 \n\t" | ||
426 | "punpckldq 8(%1), %%mm0 \n\t" | ||
427 | "punpckldq 12(%1), %%mm3 \n\t" | ||
428 | "movq %%mm0, %%mm1 \n\t" | ||
429 | "movq %%mm3, %%mm4 \n\t" | ||
430 | "pand %%mm6, %%mm0 \n\t" | ||
431 | "pand %%mm6, %%mm3 \n\t" | ||
432 | "pmaddwd %%mm7, %%mm0 \n\t" | ||
433 | "pmaddwd %%mm7, %%mm3 \n\t" | ||
434 | "pand %%mm5, %%mm1 \n\t" | ||
435 | "pand %%mm5, %%mm4 \n\t" | ||
436 | "por %%mm1, %%mm0 \n\t" | ||
437 | "por %%mm4, %%mm3 \n\t" | ||
438 | "psrld $6, %%mm0 \n\t" | ||
439 | "pslld $10, %%mm3 \n\t" | ||
440 | "por %%mm3, %%mm0 \n\t" | ||
441 | MOVNTQ" %%mm0, (%0) \n\t" | ||
442 | "add $16, %1 \n\t" | ||
443 | "add $8, %0 \n\t" | ||
444 | "2: \n\t" | ||
445 | "cmp %2, %1 \n\t" | ||
446 | " jb 1b \n\t" | ||
447 | : "+r" (d), "+r"(s) | ||
448 | : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) | ||
449 | ); | ||
450 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
451 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
452 | ✗ | while (s < end) { | |
453 | ✗ | register int rgb = *(const uint32_t*)s; s += 4; | |
454 | ✗ | *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9); | |
455 | } | ||
456 | ✗ | } | |
457 | |||
458 | ✗ | static inline void rgb32tobgr15_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
459 | { | ||
460 | ✗ | const uint8_t *s = src; | |
461 | const uint8_t *end; | ||
462 | const uint8_t *mm_end; | ||
463 | ✗ | uint16_t *d = (uint16_t *)dst; | |
464 | ✗ | end = s + src_size; | |
465 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); | |
466 | ✗ | __asm__ volatile( | |
467 | "movq %0, %%mm7 \n\t" | ||
468 | "movq %1, %%mm6 \n\t" | ||
469 | ::"m"(red_15mask),"m"(green_15mask)); | ||
470 | ✗ | mm_end = end - 15; | |
471 | ✗ | while (s < mm_end) { | |
472 | ✗ | __asm__ volatile( | |
473 | PREFETCH" 32(%1) \n\t" | ||
474 | "movd (%1), %%mm0 \n\t" | ||
475 | "movd 4(%1), %%mm3 \n\t" | ||
476 | "punpckldq 8(%1), %%mm0 \n\t" | ||
477 | "punpckldq 12(%1), %%mm3 \n\t" | ||
478 | "movq %%mm0, %%mm1 \n\t" | ||
479 | "movq %%mm0, %%mm2 \n\t" | ||
480 | "movq %%mm3, %%mm4 \n\t" | ||
481 | "movq %%mm3, %%mm5 \n\t" | ||
482 | "psllq $7, %%mm0 \n\t" | ||
483 | "psllq $7, %%mm3 \n\t" | ||
484 | "pand %%mm7, %%mm0 \n\t" | ||
485 | "pand %%mm7, %%mm3 \n\t" | ||
486 | "psrlq $6, %%mm1 \n\t" | ||
487 | "psrlq $6, %%mm4 \n\t" | ||
488 | "pand %%mm6, %%mm1 \n\t" | ||
489 | "pand %%mm6, %%mm4 \n\t" | ||
490 | "psrlq $19, %%mm2 \n\t" | ||
491 | "psrlq $19, %%mm5 \n\t" | ||
492 | "pand %2, %%mm2 \n\t" | ||
493 | "pand %2, %%mm5 \n\t" | ||
494 | "por %%mm1, %%mm0 \n\t" | ||
495 | "por %%mm4, %%mm3 \n\t" | ||
496 | "por %%mm2, %%mm0 \n\t" | ||
497 | "por %%mm5, %%mm3 \n\t" | ||
498 | "psllq $16, %%mm3 \n\t" | ||
499 | "por %%mm3, %%mm0 \n\t" | ||
500 | MOVNTQ" %%mm0, (%0) \n\t" | ||
501 | ::"r"(d),"r"(s),"m"(blue_15mask):"memory"); | ||
502 | ✗ | d += 4; | |
503 | ✗ | s += 16; | |
504 | } | ||
505 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
506 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
507 | ✗ | while (s < end) { | |
508 | ✗ | register int rgb = *(const uint32_t*)s; s += 4; | |
509 | ✗ | *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19); | |
510 | } | ||
511 | ✗ | } | |
512 | |||
513 | ✗ | static inline void rgb24tobgr16_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
514 | { | ||
515 | ✗ | const uint8_t *s = src; | |
516 | const uint8_t *end; | ||
517 | const uint8_t *mm_end; | ||
518 | ✗ | uint16_t *d = (uint16_t *)dst; | |
519 | ✗ | end = s + src_size; | |
520 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); | |
521 | ✗ | __asm__ volatile( | |
522 | "movq %0, %%mm7 \n\t" | ||
523 | "movq %1, %%mm6 \n\t" | ||
524 | ::"m"(red_16mask),"m"(green_16mask)); | ||
525 | ✗ | mm_end = end - 11; | |
526 | ✗ | while (s < mm_end) { | |
527 | ✗ | __asm__ volatile( | |
528 | PREFETCH" 32(%1) \n\t" | ||
529 | "movd (%1), %%mm0 \n\t" | ||
530 | "movd 3(%1), %%mm3 \n\t" | ||
531 | "punpckldq 6(%1), %%mm0 \n\t" | ||
532 | "punpckldq 9(%1), %%mm3 \n\t" | ||
533 | "movq %%mm0, %%mm1 \n\t" | ||
534 | "movq %%mm0, %%mm2 \n\t" | ||
535 | "movq %%mm3, %%mm4 \n\t" | ||
536 | "movq %%mm3, %%mm5 \n\t" | ||
537 | "psrlq $3, %%mm0 \n\t" | ||
538 | "psrlq $3, %%mm3 \n\t" | ||
539 | "pand %2, %%mm0 \n\t" | ||
540 | "pand %2, %%mm3 \n\t" | ||
541 | "psrlq $5, %%mm1 \n\t" | ||
542 | "psrlq $5, %%mm4 \n\t" | ||
543 | "pand %%mm6, %%mm1 \n\t" | ||
544 | "pand %%mm6, %%mm4 \n\t" | ||
545 | "psrlq $8, %%mm2 \n\t" | ||
546 | "psrlq $8, %%mm5 \n\t" | ||
547 | "pand %%mm7, %%mm2 \n\t" | ||
548 | "pand %%mm7, %%mm5 \n\t" | ||
549 | "por %%mm1, %%mm0 \n\t" | ||
550 | "por %%mm4, %%mm3 \n\t" | ||
551 | "por %%mm2, %%mm0 \n\t" | ||
552 | "por %%mm5, %%mm3 \n\t" | ||
553 | "psllq $16, %%mm3 \n\t" | ||
554 | "por %%mm3, %%mm0 \n\t" | ||
555 | MOVNTQ" %%mm0, (%0) \n\t" | ||
556 | ::"r"(d),"r"(s),"m"(blue_16mask):"memory"); | ||
557 | ✗ | d += 4; | |
558 | ✗ | s += 12; | |
559 | } | ||
560 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
561 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
562 | ✗ | while (s < end) { | |
563 | ✗ | const int b = *s++; | |
564 | ✗ | const int g = *s++; | |
565 | ✗ | const int r = *s++; | |
566 | ✗ | *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); | |
567 | } | ||
568 | ✗ | } | |
569 | |||
570 | ✗ | static inline void rgb24to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
571 | { | ||
572 | ✗ | const uint8_t *s = src; | |
573 | const uint8_t *end; | ||
574 | const uint8_t *mm_end; | ||
575 | ✗ | uint16_t *d = (uint16_t *)dst; | |
576 | ✗ | end = s + src_size; | |
577 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); | |
578 | ✗ | __asm__ volatile( | |
579 | "movq %0, %%mm7 \n\t" | ||
580 | "movq %1, %%mm6 \n\t" | ||
581 | ::"m"(red_16mask),"m"(green_16mask)); | ||
582 | ✗ | mm_end = end - 15; | |
583 | ✗ | while (s < mm_end) { | |
584 | ✗ | __asm__ volatile( | |
585 | PREFETCH" 32(%1) \n\t" | ||
586 | "movd (%1), %%mm0 \n\t" | ||
587 | "movd 3(%1), %%mm3 \n\t" | ||
588 | "punpckldq 6(%1), %%mm0 \n\t" | ||
589 | "punpckldq 9(%1), %%mm3 \n\t" | ||
590 | "movq %%mm0, %%mm1 \n\t" | ||
591 | "movq %%mm0, %%mm2 \n\t" | ||
592 | "movq %%mm3, %%mm4 \n\t" | ||
593 | "movq %%mm3, %%mm5 \n\t" | ||
594 | "psllq $8, %%mm0 \n\t" | ||
595 | "psllq $8, %%mm3 \n\t" | ||
596 | "pand %%mm7, %%mm0 \n\t" | ||
597 | "pand %%mm7, %%mm3 \n\t" | ||
598 | "psrlq $5, %%mm1 \n\t" | ||
599 | "psrlq $5, %%mm4 \n\t" | ||
600 | "pand %%mm6, %%mm1 \n\t" | ||
601 | "pand %%mm6, %%mm4 \n\t" | ||
602 | "psrlq $19, %%mm2 \n\t" | ||
603 | "psrlq $19, %%mm5 \n\t" | ||
604 | "pand %2, %%mm2 \n\t" | ||
605 | "pand %2, %%mm5 \n\t" | ||
606 | "por %%mm1, %%mm0 \n\t" | ||
607 | "por %%mm4, %%mm3 \n\t" | ||
608 | "por %%mm2, %%mm0 \n\t" | ||
609 | "por %%mm5, %%mm3 \n\t" | ||
610 | "psllq $16, %%mm3 \n\t" | ||
611 | "por %%mm3, %%mm0 \n\t" | ||
612 | MOVNTQ" %%mm0, (%0) \n\t" | ||
613 | ::"r"(d),"r"(s),"m"(blue_16mask):"memory"); | ||
614 | ✗ | d += 4; | |
615 | ✗ | s += 12; | |
616 | } | ||
617 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
618 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
619 | ✗ | while (s < end) { | |
620 | ✗ | const int r = *s++; | |
621 | ✗ | const int g = *s++; | |
622 | ✗ | const int b = *s++; | |
623 | ✗ | *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); | |
624 | } | ||
625 | ✗ | } | |
626 | |||
627 | ✗ | static inline void rgb24tobgr15_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
628 | { | ||
629 | ✗ | const uint8_t *s = src; | |
630 | const uint8_t *end; | ||
631 | const uint8_t *mm_end; | ||
632 | ✗ | uint16_t *d = (uint16_t *)dst; | |
633 | ✗ | end = s + src_size; | |
634 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); | |
635 | ✗ | __asm__ volatile( | |
636 | "movq %0, %%mm7 \n\t" | ||
637 | "movq %1, %%mm6 \n\t" | ||
638 | ::"m"(red_15mask),"m"(green_15mask)); | ||
639 | ✗ | mm_end = end - 11; | |
640 | ✗ | while (s < mm_end) { | |
641 | ✗ | __asm__ volatile( | |
642 | PREFETCH" 32(%1) \n\t" | ||
643 | "movd (%1), %%mm0 \n\t" | ||
644 | "movd 3(%1), %%mm3 \n\t" | ||
645 | "punpckldq 6(%1), %%mm0 \n\t" | ||
646 | "punpckldq 9(%1), %%mm3 \n\t" | ||
647 | "movq %%mm0, %%mm1 \n\t" | ||
648 | "movq %%mm0, %%mm2 \n\t" | ||
649 | "movq %%mm3, %%mm4 \n\t" | ||
650 | "movq %%mm3, %%mm5 \n\t" | ||
651 | "psrlq $3, %%mm0 \n\t" | ||
652 | "psrlq $3, %%mm3 \n\t" | ||
653 | "pand %2, %%mm0 \n\t" | ||
654 | "pand %2, %%mm3 \n\t" | ||
655 | "psrlq $6, %%mm1 \n\t" | ||
656 | "psrlq $6, %%mm4 \n\t" | ||
657 | "pand %%mm6, %%mm1 \n\t" | ||
658 | "pand %%mm6, %%mm4 \n\t" | ||
659 | "psrlq $9, %%mm2 \n\t" | ||
660 | "psrlq $9, %%mm5 \n\t" | ||
661 | "pand %%mm7, %%mm2 \n\t" | ||
662 | "pand %%mm7, %%mm5 \n\t" | ||
663 | "por %%mm1, %%mm0 \n\t" | ||
664 | "por %%mm4, %%mm3 \n\t" | ||
665 | "por %%mm2, %%mm0 \n\t" | ||
666 | "por %%mm5, %%mm3 \n\t" | ||
667 | "psllq $16, %%mm3 \n\t" | ||
668 | "por %%mm3, %%mm0 \n\t" | ||
669 | MOVNTQ" %%mm0, (%0) \n\t" | ||
670 | ::"r"(d),"r"(s),"m"(blue_15mask):"memory"); | ||
671 | ✗ | d += 4; | |
672 | ✗ | s += 12; | |
673 | } | ||
674 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
675 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
676 | ✗ | while (s < end) { | |
677 | ✗ | const int b = *s++; | |
678 | ✗ | const int g = *s++; | |
679 | ✗ | const int r = *s++; | |
680 | ✗ | *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); | |
681 | } | ||
682 | ✗ | } | |
683 | |||
684 | ✗ | static inline void rgb24to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
685 | { | ||
686 | ✗ | const uint8_t *s = src; | |
687 | const uint8_t *end; | ||
688 | const uint8_t *mm_end; | ||
689 | ✗ | uint16_t *d = (uint16_t *)dst; | |
690 | ✗ | end = s + src_size; | |
691 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); | |
692 | ✗ | __asm__ volatile( | |
693 | "movq %0, %%mm7 \n\t" | ||
694 | "movq %1, %%mm6 \n\t" | ||
695 | ::"m"(red_15mask),"m"(green_15mask)); | ||
696 | ✗ | mm_end = end - 15; | |
697 | ✗ | while (s < mm_end) { | |
698 | ✗ | __asm__ volatile( | |
699 | PREFETCH" 32(%1) \n\t" | ||
700 | "movd (%1), %%mm0 \n\t" | ||
701 | "movd 3(%1), %%mm3 \n\t" | ||
702 | "punpckldq 6(%1), %%mm0 \n\t" | ||
703 | "punpckldq 9(%1), %%mm3 \n\t" | ||
704 | "movq %%mm0, %%mm1 \n\t" | ||
705 | "movq %%mm0, %%mm2 \n\t" | ||
706 | "movq %%mm3, %%mm4 \n\t" | ||
707 | "movq %%mm3, %%mm5 \n\t" | ||
708 | "psllq $7, %%mm0 \n\t" | ||
709 | "psllq $7, %%mm3 \n\t" | ||
710 | "pand %%mm7, %%mm0 \n\t" | ||
711 | "pand %%mm7, %%mm3 \n\t" | ||
712 | "psrlq $6, %%mm1 \n\t" | ||
713 | "psrlq $6, %%mm4 \n\t" | ||
714 | "pand %%mm6, %%mm1 \n\t" | ||
715 | "pand %%mm6, %%mm4 \n\t" | ||
716 | "psrlq $19, %%mm2 \n\t" | ||
717 | "psrlq $19, %%mm5 \n\t" | ||
718 | "pand %2, %%mm2 \n\t" | ||
719 | "pand %2, %%mm5 \n\t" | ||
720 | "por %%mm1, %%mm0 \n\t" | ||
721 | "por %%mm4, %%mm3 \n\t" | ||
722 | "por %%mm2, %%mm0 \n\t" | ||
723 | "por %%mm5, %%mm3 \n\t" | ||
724 | "psllq $16, %%mm3 \n\t" | ||
725 | "por %%mm3, %%mm0 \n\t" | ||
726 | MOVNTQ" %%mm0, (%0) \n\t" | ||
727 | ::"r"(d),"r"(s),"m"(blue_15mask):"memory"); | ||
728 | ✗ | d += 4; | |
729 | ✗ | s += 12; | |
730 | } | ||
731 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
732 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
733 | ✗ | while (s < end) { | |
734 | ✗ | const int r = *s++; | |
735 | ✗ | const int g = *s++; | |
736 | ✗ | const int b = *s++; | |
737 | ✗ | *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); | |
738 | } | ||
739 | ✗ | } | |
740 | |||
741 | ✗ | static inline void rgb15tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
742 | { | ||
743 | const uint16_t *end; | ||
744 | const uint16_t *mm_end; | ||
745 | ✗ | uint8_t *d = dst; | |
746 | ✗ | const uint16_t *s = (const uint16_t*)src; | |
747 | ✗ | end = s + src_size/2; | |
748 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); | |
749 | ✗ | mm_end = end - 7; | |
750 | ✗ | while (s < mm_end) { | |
751 | ✗ | __asm__ volatile( | |
752 | PREFETCH" 32(%1) \n\t" | ||
753 | "movq (%1), %%mm0 \n\t" | ||
754 | "movq (%1), %%mm1 \n\t" | ||
755 | "movq (%1), %%mm2 \n\t" | ||
756 | "pand %2, %%mm0 \n\t" | ||
757 | "pand %3, %%mm1 \n\t" | ||
758 | "pand %4, %%mm2 \n\t" | ||
759 | "psllq $5, %%mm0 \n\t" | ||
760 | "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" | ||
761 | "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t" | ||
762 | "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" | ||
763 | "movq %%mm0, %%mm3 \n\t" | ||
764 | "movq %%mm1, %%mm4 \n\t" | ||
765 | "movq %%mm2, %%mm5 \n\t" | ||
766 | "punpcklwd %5, %%mm0 \n\t" | ||
767 | "punpcklwd %5, %%mm1 \n\t" | ||
768 | "punpcklwd %5, %%mm2 \n\t" | ||
769 | "punpckhwd %5, %%mm3 \n\t" | ||
770 | "punpckhwd %5, %%mm4 \n\t" | ||
771 | "punpckhwd %5, %%mm5 \n\t" | ||
772 | "psllq $8, %%mm1 \n\t" | ||
773 | "psllq $16, %%mm2 \n\t" | ||
774 | "por %%mm1, %%mm0 \n\t" | ||
775 | "por %%mm2, %%mm0 \n\t" | ||
776 | "psllq $8, %%mm4 \n\t" | ||
777 | "psllq $16, %%mm5 \n\t" | ||
778 | "por %%mm4, %%mm3 \n\t" | ||
779 | "por %%mm5, %%mm3 \n\t" | ||
780 | |||
781 | "movq %%mm0, %%mm6 \n\t" | ||
782 | "movq %%mm3, %%mm7 \n\t" | ||
783 | |||
784 | "movq 8(%1), %%mm0 \n\t" | ||
785 | "movq 8(%1), %%mm1 \n\t" | ||
786 | "movq 8(%1), %%mm2 \n\t" | ||
787 | "pand %2, %%mm0 \n\t" | ||
788 | "pand %3, %%mm1 \n\t" | ||
789 | "pand %4, %%mm2 \n\t" | ||
790 | "psllq $5, %%mm0 \n\t" | ||
791 | "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" | ||
792 | "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t" | ||
793 | "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" | ||
794 | "movq %%mm0, %%mm3 \n\t" | ||
795 | "movq %%mm1, %%mm4 \n\t" | ||
796 | "movq %%mm2, %%mm5 \n\t" | ||
797 | "punpcklwd %5, %%mm0 \n\t" | ||
798 | "punpcklwd %5, %%mm1 \n\t" | ||
799 | "punpcklwd %5, %%mm2 \n\t" | ||
800 | "punpckhwd %5, %%mm3 \n\t" | ||
801 | "punpckhwd %5, %%mm4 \n\t" | ||
802 | "punpckhwd %5, %%mm5 \n\t" | ||
803 | "psllq $8, %%mm1 \n\t" | ||
804 | "psllq $16, %%mm2 \n\t" | ||
805 | "por %%mm1, %%mm0 \n\t" | ||
806 | "por %%mm2, %%mm0 \n\t" | ||
807 | "psllq $8, %%mm4 \n\t" | ||
808 | "psllq $16, %%mm5 \n\t" | ||
809 | "por %%mm4, %%mm3 \n\t" | ||
810 | "por %%mm5, %%mm3 \n\t" | ||
811 | |||
812 | :"=m"(*d) | ||
813 | :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null) | ||
814 | NAMED_CONSTRAINTS_ADD(mul15_mid,mul15_hi) | ||
815 | :"memory"); | ||
816 | /* reuse the 32-to-24 store (STORE_BGR24_MMX) */ | ||
817 | ✗ | __asm__ volatile( | |
818 | "movq %%mm0, %%mm4 \n\t" | ||
819 | "movq %%mm3, %%mm5 \n\t" | ||
820 | "movq %%mm6, %%mm0 \n\t" | ||
821 | "movq %%mm7, %%mm1 \n\t" | ||
822 | |||
823 | "movq %%mm4, %%mm6 \n\t" | ||
824 | "movq %%mm5, %%mm7 \n\t" | ||
825 | "movq %%mm0, %%mm2 \n\t" | ||
826 | "movq %%mm1, %%mm3 \n\t" | ||
827 | |||
828 | STORE_BGR24_MMX | ||
829 | |||
830 | :: "r"(d), "m"(*s) | ||
831 | NAMED_CONSTRAINTS_ADD(mask24l,mask24h) | ||
832 | :"memory"); | ||
833 | ✗ | d += 24; | |
834 | ✗ | s += 8; | |
835 | } | ||
836 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
837 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
838 | ✗ | while (s < end) { | |
839 | register uint16_t bgr; | ||
840 | ✗ | bgr = *s++; | |
841 | ✗ | *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); | |
842 | ✗ | *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7); | |
843 | ✗ | *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12); | |
844 | } | ||
845 | ✗ | } | |
846 | |||
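The scalar tails of the 15/16-bit to 24-bit routines widen each 5- or 6-bit field by bit replication: shift the value up and refill the freed low bits from its own top bits, so 0 maps to 0 and the field maximum maps to 255 exactly. The 5-bit case, written as a plain helper:

```c
/* 5-to-8-bit expansion by bit replication: 0 -> 0, 31 -> 255. */
static uint8_t expand5to8(uint8_t v)
{
    return (uint8_t)((v << 3) | (v >> 2));
}
```
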
847 | ✗ | static inline void rgb16tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
848 | { | ||
849 | const uint16_t *end; | ||
850 | const uint16_t *mm_end; | ||
851 | ✗ | uint8_t *d = (uint8_t *)dst; | |
852 | ✗ | const uint16_t *s = (const uint16_t *)src; | |
853 | ✗ | end = s + src_size/2; | |
854 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); | |
855 | ✗ | mm_end = end - 7; | |
856 | ✗ | while (s < mm_end) { | |
857 | ✗ | __asm__ volatile( | |
858 | PREFETCH" 32(%1) \n\t" | ||
859 | "movq (%1), %%mm0 \n\t" | ||
860 | "movq (%1), %%mm1 \n\t" | ||
861 | "movq (%1), %%mm2 \n\t" | ||
862 | "pand %2, %%mm0 \n\t" | ||
863 | "pand %3, %%mm1 \n\t" | ||
864 | "pand %4, %%mm2 \n\t" | ||
865 | "psllq $5, %%mm0 \n\t" | ||
866 | "psrlq $1, %%mm2 \n\t" | ||
867 | "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" | ||
868 | "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t" | ||
869 | "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" | ||
870 | "movq %%mm0, %%mm3 \n\t" | ||
871 | "movq %%mm1, %%mm4 \n\t" | ||
872 | "movq %%mm2, %%mm5 \n\t" | ||
873 | "punpcklwd %5, %%mm0 \n\t" | ||
874 | "punpcklwd %5, %%mm1 \n\t" | ||
875 | "punpcklwd %5, %%mm2 \n\t" | ||
876 | "punpckhwd %5, %%mm3 \n\t" | ||
877 | "punpckhwd %5, %%mm4 \n\t" | ||
878 | "punpckhwd %5, %%mm5 \n\t" | ||
879 | "psllq $8, %%mm1 \n\t" | ||
880 | "psllq $16, %%mm2 \n\t" | ||
881 | "por %%mm1, %%mm0 \n\t" | ||
882 | "por %%mm2, %%mm0 \n\t" | ||
883 | "psllq $8, %%mm4 \n\t" | ||
884 | "psllq $16, %%mm5 \n\t" | ||
885 | "por %%mm4, %%mm3 \n\t" | ||
886 | "por %%mm5, %%mm3 \n\t" | ||
887 | |||
888 | "movq %%mm0, %%mm6 \n\t" | ||
889 | "movq %%mm3, %%mm7 \n\t" | ||
890 | |||
891 | "movq 8(%1), %%mm0 \n\t" | ||
892 | "movq 8(%1), %%mm1 \n\t" | ||
893 | "movq 8(%1), %%mm2 \n\t" | ||
894 | "pand %2, %%mm0 \n\t" | ||
895 | "pand %3, %%mm1 \n\t" | ||
896 | "pand %4, %%mm2 \n\t" | ||
897 | "psllq $5, %%mm0 \n\t" | ||
898 | "psrlq $1, %%mm2 \n\t" | ||
899 | "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" | ||
900 | "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t" | ||
901 | "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" | ||
902 | "movq %%mm0, %%mm3 \n\t" | ||
903 | "movq %%mm1, %%mm4 \n\t" | ||
904 | "movq %%mm2, %%mm5 \n\t" | ||
905 | "punpcklwd %5, %%mm0 \n\t" | ||
906 | "punpcklwd %5, %%mm1 \n\t" | ||
907 | "punpcklwd %5, %%mm2 \n\t" | ||
908 | "punpckhwd %5, %%mm3 \n\t" | ||
909 | "punpckhwd %5, %%mm4 \n\t" | ||
910 | "punpckhwd %5, %%mm5 \n\t" | ||
911 | "psllq $8, %%mm1 \n\t" | ||
912 | "psllq $16, %%mm2 \n\t" | ||
913 | "por %%mm1, %%mm0 \n\t" | ||
914 | "por %%mm2, %%mm0 \n\t" | ||
915 | "psllq $8, %%mm4 \n\t" | ||
916 | "psllq $16, %%mm5 \n\t" | ||
917 | "por %%mm4, %%mm3 \n\t" | ||
918 | "por %%mm5, %%mm3 \n\t" | ||
919 | :"=m"(*d) | ||
920 | :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null) | ||
921 | NAMED_CONSTRAINTS_ADD(mul15_mid,mul16_mid,mul15_hi) | ||
922 | :"memory"); | ||
923 | /* reuse the 32-to-24 store (STORE_BGR24_MMX) */ | ||
924 | ✗ | __asm__ volatile( | |
925 | "movq %%mm0, %%mm4 \n\t" | ||
926 | "movq %%mm3, %%mm5 \n\t" | ||
927 | "movq %%mm6, %%mm0 \n\t" | ||
928 | "movq %%mm7, %%mm1 \n\t" | ||
929 | |||
930 | "movq %%mm4, %%mm6 \n\t" | ||
931 | "movq %%mm5, %%mm7 \n\t" | ||
932 | "movq %%mm0, %%mm2 \n\t" | ||
933 | "movq %%mm1, %%mm3 \n\t" | ||
934 | |||
935 | STORE_BGR24_MMX | ||
936 | |||
937 | :: "r"(d), "m"(*s) | ||
938 | NAMED_CONSTRAINTS_ADD(mask24l,mask24h) | ||
939 | :"memory"); | ||
940 | ✗ | d += 24; | |
941 | ✗ | s += 8; | |
942 | } | ||
943 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
944 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
945 | ✗ | while (s < end) { | |
946 | register uint16_t bgr; | ||
947 | ✗ | bgr = *s++; | |
948 | ✗ | *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); | |
949 | ✗ | *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9); | |
950 | ✗ | *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13); | |
951 | } | ||
952 | ✗ | } | |
953 | |||
954 | /* | ||
955 | * mm0 = 00 B3 00 B2 00 B1 00 B0 | ||
956 | * mm1 = 00 G3 00 G2 00 G1 00 G0 | ||
957 | * mm2 = 00 R3 00 R2 00 R1 00 R0 | ||
958 | * mm6 = FF FF FF FF FF FF FF FF | ||
959 | * mm7 = 00 00 00 00 00 00 00 00 | ||
960 | */ | ||
961 | #define PACK_RGB32 \ | ||
962 | "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \ | ||
963 | "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \ | ||
964 | "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \ | ||
965 | "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \ | ||
966 | "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \ | ||
967 | "movq %%mm0, %%mm3 \n\t" \ | ||
968 | "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \ | ||
969 | "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \ | ||
970 | MOVNTQ" %%mm0, (%0) \n\t" \ | ||
971 | MOVNTQ" %%mm3, 8(%0) \n\t" \ | ||
972 | |||
973 | ✗ | static inline void rgb15to32_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
974 | { | ||
975 | const uint16_t *end; | ||
976 | const uint16_t *mm_end; | ||
977 | ✗ | uint8_t *d = dst; | |
978 | ✗ | const uint16_t *s = (const uint16_t *)src; | |
979 | ✗ | end = s + src_size/2; | |
980 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); | |
981 | ✗ | __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); | |
982 | ✗ | __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); | |
983 | ✗ | mm_end = end - 3; | |
984 | ✗ | while (s < mm_end) { | |
985 | ✗ | __asm__ volatile( | |
986 | PREFETCH" 32(%1) \n\t" | ||
987 | "movq (%1), %%mm0 \n\t" | ||
988 | "movq (%1), %%mm1 \n\t" | ||
989 | "movq (%1), %%mm2 \n\t" | ||
990 | "pand %2, %%mm0 \n\t" | ||
991 | "pand %3, %%mm1 \n\t" | ||
992 | "pand %4, %%mm2 \n\t" | ||
993 | "psllq $5, %%mm0 \n\t" | ||
994 | "pmulhw %5, %%mm0 \n\t" | ||
995 | "pmulhw %5, %%mm1 \n\t" | ||
996 | "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" | ||
997 | PACK_RGB32 | ||
998 | ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid) | ||
999 | NAMED_CONSTRAINTS_ADD(mul15_hi) | ||
1000 | :"memory"); | ||
1001 | ✗ | d += 16; | |
1002 | ✗ | s += 4; | |
1003 | } | ||
1004 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
1005 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
1006 | ✗ | while (s < end) { | |
1007 | register uint16_t bgr; | ||
1008 | ✗ | bgr = *s++; | |
1009 | ✗ | *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); | |
1010 | ✗ | *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7); | |
1011 | ✗ | *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12); | |
1012 | ✗ | *d++ = 255; | |
1013 | } | ||
1014 | ✗ | } | |
1015 | |||
1016 | ✗ | static inline void rgb16to32_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
1017 | { | ||
1018 | const uint16_t *end; | ||
1019 | const uint16_t *mm_end; | ||
1020 | ✗ | uint8_t *d = dst; | |
1021 | ✗ | const uint16_t *s = (const uint16_t*)src; | |
1022 | ✗ | end = s + src_size/2; | |
1023 | ✗ | __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); | |
1024 | ✗ | __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); | |
1025 | ✗ | __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); | |
1026 | ✗ | mm_end = end - 3; | |
1027 | ✗ | while (s < mm_end) { | |
1028 | ✗ | __asm__ volatile( | |
1029 | PREFETCH" 32(%1) \n\t" | ||
1030 | "movq (%1), %%mm0 \n\t" | ||
1031 | "movq (%1), %%mm1 \n\t" | ||
1032 | "movq (%1), %%mm2 \n\t" | ||
1033 | "pand %2, %%mm0 \n\t" | ||
1034 | "pand %3, %%mm1 \n\t" | ||
1035 | "pand %4, %%mm2 \n\t" | ||
1036 | "psllq $5, %%mm0 \n\t" | ||
1037 | "psrlq $1, %%mm2 \n\t" | ||
1038 | "pmulhw %5, %%mm0 \n\t" | ||
1039 | "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t" | ||
1040 | "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" | ||
1041 | PACK_RGB32 | ||
1042 | ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid) | ||
1043 | NAMED_CONSTRAINTS_ADD(mul16_mid,mul15_hi) | ||
1044 | :"memory"); | ||
1045 | ✗ | d += 16; | |
1046 | ✗ | s += 4; | |
1047 | } | ||
1048 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
1049 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
1050 | ✗ | while (s < end) { | |
1051 | register uint16_t bgr; | ||
1052 | ✗ | bgr = *s++; | |
1053 | ✗ | *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); | |
1054 | ✗ | *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9); | |
1055 | ✗ | *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13); | |
1056 | ✗ | *d++ = 255; | |
1057 | } | ||
1058 | ✗ | } | |
1059 | |||
1060 | ✗ | static inline void rgb24tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size) | |
1061 | { | ||
1062 | ✗ | x86_reg mmx_size= 23 - src_size; | |
1063 | ✗ | __asm__ volatile ( | |
1064 | "test %%"FF_REG_a", %%"FF_REG_a" \n\t" | ||
1065 | "jns 2f \n\t" | ||
1066 | "movq "MANGLE(mask24r)", %%mm5 \n\t" | ||
1067 | "movq "MANGLE(mask24g)", %%mm6 \n\t" | ||
1068 | "movq "MANGLE(mask24b)", %%mm7 \n\t" | ||
1069 | ".p2align 4 \n\t" | ||
1070 | "1: \n\t" | ||
1071 | PREFETCH" 32(%1, %%"FF_REG_a") \n\t" | ||
1072 | "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" // BGR BGR BG | ||
1073 | "movq (%1, %%"FF_REG_a"), %%mm1 \n\t" // BGR BGR BG | ||
1074 | "movq 2(%1, %%"FF_REG_a"), %%mm2 \n\t" // R BGR BGR B | ||
1075 | "psllq $16, %%mm0 \n\t" // 00 BGR BGR | ||
1076 | "pand %%mm5, %%mm0 \n\t" | ||
1077 | "pand %%mm6, %%mm1 \n\t" | ||
1078 | "pand %%mm7, %%mm2 \n\t" | ||
1079 | "por %%mm0, %%mm1 \n\t" | ||
1080 | "por %%mm2, %%mm1 \n\t" | ||
1081 | "movq 6(%1, %%"FF_REG_a"), %%mm0 \n\t" // BGR BGR BG | ||
1082 | MOVNTQ" %%mm1,(%2, %%"FF_REG_a") \n\t" // RGB RGB RG | ||
1083 | "movq 8(%1, %%"FF_REG_a"), %%mm1 \n\t" // R BGR BGR B | ||
1084 | "movq 10(%1, %%"FF_REG_a"), %%mm2 \n\t" // GR BGR BGR | ||
1085 | "pand %%mm7, %%mm0 \n\t" | ||
1086 | "pand %%mm5, %%mm1 \n\t" | ||
1087 | "pand %%mm6, %%mm2 \n\t" | ||
1088 | "por %%mm0, %%mm1 \n\t" | ||
1089 | "por %%mm2, %%mm1 \n\t" | ||
1090 | "movq 14(%1, %%"FF_REG_a"), %%mm0 \n\t" // R BGR BGR B | ||
1091 | MOVNTQ" %%mm1, 8(%2, %%"FF_REG_a")\n\t" // B RGB RGB R | ||
1092 | "movq 16(%1, %%"FF_REG_a"), %%mm1 \n\t" // GR BGR BGR | ||
1093 | "movq 18(%1, %%"FF_REG_a"), %%mm2 \n\t" // BGR BGR BG | ||
1094 | "pand %%mm6, %%mm0 \n\t" | ||
1095 | "pand %%mm7, %%mm1 \n\t" | ||
1096 | "pand %%mm5, %%mm2 \n\t" | ||
1097 | "por %%mm0, %%mm1 \n\t" | ||
1098 | "por %%mm2, %%mm1 \n\t" | ||
1099 | MOVNTQ" %%mm1, 16(%2, %%"FF_REG_a") \n\t" | ||
1100 | "add $24, %%"FF_REG_a" \n\t" | ||
1101 | " js 1b \n\t" | ||
1102 | "2: \n\t" | ||
1103 | : "+a" (mmx_size) | ||
1104 | ✗ | : "r" (src-mmx_size), "r"(dst-mmx_size) | |
1105 | NAMED_CONSTRAINTS_ADD(mask24r,mask24g,mask24b) | ||
1106 | ); | ||
1107 | |||
1108 | ✗ | __asm__ volatile(SFENCE:::"memory"); | |
1109 | ✗ | __asm__ volatile(EMMS:::"memory"); | |
1110 | |||
1111 | ✗ | if (mmx_size==23) return; //finished, was multiple of 8 | |
1112 | |||
1113 | ✗ | src+= src_size; | |
1114 | ✗ | dst+= src_size; | |
1115 | ✗ | src_size= 23-mmx_size; | |
1116 | ✗ | src-= src_size; | |
1117 | ✗ | dst-= src_size; | |
1118 | ✗ | for (unsigned i = 0; i < src_size; i +=3) { | |
1119 | register uint8_t x; | ||
1120 | ✗ | x = src[i + 2]; | |
1121 | ✗ | dst[i + 1] = src[i + 1]; | |
1122 | ✗ | dst[i + 2] = src[i + 0]; | |
1123 | ✗ | dst[i + 0] = x; | |
1124 | } | ||
1125 | } | ||
1126 | |||
1127 | ✗ | static inline void yuvPlanartoyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
1128 | int width, int height, | ||
1129 | int lumStride, int chromStride, int dstStride, int vertLumPerChroma) | ||
1130 | { | ||
1131 | ✗ | const x86_reg chromWidth= width>>1; | |
1132 | ✗ | for (int y = 0; y < height; y++) { | |
1133 | //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway) | ||
1134 | ✗ | __asm__ volatile( | |
1135 | "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" | ||
1136 | ".p2align 4 \n\t" | ||
1137 | "1: \n\t" | ||
1138 | PREFETCH" 32(%1, %%"FF_REG_a", 2) \n\t" | ||
1139 | PREFETCH" 32(%2, %%"FF_REG_a") \n\t" | ||
1140 | PREFETCH" 32(%3, %%"FF_REG_a") \n\t" | ||
1141 | "movq (%2, %%"FF_REG_a"), %%mm0 \n\t" // U(0) | ||
1142 | "movq %%mm0, %%mm2 \n\t" // U(0) | ||
1143 | "movq (%3, %%"FF_REG_a"), %%mm1 \n\t" // V(0) | ||
1144 | "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | ||
1145 | "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) | ||
1146 | |||
1147 | "movq (%1, %%"FF_REG_a",2), %%mm3 \n\t" // Y(0) | ||
1148 | "movq 8(%1, %%"FF_REG_a",2), %%mm5 \n\t" // Y(8) | ||
1149 | "movq %%mm3, %%mm4 \n\t" // Y(0) | ||
1150 | "movq %%mm5, %%mm6 \n\t" // Y(8) | ||
1151 | "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0) | ||
1152 | "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4) | ||
1153 | "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8) | ||
1154 | "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12) | ||
1155 | |||
1156 | MOVNTQ" %%mm3, (%0, %%"FF_REG_a", 4) \n\t" | ||
1157 | MOVNTQ" %%mm4, 8(%0, %%"FF_REG_a", 4) \n\t" | ||
1158 | MOVNTQ" %%mm5, 16(%0, %%"FF_REG_a", 4) \n\t" | ||
1159 | MOVNTQ" %%mm6, 24(%0, %%"FF_REG_a", 4) \n\t" | ||
1160 | |||
1161 | "add $8, %%"FF_REG_a" \n\t" | ||
1162 | "cmp %4, %%"FF_REG_a" \n\t" | ||
1163 | " jb 1b \n\t" | ||
1164 | ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) | ||
1165 | : "%"FF_REG_a | ||
1166 | ); | ||
1167 | ✗ | if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) { | |
1168 | ✗ | usrc += chromStride; | |
1169 | ✗ | vsrc += chromStride; | |
1170 | } | ||
1171 | ✗ | ysrc += lumStride; | |
1172 | ✗ | dst += dstStride; | |
1173 | } | ||
1174 | ✗ | __asm__(EMMS" \n\t" | |
1175 | SFENCE" \n\t" | ||
1176 | :::"memory"); | ||
1177 | ✗ | } | |
1178 | |||
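yuvPlanartoyuy2_mmxext interleaves planar luma and chroma into packed YUY2, where every pair of luma samples shares one U and one V byte. A scalar model of the byte order the stores above produce (the helper name is illustrative):

```c
/* YUY2 ordering for one pixel pair: Y0 U Y1 V. */
static void pack_yuy2_pair(uint8_t *dst, uint8_t y0, uint8_t y1,
                           uint8_t u, uint8_t v)
{
    dst[0] = y0;
    dst[1] = u;
    dst[2] = y1;
    dst[3] = v;
}
```
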
1179 | /** | ||
1180 | * Height should be a multiple of 2 and width should be a multiple of 16. | ||
1181 | * (If this is a problem for anyone then tell me, and I will fix it.) | ||
1182 | */ | ||
1183 | ✗ | static inline void yv12toyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
1184 | int width, int height, | ||
1185 | int lumStride, int chromStride, int dstStride) | ||
1186 | { | ||
1187 | //FIXME interpolate chroma | ||
1188 | ✗ | yuvPlanartoyuy2_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); | |
1189 | ✗ | } | |
1190 | |||
1191 | ✗ | static inline void yuvPlanartouyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
1192 | int width, int height, | ||
1193 | int lumStride, int chromStride, int dstStride, int vertLumPerChroma) | ||
1194 | { | ||
1195 | ✗ | const x86_reg chromWidth= width>>1; | |
1196 | ✗ | for (int y = 0; y < height; y++) { | |
1197 | //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway) | ||
1198 | ✗ | __asm__ volatile( | |
1199 | "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" | ||
1200 | ".p2align 4 \n\t" | ||
1201 | "1: \n\t" | ||
1202 | PREFETCH" 32(%1, %%"FF_REG_a", 2) \n\t" | ||
1203 | PREFETCH" 32(%2, %%"FF_REG_a") \n\t" | ||
1204 | PREFETCH" 32(%3, %%"FF_REG_a") \n\t" | ||
1205 | "movq (%2, %%"FF_REG_a"), %%mm0 \n\t" // U(0) | ||
1206 | "movq %%mm0, %%mm2 \n\t" // U(0) | ||
1207 | "movq (%3, %%"FF_REG_a"), %%mm1 \n\t" // V(0) | ||
1208 | "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | ||
1209 | "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) | ||
1210 | |||
1211 | "movq (%1, %%"FF_REG_a",2), %%mm3 \n\t" // Y(0) | ||
1212 | "movq 8(%1, %%"FF_REG_a",2), %%mm5 \n\t" // Y(8) | ||
1213 | "movq %%mm0, %%mm4 \n\t" // Y(0) | ||
1214 | "movq %%mm2, %%mm6 \n\t" // Y(8) | ||
1215 | "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0) | ||
1216 | "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4) | ||
1217 | "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8) | ||
1218 | "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12) | ||
1219 | |||
1220 | MOVNTQ" %%mm0, (%0, %%"FF_REG_a", 4) \n\t" | ||
1221 | MOVNTQ" %%mm4, 8(%0, %%"FF_REG_a", 4) \n\t" | ||
1222 | MOVNTQ" %%mm2, 16(%0, %%"FF_REG_a", 4) \n\t" | ||
1223 | MOVNTQ" %%mm6, 24(%0, %%"FF_REG_a", 4) \n\t" | ||
1224 | |||
1225 | "add $8, %%"FF_REG_a" \n\t" | ||
1226 | "cmp %4, %%"FF_REG_a" \n\t" | ||
1227 | " jb 1b \n\t" | ||
1228 | ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) | ||
1229 | : "%"FF_REG_a | ||
1230 | ); | ||
1231 | ✗ | if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) { | |
1232 | ✗ | usrc += chromStride; | |
1233 | ✗ | vsrc += chromStride; | |
1234 | } | ||
1235 | ✗ | ysrc += lumStride; | |
1236 | ✗ | dst += dstStride; | |
1237 | } | ||
1238 | ✗ | __asm__(EMMS" \n\t" | |
1239 | SFENCE" \n\t" | ||
1240 | :::"memory"); | ||
1241 | ✗ | } | |
1242 | |||
1243 | /** | ||
1244 | * Height should be a multiple of 2 and width should be a multiple of 16. | ||
1245 | * (If this is a problem for anyone then tell me, and I will fix it.) | ||
1246 | */ | ||
1247 | ✗ | static inline void yv12touyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
1248 | int width, int height, | ||
1249 | int lumStride, int chromStride, int dstStride) | ||
1250 | { | ||
1251 | //FIXME interpolate chroma | ||
1252 | ✗ | yuvPlanartouyvy_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); | |
1253 | ✗ | } | |
1254 | |||
1255 | /** | ||
1256 | * Width should be a multiple of 16. | ||
1257 | */ | ||
1258 | ✗ | static inline void yuv422ptouyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
1259 | int width, int height, | ||
1260 | int lumStride, int chromStride, int dstStride) | ||
1261 | { | ||
1262 | ✗ | yuvPlanartouyvy_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1); | |
1263 | ✗ | } | |
1264 | |||
1265 | /** | ||
1266 | * Width should be a multiple of 16. | ||
1267 | */ | ||
1268 | ✗ | static inline void yuv422ptoyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
1269 | int width, int height, | ||
1270 | int lumStride, int chromStride, int dstStride) | ||
1271 | { | ||
1272 | ✗ | yuvPlanartoyuy2_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1); | |
1273 | ✗ | } | |
1274 | |||
1275 | /** | ||
1276 | * Height should be a multiple of 2 and width should be a multiple of 16. | ||
1277 | * (If this is a problem for anyone then tell me, and I will fix it.) | ||
1278 | */ | ||
1279 | ✗ | static inline void yuy2toyv12_mmxext(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | |
1280 | int width, int height, | ||
1281 | int lumStride, int chromStride, int srcStride) | ||
1282 | { | ||
1283 | ✗ | const x86_reg chromWidth= width>>1; | |
1284 | ✗ | for (int y = 0; y < height; y += 2) { | |
1285 | ✗ | __asm__ volatile( | |
1286 | "xor %%"FF_REG_a", %%"FF_REG_a"\n\t" | ||
1287 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
1288 | "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... | ||
1289 | ".p2align 4 \n\t" | ||
1290 | "1: \n\t" | ||
1291 | PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t" | ||
1292 | "movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) | ||
1293 | "movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) | ||
1294 | "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0) | ||
1295 | "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4) | ||
1296 | "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0) | ||
1297 | "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4) | ||
1298 | "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0) | ||
1299 | "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4) | ||
1300 | "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | ||
1301 | "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) | ||
1302 | |||
1303 | MOVNTQ" %%mm2, (%1, %%"FF_REG_a", 2) \n\t" | ||
1304 | |||
1305 | "movq 16(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8) | ||
1306 | "movq 24(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12) | ||
1307 | "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8) | ||
1308 | "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12) | ||
1309 | "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8) | ||
1310 | "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12) | ||
1311 | "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8) | ||
1312 | "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12) | ||
1313 | "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) | ||
1314 | "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) | ||
1315 | |||
1316 | MOVNTQ" %%mm3, 8(%1, %%"FF_REG_a", 2) \n\t" | ||
1317 | |||
1318 | "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) | ||
1319 | "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) | ||
1320 | "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) | ||
1321 | "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) | ||
1322 | "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) | ||
1323 | "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) | ||
1324 | "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) | ||
1325 | "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) | ||
1326 | |||
1327 | MOVNTQ" %%mm0, (%3, %%"FF_REG_a") \n\t" | ||
1328 | MOVNTQ" %%mm2, (%2, %%"FF_REG_a") \n\t" | ||
1329 | |||
1330 | "add $8, %%"FF_REG_a" \n\t" | ||
1331 | "cmp %4, %%"FF_REG_a" \n\t" | ||
1332 | " jb 1b \n\t" | ||
1333 | ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | ||
1334 | : "memory", "%"FF_REG_a | ||
1335 | ); | ||
1336 | |||
1337 | ✗ | ydst += lumStride; | |
1338 | ✗ | src += srcStride; | |
1339 | |||
1340 | ✗ | __asm__ volatile( | |
1341 | "xor %%"FF_REG_a", %%"FF_REG_a"\n\t" | ||
1342 | ".p2align 4 \n\t" | ||
1343 | "1: \n\t" | ||
1344 | PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t" | ||
1345 | "movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) | ||
1346 | "movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) | ||
1347 | "movq 16(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8) | ||
1348 | "movq 24(%0, %%"FF_REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12) | ||
1349 | "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0) | ||
1350 | "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4) | ||
1351 | "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8) | ||
1352 | "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12) | ||
1353 | "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) | ||
1354 | "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) | ||
1355 | |||
1356 | MOVNTQ" %%mm0, (%1, %%"FF_REG_a", 2) \n\t" | ||
1357 | MOVNTQ" %%mm2, 8(%1, %%"FF_REG_a", 2) \n\t" | ||
1358 | |||
1359 | "add $8, %%"FF_REG_a"\n\t" | ||
1360 | "cmp %4, %%"FF_REG_a"\n\t" | ||
1361 | " jb 1b \n\t" | ||
1362 | |||
1363 | ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | ||
1364 | : "memory", "%"FF_REG_a | ||
1365 | ); | ||
1366 | ✗ | udst += chromStride; | |
1367 | ✗ | vdst += chromStride; | |
1368 | ✗ | ydst += lumStride; | |
1369 | ✗ | src += srcStride; | |
1370 | } | ||
1371 | ✗ | __asm__ volatile(EMMS" \n\t" | |
1372 | SFENCE" \n\t" | ||
1373 | :::"memory"); | ||
1374 | ✗ | } | |
1375 | |||
1376 | ✗ | static inline void planar2x_mmxext(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride) | |
1377 | { | ||
1378 | ✗ | dst[0]= src[0]; | |
1379 | |||
1380 | // first line | ||
1381 | ✗ | for (int x = 0; x < srcWidth - 1; x++) { | |
1382 | ✗ | dst[2*x+1]= (3*src[x] + src[x+1])>>2; | |
1383 | ✗ | dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; | |
1384 | } | ||
1385 | ✗ | dst[2*srcWidth-1]= src[srcWidth-1]; | |
1386 | |||
1387 | ✗ | dst+= dstStride; | |
1388 | |||
1389 | ✗ | for (int y = 1; y < srcHeight; y++) { | |
1390 | ✗ | x86_reg mmxSize= srcWidth&~15; | |
1391 | |||
1392 | ✗ | if (mmxSize) { | |
1393 | ✗ | __asm__ volatile( | |
1394 | "mov %4, %%"FF_REG_a" \n\t" | ||
1395 | "movq "MANGLE(mmx_ff)", %%mm0 \n\t" | ||
1396 | "movq (%0, %%"FF_REG_a"), %%mm4 \n\t" | ||
1397 | "movq %%mm4, %%mm2 \n\t" | ||
1398 | "psllq $8, %%mm4 \n\t" | ||
1399 | "pand %%mm0, %%mm2 \n\t" | ||
1400 | "por %%mm2, %%mm4 \n\t" | ||
1401 | "movq (%1, %%"FF_REG_a"), %%mm5 \n\t" | ||
1402 | "movq %%mm5, %%mm3 \n\t" | ||
1403 | "psllq $8, %%mm5 \n\t" | ||
1404 | "pand %%mm0, %%mm3 \n\t" | ||
1405 | "por %%mm3, %%mm5 \n\t" | ||
1406 | "1: \n\t" | ||
1407 | "movq (%0, %%"FF_REG_a"), %%mm0 \n\t" | ||
1408 | "movq (%1, %%"FF_REG_a"), %%mm1 \n\t" | ||
1409 | "movq 1(%0, %%"FF_REG_a"), %%mm2 \n\t" | ||
1410 | "movq 1(%1, %%"FF_REG_a"), %%mm3 \n\t" | ||
1411 | PAVGB" %%mm0, %%mm5 \n\t" | ||
1412 | PAVGB" %%mm0, %%mm3 \n\t" | ||
1413 | PAVGB" %%mm0, %%mm5 \n\t" | ||
1414 | PAVGB" %%mm0, %%mm3 \n\t" | ||
1415 | PAVGB" %%mm1, %%mm4 \n\t" | ||
1416 | PAVGB" %%mm1, %%mm2 \n\t" | ||
1417 | PAVGB" %%mm1, %%mm4 \n\t" | ||
1418 | PAVGB" %%mm1, %%mm2 \n\t" | ||
1419 | "movq %%mm5, %%mm7 \n\t" | ||
1420 | "movq %%mm4, %%mm6 \n\t" | ||
1421 | "punpcklbw %%mm3, %%mm5 \n\t" | ||
1422 | "punpckhbw %%mm3, %%mm7 \n\t" | ||
1423 | "punpcklbw %%mm2, %%mm4 \n\t" | ||
1424 | "punpckhbw %%mm2, %%mm6 \n\t" | ||
1425 | MOVNTQ" %%mm5, (%2, %%"FF_REG_a", 2) \n\t" | ||
1426 | MOVNTQ" %%mm7, 8(%2, %%"FF_REG_a", 2) \n\t" | ||
1427 | MOVNTQ" %%mm4, (%3, %%"FF_REG_a", 2) \n\t" | ||
1428 | MOVNTQ" %%mm6, 8(%3, %%"FF_REG_a", 2) \n\t" | ||
1429 | "add $8, %%"FF_REG_a" \n\t" | ||
1430 | "movq -1(%0, %%"FF_REG_a"), %%mm4 \n\t" | ||
1431 | "movq -1(%1, %%"FF_REG_a"), %%mm5 \n\t" | ||
1432 | " js 1b \n\t" | ||
1433 | ✗ | :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ), | |
1434 | ✗ | "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2), | |
1435 | ✗ | "g" (-mmxSize) | |
1436 | NAMED_CONSTRAINTS_ADD(mmx_ff) | ||
1437 | : "%"FF_REG_a | ||
1438 | ); | ||
1439 | } else { | ||
1440 | ✗ | mmxSize = 1; | |
1441 | ✗ | dst[0] = (src[0] * 3 + src[srcStride]) >> 2; | |
1442 | ✗ | dst[dstStride] = (src[0] + 3 * src[srcStride]) >> 2; | |
1443 | } | ||
1444 | |||
1445 | ✗ | for (int x = mmxSize - 1; x < srcWidth - 1; x++) { | |
1446 | ✗ | dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2; | |
1447 | ✗ | dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2; | |
1448 | ✗ | dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2; | |
1449 | ✗ | dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2; | |
1450 | } | ||
1451 | ✗ | dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2; | |
1452 | ✗ | dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2; | |
1453 | |||
1454 | ✗ | dst+=dstStride*2; | |
1455 | ✗ | src+=srcStride; | |
1456 | } | ||
1457 | |||
1458 | // last line | ||
1459 | ✗ | dst[0]= src[0]; | |
1460 | |||
1461 | ✗ | for (int x = 0; x < srcWidth - 1; x++) { | |
1462 | ✗ | dst[2*x+1]= (3*src[x] + src[x+1])>>2; | |
1463 | ✗ | dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; | |
1464 | } | ||
1465 | ✗ | dst[2*srcWidth-1]= src[srcWidth-1]; | |
1466 | |||
1467 | ✗ | __asm__ volatile(EMMS" \n\t" | |
1468 | SFENCE" \n\t" | ||
1469 | :::"memory"); | ||
1470 | ✗ | } | |
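|||
| /* Editor's sketch (hypothetical helper, not part of this file): the two-tap | ||
| * kernel planar2x applies throughout: each source pair (a, b) yields two | ||
| * output samples weighted 3:1 and 1:3, which the asm realizes as two chained | ||
| * PAVGBs (the rounding differs slightly). */ | ||
| static inline void upsample2x_pair_sketch(uint8_t a, uint8_t b, uint8_t out[2]) | ||
| { | ||
|     out[0] = (3 * a + b) >> 2; /* closer to a */ | ||
|     out[1] = (a + 3 * b) >> 2; /* closer to b */ | ||
| } | ||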
1471 | |||
1472 | /** | ||
1473 | * Height and width should each be a multiple of 2. | ||
1474 | * (If this is a problem for anyone then tell me, and I will fix it.) | ||
1475 | * Chrominance data is only taken from every second line; | ||
1476 | * the other lines are ignored in the C version. | ||
1477 | * FIXME: Write HQ version. | ||
1478 | */ | ||
1479 | #if ARCH_X86_32 && HAVE_7REGS | ||
1480 | DECLARE_ASM_CONST(8, uint64_t, bgr2YOffset) = 0x1010101010101010ULL; | ||
1481 | DECLARE_ASM_CONST(8, uint64_t, bgr2UVOffset) = 0x8080808080808080ULL; | ||
1482 | DECLARE_ASM_CONST(8, uint64_t, w1111) = 0x0001000100010001ULL; | ||
1483 | |||
1484 | static inline void rgb24toyv12_mmxext(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | ||
1485 | int width, int height, | ||
1486 | int lumStride, int chromStride, int srcStride, | ||
1487 | const int32_t *rgb2yuv) | ||
1488 | { | ||
1489 | #define BGR2Y_IDX "16*4+16*32" | ||
1490 | #define BGR2U_IDX "16*4+16*33" | ||
1491 | #define BGR2V_IDX "16*4+16*34" | ||
1492 | int y; | ||
1493 | const x86_reg chromWidth= width>>1; | ||
1494 | |||
1495 | if (height > 2) { | ||
1496 | ff_rgb24toyv12_c(src, ydst, udst, vdst, width, 2, lumStride, chromStride, srcStride, rgb2yuv); | ||
1497 | src += 2*srcStride; | ||
1498 | ydst += 2*lumStride; | ||
1499 | udst += chromStride; | ||
1500 | vdst += chromStride; | ||
1501 | height -= 2; | ||
1502 | } | ||
1503 | |||
1504 | for (y = 0; y < height - 2; y += 2) { | ||
1505 | for (int i = 0; i < 2; i++) { | ||
1506 | __asm__ volatile( | ||
1507 | "mov %2, %%"FF_REG_a"\n\t" | ||
1508 | "movq "BGR2Y_IDX"(%3), %%mm6 \n\t" | ||
1509 | "movq "MANGLE(w1111)", %%mm5 \n\t" | ||
1510 | "pxor %%mm7, %%mm7 \n\t" | ||
1511 | "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t" | ||
1512 | ".p2align 4 \n\t" | ||
1513 | "1: \n\t" | ||
1514 | PREFETCH" 64(%0, %%"FF_REG_d") \n\t" | ||
1515 | "movd (%0, %%"FF_REG_d"), %%mm0 \n\t" | ||
1516 | "movd 3(%0, %%"FF_REG_d"), %%mm1 \n\t" | ||
1517 | "punpcklbw %%mm7, %%mm0 \n\t" | ||
1518 | "punpcklbw %%mm7, %%mm1 \n\t" | ||
1519 | "movd 6(%0, %%"FF_REG_d"), %%mm2 \n\t" | ||
1520 | "movd 9(%0, %%"FF_REG_d"), %%mm3 \n\t" | ||
1521 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
1522 | "punpcklbw %%mm7, %%mm3 \n\t" | ||
1523 | "pmaddwd %%mm6, %%mm0 \n\t" | ||
1524 | "pmaddwd %%mm6, %%mm1 \n\t" | ||
1525 | "pmaddwd %%mm6, %%mm2 \n\t" | ||
1526 | "pmaddwd %%mm6, %%mm3 \n\t" | ||
1527 | "psrad $8, %%mm0 \n\t" | ||
1528 | "psrad $8, %%mm1 \n\t" | ||
1529 | "psrad $8, %%mm2 \n\t" | ||
1530 | "psrad $8, %%mm3 \n\t" | ||
1531 | "packssdw %%mm1, %%mm0 \n\t" | ||
1532 | "packssdw %%mm3, %%mm2 \n\t" | ||
1533 | "pmaddwd %%mm5, %%mm0 \n\t" | ||
1534 | "pmaddwd %%mm5, %%mm2 \n\t" | ||
1535 | "packssdw %%mm2, %%mm0 \n\t" | ||
1536 | "psraw $7, %%mm0 \n\t" | ||
1537 | |||
1538 | "movd 12(%0, %%"FF_REG_d"), %%mm4 \n\t" | ||
1539 | "movd 15(%0, %%"FF_REG_d"), %%mm1 \n\t" | ||
1540 | "punpcklbw %%mm7, %%mm4 \n\t" | ||
1541 | "punpcklbw %%mm7, %%mm1 \n\t" | ||
1542 | "movd 18(%0, %%"FF_REG_d"), %%mm2 \n\t" | ||
1543 | "movd 21(%0, %%"FF_REG_d"), %%mm3 \n\t" | ||
1544 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
1545 | "punpcklbw %%mm7, %%mm3 \n\t" | ||
1546 | "pmaddwd %%mm6, %%mm4 \n\t" | ||
1547 | "pmaddwd %%mm6, %%mm1 \n\t" | ||
1548 | "pmaddwd %%mm6, %%mm2 \n\t" | ||
1549 | "pmaddwd %%mm6, %%mm3 \n\t" | ||
1550 | "psrad $8, %%mm4 \n\t" | ||
1551 | "psrad $8, %%mm1 \n\t" | ||
1552 | "psrad $8, %%mm2 \n\t" | ||
1553 | "psrad $8, %%mm3 \n\t" | ||
1554 | "packssdw %%mm1, %%mm4 \n\t" | ||
1555 | "packssdw %%mm3, %%mm2 \n\t" | ||
1556 | "pmaddwd %%mm5, %%mm4 \n\t" | ||
1557 | "pmaddwd %%mm5, %%mm2 \n\t" | ||
1558 | "add $24, %%"FF_REG_d"\n\t" | ||
1559 | "packssdw %%mm2, %%mm4 \n\t" | ||
1560 | "psraw $7, %%mm4 \n\t" | ||
1561 | |||
1562 | "packuswb %%mm4, %%mm0 \n\t" | ||
1563 | "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t" | ||
1564 | |||
1565 | MOVNTQ" %%mm0, (%1, %%"FF_REG_a") \n\t" | ||
1566 | "add $8, %%"FF_REG_a" \n\t" | ||
1567 | " js 1b \n\t" | ||
1568 | : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv) | ||
1569 | NAMED_CONSTRAINTS_ADD(w1111,bgr2YOffset) | ||
1570 | : "%"FF_REG_a, "%"FF_REG_d | ||
1571 | ); | ||
1572 | ydst += lumStride; | ||
1573 | src += srcStride; | ||
1574 | } | ||
1575 | src -= srcStride*2; | ||
1576 | __asm__ volatile( | ||
1577 | "mov %4, %%"FF_REG_a"\n\t" | ||
1578 | "movq "MANGLE(w1111)", %%mm5 \n\t" | ||
1579 | "movq "BGR2U_IDX"(%5), %%mm6 \n\t" | ||
1580 | "pxor %%mm7, %%mm7 \n\t" | ||
1581 | "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t" | ||
1582 | "add %%"FF_REG_d", %%"FF_REG_d"\n\t" | ||
1583 | ".p2align 4 \n\t" | ||
1584 | "1: \n\t" | ||
1585 | PREFETCH" 64(%0, %%"FF_REG_d") \n\t" | ||
1586 | PREFETCH" 64(%1, %%"FF_REG_d") \n\t" | ||
1587 | "movq (%0, %%"FF_REG_d"), %%mm0 \n\t" | ||
1588 | "movq (%1, %%"FF_REG_d"), %%mm1 \n\t" | ||
1589 | "movq 6(%0, %%"FF_REG_d"), %%mm2 \n\t" | ||
1590 | "movq 6(%1, %%"FF_REG_d"), %%mm3 \n\t" | ||
1591 | PAVGB" %%mm1, %%mm0 \n\t" | ||
1592 | PAVGB" %%mm3, %%mm2 \n\t" | ||
1593 | "movq %%mm0, %%mm1 \n\t" | ||
1594 | "movq %%mm2, %%mm3 \n\t" | ||
1595 | "psrlq $24, %%mm0 \n\t" | ||
1596 | "psrlq $24, %%mm2 \n\t" | ||
1597 | PAVGB" %%mm1, %%mm0 \n\t" | ||
1598 | PAVGB" %%mm3, %%mm2 \n\t" | ||
1599 | "punpcklbw %%mm7, %%mm0 \n\t" | ||
1600 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
1601 | "movq "BGR2V_IDX"(%5), %%mm1 \n\t" | ||
1602 | "movq "BGR2V_IDX"(%5), %%mm3 \n\t" | ||
1603 | |||
1604 | "pmaddwd %%mm0, %%mm1 \n\t" | ||
1605 | "pmaddwd %%mm2, %%mm3 \n\t" | ||
1606 | "pmaddwd %%mm6, %%mm0 \n\t" | ||
1607 | "pmaddwd %%mm6, %%mm2 \n\t" | ||
1608 | "psrad $8, %%mm0 \n\t" | ||
1609 | "psrad $8, %%mm1 \n\t" | ||
1610 | "psrad $8, %%mm2 \n\t" | ||
1611 | "psrad $8, %%mm3 \n\t" | ||
1612 | "packssdw %%mm2, %%mm0 \n\t" | ||
1613 | "packssdw %%mm3, %%mm1 \n\t" | ||
1614 | "pmaddwd %%mm5, %%mm0 \n\t" | ||
1615 | "pmaddwd %%mm5, %%mm1 \n\t" | ||
1616 | "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 | ||
1617 | "psraw $7, %%mm0 \n\t" | ||
1618 | |||
1619 | "movq 12(%0, %%"FF_REG_d"), %%mm4 \n\t" | ||
1620 | "movq 12(%1, %%"FF_REG_d"), %%mm1 \n\t" | ||
1621 | "movq 18(%0, %%"FF_REG_d"), %%mm2 \n\t" | ||
1622 | "movq 18(%1, %%"FF_REG_d"), %%mm3 \n\t" | ||
1623 | PAVGB" %%mm1, %%mm4 \n\t" | ||
1624 | PAVGB" %%mm3, %%mm2 \n\t" | ||
1625 | "movq %%mm4, %%mm1 \n\t" | ||
1626 | "movq %%mm2, %%mm3 \n\t" | ||
1627 | "psrlq $24, %%mm4 \n\t" | ||
1628 | "psrlq $24, %%mm2 \n\t" | ||
1629 | PAVGB" %%mm1, %%mm4 \n\t" | ||
1630 | PAVGB" %%mm3, %%mm2 \n\t" | ||
1631 | "punpcklbw %%mm7, %%mm4 \n\t" | ||
1632 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
1633 | "movq "BGR2V_IDX"(%5), %%mm1 \n\t" | ||
1634 | "movq "BGR2V_IDX"(%5), %%mm3 \n\t" | ||
1635 | |||
1636 | "pmaddwd %%mm4, %%mm1 \n\t" | ||
1637 | "pmaddwd %%mm2, %%mm3 \n\t" | ||
1638 | "pmaddwd %%mm6, %%mm4 \n\t" | ||
1639 | "pmaddwd %%mm6, %%mm2 \n\t" | ||
1640 | "psrad $8, %%mm4 \n\t" | ||
1641 | "psrad $8, %%mm1 \n\t" | ||
1642 | "psrad $8, %%mm2 \n\t" | ||
1643 | "psrad $8, %%mm3 \n\t" | ||
1644 | "packssdw %%mm2, %%mm4 \n\t" | ||
1645 | "packssdw %%mm3, %%mm1 \n\t" | ||
1646 | "pmaddwd %%mm5, %%mm4 \n\t" | ||
1647 | "pmaddwd %%mm5, %%mm1 \n\t" | ||
1648 | "add $24, %%"FF_REG_d"\n\t" | ||
1649 | "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 | ||
1650 | "psraw $7, %%mm4 \n\t" | ||
1651 | |||
1652 | "movq %%mm0, %%mm1 \n\t" | ||
1653 | "punpckldq %%mm4, %%mm0 \n\t" | ||
1654 | "punpckhdq %%mm4, %%mm1 \n\t" | ||
1655 | "packsswb %%mm1, %%mm0 \n\t" | ||
1656 | "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t" | ||
1657 | "movd %%mm0, (%2, %%"FF_REG_a") \n\t" | ||
1658 | "punpckhdq %%mm0, %%mm0 \n\t" | ||
1659 | "movd %%mm0, (%3, %%"FF_REG_a") \n\t" | ||
1660 | "add $4, %%"FF_REG_a" \n\t" | ||
1661 | " js 1b \n\t" | ||
1662 | : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv) | ||
1663 | NAMED_CONSTRAINTS_ADD(w1111,bgr2UVOffset) | ||
1664 | : "%"FF_REG_a, "%"FF_REG_d | ||
1665 | ); | ||
1666 | |||
1667 | udst += chromStride; | ||
1668 | vdst += chromStride; | ||
1669 | src += srcStride*2; | ||
1670 | } | ||
1671 | |||
1672 | __asm__ volatile(EMMS" \n\t" | ||
1673 | SFENCE" \n\t" | ||
1674 | :::"memory"); | ||
1675 | |||
1676 | ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv); | ||
1677 | } | ||
1678 | #endif /* ARCH_X86_32 && HAVE_7REGS */ | ||
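|||
| /* Editor's sketch (hypothetical helper, not part of this file): the chroma | ||
| * pass of rgb24toyv12_mmxext above uses PAVGB to average vertically adjacent | ||
| * lines and horizontally adjacent pixels before the U/V matrix multiply. | ||
| * PAVGB is an unsigned byte average that rounds upward: */ | ||
| static inline uint8_t pavgb_sketch(uint8_t a, uint8_t b) | ||
| { | ||
|     return (uint8_t)((a + b + 1) >> 1); /* rounds .5 up, matching PAVGB */ | ||
| } | ||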
1679 | |||
1680 | ✗ | static inline void vu9_to_vu12_mmxext(const uint8_t *src1, const uint8_t *src2, | |
1681 | uint8_t *dst1, uint8_t *dst2, | ||
1682 | int width, int height, | ||
1683 | int srcStride1, int srcStride2, | ||
1684 | int dstStride1, int dstStride2) | ||
1685 | { | ||
1686 | int w,h; | ||
1687 | ✗ | w=width/2; h=height/2; | |
1688 | ✗ | __asm__ volatile( | |
1689 | PREFETCH" %0 \n\t" | ||
1690 | PREFETCH" %1 \n\t" | ||
1691 | ✗ | ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory"); | |
1692 | ✗ | for (x86_reg y = 0; y < h; y++) { | |
1693 | ✗ | const uint8_t* s1=src1+srcStride1*(y>>1); | |
1694 | ✗ | uint8_t* d=dst1+dstStride1*y; | |
1695 | ✗ | x86_reg x = 0; | |
1696 | ✗ | for (;x<w-31;x+=32) { | |
1697 | ✗ | __asm__ volatile( | |
1698 | PREFETCH" 32(%1,%2) \n\t" | ||
1699 | "movq (%1,%2), %%mm0 \n\t" | ||
1700 | "movq 8(%1,%2), %%mm2 \n\t" | ||
1701 | "movq 16(%1,%2), %%mm4 \n\t" | ||
1702 | "movq 24(%1,%2), %%mm6 \n\t" | ||
1703 | "movq %%mm0, %%mm1 \n\t" | ||
1704 | "movq %%mm2, %%mm3 \n\t" | ||
1705 | "movq %%mm4, %%mm5 \n\t" | ||
1706 | "movq %%mm6, %%mm7 \n\t" | ||
1707 | "punpcklbw %%mm0, %%mm0 \n\t" | ||
1708 | "punpckhbw %%mm1, %%mm1 \n\t" | ||
1709 | "punpcklbw %%mm2, %%mm2 \n\t" | ||
1710 | "punpckhbw %%mm3, %%mm3 \n\t" | ||
1711 | "punpcklbw %%mm4, %%mm4 \n\t" | ||
1712 | "punpckhbw %%mm5, %%mm5 \n\t" | ||
1713 | "punpcklbw %%mm6, %%mm6 \n\t" | ||
1714 | "punpckhbw %%mm7, %%mm7 \n\t" | ||
1715 | MOVNTQ" %%mm0, (%0,%2,2) \n\t" | ||
1716 | MOVNTQ" %%mm1, 8(%0,%2,2) \n\t" | ||
1717 | MOVNTQ" %%mm2, 16(%0,%2,2) \n\t" | ||
1718 | MOVNTQ" %%mm3, 24(%0,%2,2) \n\t" | ||
1719 | MOVNTQ" %%mm4, 32(%0,%2,2) \n\t" | ||
1720 | MOVNTQ" %%mm5, 40(%0,%2,2) \n\t" | ||
1721 | MOVNTQ" %%mm6, 48(%0,%2,2) \n\t" | ||
1722 | MOVNTQ" %%mm7, 56(%0,%2,2)" | ||
1723 | :: "r"(d), "r"(s1), "r"(x) | ||
1724 | :"memory"); | ||
1725 | } | ||
1726 | ✗ | for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x]; | |
1727 | } | ||
1728 | ✗ | for (x86_reg y = 0; y < h; y++) { | |
1729 | ✗ | const uint8_t* s2=src2+srcStride2*(y>>1); | |
1730 | ✗ | uint8_t* d=dst2+dstStride2*y; | |
1731 | ✗ | x86_reg x = 0; | |
1732 | ✗ | for (;x<w-31;x+=32) { | |
1733 | ✗ | __asm__ volatile( | |
1734 | PREFETCH" 32(%1,%2) \n\t" | ||
1735 | "movq (%1,%2), %%mm0 \n\t" | ||
1736 | "movq 8(%1,%2), %%mm2 \n\t" | ||
1737 | "movq 16(%1,%2), %%mm4 \n\t" | ||
1738 | "movq 24(%1,%2), %%mm6 \n\t" | ||
1739 | "movq %%mm0, %%mm1 \n\t" | ||
1740 | "movq %%mm2, %%mm3 \n\t" | ||
1741 | "movq %%mm4, %%mm5 \n\t" | ||
1742 | "movq %%mm6, %%mm7 \n\t" | ||
1743 | "punpcklbw %%mm0, %%mm0 \n\t" | ||
1744 | "punpckhbw %%mm1, %%mm1 \n\t" | ||
1745 | "punpcklbw %%mm2, %%mm2 \n\t" | ||
1746 | "punpckhbw %%mm3, %%mm3 \n\t" | ||
1747 | "punpcklbw %%mm4, %%mm4 \n\t" | ||
1748 | "punpckhbw %%mm5, %%mm5 \n\t" | ||
1749 | "punpcklbw %%mm6, %%mm6 \n\t" | ||
1750 | "punpckhbw %%mm7, %%mm7 \n\t" | ||
1751 | MOVNTQ" %%mm0, (%0,%2,2) \n\t" | ||
1752 | MOVNTQ" %%mm1, 8(%0,%2,2) \n\t" | ||
1753 | MOVNTQ" %%mm2, 16(%0,%2,2) \n\t" | ||
1754 | MOVNTQ" %%mm3, 24(%0,%2,2) \n\t" | ||
1755 | MOVNTQ" %%mm4, 32(%0,%2,2) \n\t" | ||
1756 | MOVNTQ" %%mm5, 40(%0,%2,2) \n\t" | ||
1757 | MOVNTQ" %%mm6, 48(%0,%2,2) \n\t" | ||
1758 | MOVNTQ" %%mm7, 56(%0,%2,2)" | ||
1759 | :: "r"(d), "r"(s2), "r"(x) | ||
1760 | :"memory"); | ||
1761 | } | ||
1762 | ✗ | for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x]; | |
1763 | } | ||
1764 | ✗ | __asm__( | |
1765 | EMMS" \n\t" | ||
1766 | SFENCE" \n\t" | ||
1767 | ::: "memory" | ||
1768 | ); | ||
1769 | ✗ | } | |
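|||
| /* Editor's note: vu9_to_vu12 upsamples each chroma plane 2x in both | ||
| * directions: punpck{l,h}bw of a register with itself doubles every byte | ||
| * horizontally, and the y >> 1 source indexing repeats every source line. */ | ||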
1770 | |||
1771 | ✗ | static inline void yvu9_to_yuy2_mmxext(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, | |
1772 | uint8_t *dst, | ||
1773 | int width, int height, | ||
1774 | int srcStride1, int srcStride2, | ||
1775 | int srcStride3, int dstStride) | ||
1776 | { | ||
1777 | int w,h; | ||
1778 | ✗ | w=width/2; h=height; | |
1779 | ✗ | for (int y = 0; y < h; y++) { | |
1780 | ✗ | const uint8_t* yp=src1+srcStride1*y; | |
1781 | ✗ | const uint8_t* up=src2+srcStride2*(y>>2); | |
1782 | ✗ | const uint8_t* vp=src3+srcStride3*(y>>2); | |
1783 | ✗ | uint8_t* d=dst+dstStride*y; | |
1784 | ✗ | x86_reg x = 0; | |
1785 | ✗ | for (;x<w-7;x+=8) { | |
1786 | ✗ | __asm__ volatile( | |
1787 | PREFETCH" 32(%1, %0) \n\t" | ||
1788 | PREFETCH" 32(%2, %0) \n\t" | ||
1789 | PREFETCH" 32(%3, %0) \n\t" | ||
1790 | "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ | ||
1791 | "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */ | ||
1792 | "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */ | ||
1793 | "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ | ||
1794 | "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */ | ||
1795 | "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */ | ||
1796 | "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */ | ||
1797 | "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */ | ||
1798 | "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */ | ||
1799 | "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */ | ||
1800 | |||
1801 | "movq %%mm1, %%mm6 \n\t" | ||
1802 | "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/ | ||
1803 | "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/ | ||
1804 | "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/ | ||
1805 | MOVNTQ" %%mm0, (%4, %0, 8) \n\t" | ||
1806 | MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t" | ||
1807 | |||
1808 | "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/ | ||
1809 | "movq 8(%1, %0, 4), %%mm0 \n\t" | ||
1810 | "movq %%mm0, %%mm3 \n\t" | ||
1811 | "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/ | ||
1812 | "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/ | ||
1813 | MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t" | ||
1814 | MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t" | ||
1815 | |||
1816 | "movq %%mm4, %%mm6 \n\t" | ||
1817 | "movq 16(%1, %0, 4), %%mm0 \n\t" | ||
1818 | "movq %%mm0, %%mm3 \n\t" | ||
1819 | "punpcklbw %%mm5, %%mm4 \n\t" | ||
1820 | "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/ | ||
1821 | "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/ | ||
1822 | MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t" | ||
1823 | MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t" | ||
1824 | |||
1825 | "punpckhbw %%mm5, %%mm6 \n\t" | ||
1826 | "movq 24(%1, %0, 4), %%mm0 \n\t" | ||
1827 | "movq %%mm0, %%mm3 \n\t" | ||
1828 | "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/ | ||
1829 | "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/ | ||
1830 | MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t" | ||
1831 | MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t" | ||
1832 | |||
1833 | : "+r" (x) | ||
1834 | : "r"(yp), "r" (up), "r"(vp), "r"(d) | ||
1835 | :"memory"); | ||
1836 | } | ||
1837 | ✗ | for (; x<w; x++) { | |
1838 | ✗ | const int x2 = x<<2; | |
1839 | ✗ | d[8*x+0] = yp[x2]; | |
1840 | ✗ | d[8*x+1] = up[x]; | |
1841 | ✗ | d[8*x+2] = yp[x2+1]; | |
1842 | ✗ | d[8*x+3] = vp[x]; | |
1843 | ✗ | d[8*x+4] = yp[x2+2]; | |
1844 | ✗ | d[8*x+5] = up[x]; | |
1845 | ✗ | d[8*x+6] = yp[x2+3]; | |
1846 | ✗ | d[8*x+7] = vp[x]; | |
1847 | } | ||
1848 | } | ||
1849 | ✗ | __asm__( | |
1850 | EMMS" \n\t" | ||
1851 | SFENCE" \n\t" | ||
1852 | ::: "memory" | ||
1853 | ); | ||
1854 | ✗ | } | |
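|||
| /* Editor's note: yvu9_to_yuy2 packs planes with 4x4-subsampled chroma into | ||
| * YUY2; as the scalar tail above spells out, each U/V sample is reused for | ||
| * four luma samples per line and for four lines (the y >> 2 indexing). */ | ||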
1855 | |||
1856 | ✗ | static void extract_even_mmxext(const uint8_t *src, uint8_t *dst, x86_reg count) | |
1857 | { | ||
1858 | ✗ | dst += count; | |
1859 | ✗ | src += 2*count; | |
1860 | ✗ | count= - count; | |
1861 | |||
1862 | ✗ | if(count <= -16) { | |
1863 | ✗ | count += 15; | |
1864 | ✗ | __asm__ volatile( | |
1865 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
1866 | "psrlw $8, %%mm7 \n\t" | ||
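|                     /* mm7 = 0x00FF in every word: pand keeps the even (low) byte of each 16-bit lane */ | ||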
1867 | "1: \n\t" | ||
1868 | "movq -30(%1, %0, 2), %%mm0 \n\t" | ||
1869 | "movq -22(%1, %0, 2), %%mm1 \n\t" | ||
1870 | "movq -14(%1, %0, 2), %%mm2 \n\t" | ||
1871 | "movq -6(%1, %0, 2), %%mm3 \n\t" | ||
1872 | "pand %%mm7, %%mm0 \n\t" | ||
1873 | "pand %%mm7, %%mm1 \n\t" | ||
1874 | "pand %%mm7, %%mm2 \n\t" | ||
1875 | "pand %%mm7, %%mm3 \n\t" | ||
1876 | "packuswb %%mm1, %%mm0 \n\t" | ||
1877 | "packuswb %%mm3, %%mm2 \n\t" | ||
1878 | MOVNTQ" %%mm0,-15(%2, %0) \n\t" | ||
1879 | MOVNTQ" %%mm2,- 7(%2, %0) \n\t" | ||
1880 | "add $16, %0 \n\t" | ||
1881 | " js 1b \n\t" | ||
1882 | : "+r"(count) | ||
1883 | : "r"(src), "r"(dst) | ||
1884 | ); | ||
1885 | ✗ | count -= 15; | |
1886 | } | ||
1887 | ✗ | while(count<0) { | |
1888 | ✗ | dst[count]= src[2*count]; | |
1889 | ✗ | count++; | |
1890 | } | ||
1891 | ✗ | } | |
1892 | |||
1893 | ✗ | static void extract_odd_mmxext(const uint8_t *src, uint8_t *dst, x86_reg count) | |
1894 | { | ||
1895 | ✗ | src ++; | |
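|     /* advancing src by one byte makes the even-byte mask below pick up the odd bytes */ | ||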
1896 | ✗ | dst += count; | |
1897 | ✗ | src += 2*count; | |
1898 | ✗ | count= - count; | |
1899 | |||
1900 | ✗ | if(count < -16) { | |
1901 | ✗ | count += 16; | |
1902 | ✗ | __asm__ volatile( | |
1903 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
1904 | "psrlw $8, %%mm7 \n\t" | ||
1905 | "1: \n\t" | ||
1906 | "movq -32(%1, %0, 2), %%mm0 \n\t" | ||
1907 | "movq -24(%1, %0, 2), %%mm1 \n\t" | ||
1908 | "movq -16(%1, %0, 2), %%mm2 \n\t" | ||
1909 | "movq -8(%1, %0, 2), %%mm3 \n\t" | ||
1910 | "pand %%mm7, %%mm0 \n\t" | ||
1911 | "pand %%mm7, %%mm1 \n\t" | ||
1912 | "pand %%mm7, %%mm2 \n\t" | ||
1913 | "pand %%mm7, %%mm3 \n\t" | ||
1914 | "packuswb %%mm1, %%mm0 \n\t" | ||
1915 | "packuswb %%mm3, %%mm2 \n\t" | ||
1916 | MOVNTQ" %%mm0,-16(%2, %0) \n\t" | ||
1917 | MOVNTQ" %%mm2,- 8(%2, %0) \n\t" | ||
1918 | "add $16, %0 \n\t" | ||
1919 | " js 1b \n\t" | ||
1920 | : "+r"(count) | ||
1921 | : "r"(src), "r"(dst) | ||
1922 | ); | ||
1923 | ✗ | count -= 16; | |
1924 | } | ||
1925 | ✗ | while(count<0) { | |
1926 | ✗ | dst[count]= src[2*count]; | |
1927 | ✗ | count++; | |
1928 | } | ||
1929 | ✗ | } | |
1930 | |||
1931 | #if ARCH_X86_32 | ||
1932 | static void extract_even2_mmxext(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count) | ||
1933 | { | ||
1934 | dst0+= count; | ||
1935 | dst1+= count; | ||
1936 | src += 4*count; | ||
1937 | count= - count; | ||
1938 | if(count <= -8) { | ||
1939 | count += 7; | ||
1940 | __asm__ volatile( | ||
1941 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
1942 | "psrlw $8, %%mm7 \n\t" | ||
1943 | "1: \n\t" | ||
1944 | "movq -28(%1, %0, 4), %%mm0 \n\t" | ||
1945 | "movq -20(%1, %0, 4), %%mm1 \n\t" | ||
1946 | "movq -12(%1, %0, 4), %%mm2 \n\t" | ||
1947 | "movq -4(%1, %0, 4), %%mm3 \n\t" | ||
1948 | "pand %%mm7, %%mm0 \n\t" | ||
1949 | "pand %%mm7, %%mm1 \n\t" | ||
1950 | "pand %%mm7, %%mm2 \n\t" | ||
1951 | "pand %%mm7, %%mm3 \n\t" | ||
1952 | "packuswb %%mm1, %%mm0 \n\t" | ||
1953 | "packuswb %%mm3, %%mm2 \n\t" | ||
1954 | "movq %%mm0, %%mm1 \n\t" | ||
1955 | "movq %%mm2, %%mm3 \n\t" | ||
1956 | "psrlw $8, %%mm0 \n\t" | ||
1957 | "psrlw $8, %%mm2 \n\t" | ||
1958 | "pand %%mm7, %%mm1 \n\t" | ||
1959 | "pand %%mm7, %%mm3 \n\t" | ||
1960 | "packuswb %%mm2, %%mm0 \n\t" | ||
1961 | "packuswb %%mm3, %%mm1 \n\t" | ||
1962 | MOVNTQ" %%mm0,- 7(%3, %0) \n\t" | ||
1963 | MOVNTQ" %%mm1,- 7(%2, %0) \n\t" | ||
1964 | "add $8, %0 \n\t" | ||
1965 | " js 1b \n\t" | ||
1966 | : "+r"(count) | ||
1967 | : "r"(src), "r"(dst0), "r"(dst1) | ||
1968 | ); | ||
1969 | count -= 7; | ||
1970 | } | ||
1971 | while(count<0) { | ||
1972 | dst0[count]= src[4*count+0]; | ||
1973 | dst1[count]= src[4*count+2]; | ||
1974 | count++; | ||
1975 | } | ||
1976 | } | ||
1977 | #endif /* ARCH_X86_32 */ | ||
1978 | |||
1979 | ✗ | static void extract_even2avg_mmxext(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count) | |
1980 | { | ||
1981 | ✗ | dst0 += count; | |
1982 | ✗ | dst1 += count; | |
1983 | ✗ | src0 += 4*count; | |
1984 | ✗ | src1 += 4*count; | |
1985 | ✗ | count= - count; | |
1986 | #ifdef PAVGB | ||
1987 | ✗ | if(count <= -8) { | |
1988 | ✗ | count += 7; | |
1989 | ✗ | __asm__ volatile( | |
1990 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
1991 | "psrlw $8, %%mm7 \n\t" | ||
1992 | "1: \n\t" | ||
1993 | "movq -28(%1, %0, 4), %%mm0 \n\t" | ||
1994 | "movq -20(%1, %0, 4), %%mm1 \n\t" | ||
1995 | "movq -12(%1, %0, 4), %%mm2 \n\t" | ||
1996 | "movq -4(%1, %0, 4), %%mm3 \n\t" | ||
1997 | PAVGB" -28(%2, %0, 4), %%mm0 \n\t" | ||
1998 | PAVGB" -20(%2, %0, 4), %%mm1 \n\t" | ||
1999 | PAVGB" -12(%2, %0, 4), %%mm2 \n\t" | ||
2000 | PAVGB" - 4(%2, %0, 4), %%mm3 \n\t" | ||
2001 | "pand %%mm7, %%mm0 \n\t" | ||
2002 | "pand %%mm7, %%mm1 \n\t" | ||
2003 | "pand %%mm7, %%mm2 \n\t" | ||
2004 | "pand %%mm7, %%mm3 \n\t" | ||
2005 | "packuswb %%mm1, %%mm0 \n\t" | ||
2006 | "packuswb %%mm3, %%mm2 \n\t" | ||
2007 | "movq %%mm0, %%mm1 \n\t" | ||
2008 | "movq %%mm2, %%mm3 \n\t" | ||
2009 | "psrlw $8, %%mm0 \n\t" | ||
2010 | "psrlw $8, %%mm2 \n\t" | ||
2011 | "pand %%mm7, %%mm1 \n\t" | ||
2012 | "pand %%mm7, %%mm3 \n\t" | ||
2013 | "packuswb %%mm2, %%mm0 \n\t" | ||
2014 | "packuswb %%mm3, %%mm1 \n\t" | ||
2015 | MOVNTQ" %%mm0,- 7(%4, %0) \n\t" | ||
2016 | MOVNTQ" %%mm1,- 7(%3, %0) \n\t" | ||
2017 | "add $8, %0 \n\t" | ||
2018 | " js 1b \n\t" | ||
2019 | : "+r"(count) | ||
2020 | : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1) | ||
2021 | ); | ||
2022 | ✗ | count -= 7; | |
2023 | } | ||
2024 | #endif | ||
2025 | ✗ | while(count<0) { | |
2026 | ✗ | dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1; | |
2027 | ✗ | dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1; | |
2028 | ✗ | count++; | |
2029 | } | ||
2030 | ✗ | } | |
2031 | |||
2032 | ✗ | static void extract_odd2_mmxext(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count) | |
2033 | { | ||
2034 | ✗ | dst0+= count; | |
2035 | ✗ | dst1+= count; | |
2036 | ✗ | src += 4*count; | |
2037 | ✗ | count= - count; | |
2038 | ✗ | if(count <= -8) { | |
2039 | ✗ | count += 7; | |
2040 | ✗ | __asm__ volatile( | |
2041 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
2042 | "psrlw $8, %%mm7 \n\t" | ||
2043 | "1: \n\t" | ||
2044 | "movq -28(%1, %0, 4), %%mm0 \n\t" | ||
2045 | "movq -20(%1, %0, 4), %%mm1 \n\t" | ||
2046 | "movq -12(%1, %0, 4), %%mm2 \n\t" | ||
2047 | "movq -4(%1, %0, 4), %%mm3 \n\t" | ||
2048 | "psrlw $8, %%mm0 \n\t" | ||
2049 | "psrlw $8, %%mm1 \n\t" | ||
2050 | "psrlw $8, %%mm2 \n\t" | ||
2051 | "psrlw $8, %%mm3 \n\t" | ||
2052 | "packuswb %%mm1, %%mm0 \n\t" | ||
2053 | "packuswb %%mm3, %%mm2 \n\t" | ||
2054 | "movq %%mm0, %%mm1 \n\t" | ||
2055 | "movq %%mm2, %%mm3 \n\t" | ||
2056 | "psrlw $8, %%mm0 \n\t" | ||
2057 | "psrlw $8, %%mm2 \n\t" | ||
2058 | "pand %%mm7, %%mm1 \n\t" | ||
2059 | "pand %%mm7, %%mm3 \n\t" | ||
2060 | "packuswb %%mm2, %%mm0 \n\t" | ||
2061 | "packuswb %%mm3, %%mm1 \n\t" | ||
2062 | MOVNTQ" %%mm0,- 7(%3, %0) \n\t" | ||
2063 | MOVNTQ" %%mm1,- 7(%2, %0) \n\t" | ||
2064 | "add $8, %0 \n\t" | ||
2065 | " js 1b \n\t" | ||
2066 | : "+r"(count) | ||
2067 | : "r"(src), "r"(dst0), "r"(dst1) | ||
2068 | ); | ||
2069 | ✗ | count -= 7; | |
2070 | } | ||
2071 | ✗ | src++; | |
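|     /* the asm read odd bytes via psrlw $8; src is advanced so the scalar tail below reads odd bytes too */ | ||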
2072 | ✗ | while(count<0) { | |
2073 | ✗ | dst0[count]= src[4*count+0]; | |
2074 | ✗ | dst1[count]= src[4*count+2]; | |
2075 | ✗ | count++; | |
2076 | } | ||
2077 | ✗ | } | |
2078 | |||
2079 | ✗ | static void extract_odd2avg_mmxext(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count) | |
2080 | { | ||
2081 | ✗ | dst0 += count; | |
2082 | ✗ | dst1 += count; | |
2083 | ✗ | src0 += 4*count; | |
2084 | ✗ | src1 += 4*count; | |
2085 | ✗ | count= - count; | |
2086 | #ifdef PAVGB | ||
2087 | ✗ | if(count <= -8) { | |
2088 | ✗ | count += 7; | |
2089 | ✗ | __asm__ volatile( | |
2090 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
2091 | "psrlw $8, %%mm7 \n\t" | ||
2092 | "1: \n\t" | ||
2093 | "movq -28(%1, %0, 4), %%mm0 \n\t" | ||
2094 | "movq -20(%1, %0, 4), %%mm1 \n\t" | ||
2095 | "movq -12(%1, %0, 4), %%mm2 \n\t" | ||
2096 | "movq -4(%1, %0, 4), %%mm3 \n\t" | ||
2097 | PAVGB" -28(%2, %0, 4), %%mm0 \n\t" | ||
2098 | PAVGB" -20(%2, %0, 4), %%mm1 \n\t" | ||
2099 | PAVGB" -12(%2, %0, 4), %%mm2 \n\t" | ||
2100 | PAVGB" - 4(%2, %0, 4), %%mm3 \n\t" | ||
2101 | "psrlw $8, %%mm0 \n\t" | ||
2102 | "psrlw $8, %%mm1 \n\t" | ||
2103 | "psrlw $8, %%mm2 \n\t" | ||
2104 | "psrlw $8, %%mm3 \n\t" | ||
2105 | "packuswb %%mm1, %%mm0 \n\t" | ||
2106 | "packuswb %%mm3, %%mm2 \n\t" | ||
2107 | "movq %%mm0, %%mm1 \n\t" | ||
2108 | "movq %%mm2, %%mm3 \n\t" | ||
2109 | "psrlw $8, %%mm0 \n\t" | ||
2110 | "psrlw $8, %%mm2 \n\t" | ||
2111 | "pand %%mm7, %%mm1 \n\t" | ||
2112 | "pand %%mm7, %%mm3 \n\t" | ||
2113 | "packuswb %%mm2, %%mm0 \n\t" | ||
2114 | "packuswb %%mm3, %%mm1 \n\t" | ||
2115 | MOVNTQ" %%mm0,- 7(%4, %0) \n\t" | ||
2116 | MOVNTQ" %%mm1,- 7(%3, %0) \n\t" | ||
2117 | "add $8, %0 \n\t" | ||
2118 | " js 1b \n\t" | ||
2119 | : "+r"(count) | ||
2120 | : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1) | ||
2121 | ); | ||
2122 | ✗ | count -= 7; | |
2123 | } | ||
2124 | #endif | ||
2125 | ✗ | src0++; | |
2126 | ✗ | src1++; | |
2127 | ✗ | while(count<0) { | |
2128 | ✗ | dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1; | |
2129 | ✗ | dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1; | |
2130 | ✗ | count++; | |
2131 | } | ||
2132 | ✗ | } | |
2133 | |||
2134 | ✗ | static void yuyvtoyuv420_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, | |
2135 | int width, int height, | ||
2136 | int lumStride, int chromStride, int srcStride) | ||
2137 | { | ||
2138 | ✗ | const int chromWidth = AV_CEIL_RSHIFT(width, 1); | |
2139 | |||
2140 | ✗ | for (int y = 0; y < height; y++) { | |
2141 | ✗ | extract_even_mmxext(src, ydst, width); | |
2142 | ✗ | if(y&1) { | |
2143 | ✗ | extract_odd2avg_mmxext(src-srcStride, src, udst, vdst, chromWidth); | |
2144 | ✗ | udst+= chromStride; | |
2145 | ✗ | vdst+= chromStride; | |
2146 | } | ||
2147 | |||
2148 | ✗ | src += srcStride; | |
2149 | ✗ | ydst+= lumStride; | |
2150 | } | ||
2151 | ✗ | __asm__( | |
2152 | EMMS" \n\t" | ||
2153 | SFENCE" \n\t" | ||
2154 | ::: "memory" | ||
2155 | ); | ||
2156 | ✗ | } | |
2157 | |||
2158 | ✗ | static void yuyvtoyuv422_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, | |
2159 | int width, int height, | ||
2160 | int lumStride, int chromStride, int srcStride) | ||
2161 | { | ||
2162 | ✗ | const int chromWidth = AV_CEIL_RSHIFT(width, 1); | |
2163 | |||
2164 | ✗ | for (int y = 0; y < height; y++) { | |
2165 | ✗ | extract_even_mmxext(src, ydst, width); | |
2166 | ✗ | extract_odd2_mmxext(src, udst, vdst, chromWidth); | |
2167 | |||
2168 | ✗ | src += srcStride; | |
2169 | ✗ | ydst+= lumStride; | |
2170 | ✗ | udst+= chromStride; | |
2171 | ✗ | vdst+= chromStride; | |
2172 | } | ||
2173 | ✗ | __asm__( | |
2174 | EMMS" \n\t" | ||
2175 | SFENCE" \n\t" | ||
2176 | ::: "memory" | ||
2177 | ); | ||
2178 | ✗ | } | |
2179 | |||
2180 | ✗ | static void uyvytoyuv420_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, | |
2181 | int width, int height, | ||
2182 | int lumStride, int chromStride, int srcStride) | ||
2183 | { | ||
2184 | ✗ | const int chromWidth = AV_CEIL_RSHIFT(width, 1); | |
2185 | |||
2186 | ✗ | for (int y = 0; y < height; y++) { | |
2187 | ✗ | extract_odd_mmxext(src, ydst, width); | |
2188 | ✗ | if(y&1) { | |
2189 | ✗ | extract_even2avg_mmxext(src-srcStride, src, udst, vdst, chromWidth); | |
2190 | ✗ | udst+= chromStride; | |
2191 | ✗ | vdst+= chromStride; | |
2192 | } | ||
2193 | |||
2194 | ✗ | src += srcStride; | |
2195 | ✗ | ydst+= lumStride; | |
2196 | } | ||
2197 | ✗ | __asm__( | |
2198 | EMMS" \n\t" | ||
2199 | SFENCE" \n\t" | ||
2200 | ::: "memory" | ||
2201 | ); | ||
2202 | ✗ | } | |
2203 | |||
2204 | #if ARCH_X86_32 | ||
2205 | static void uyvytoyuv422_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, | ||
2206 | int width, int height, | ||
2207 | int lumStride, int chromStride, int srcStride) | ||
2208 | { | ||
2209 | const int chromWidth = AV_CEIL_RSHIFT(width, 1); | ||
2210 | |||
2211 | for (int y = 0; y < height; y++) { | ||
2212 | extract_odd_mmxext(src, ydst, width); | ||
2213 | extract_even2_mmxext(src, udst, vdst, chromWidth); | ||
2214 | |||
2215 | src += srcStride; | ||
2216 | ydst+= lumStride; | ||
2217 | udst+= chromStride; | ||
2218 | vdst+= chromStride; | ||
2219 | } | ||
2220 | __asm__( | ||
2221 | EMMS" \n\t" | ||
2222 | SFENCE" \n\t" | ||
2223 | ::: "memory" | ||
2224 | ); | ||
2225 | } | ||
2226 | #endif /* ARCH_X86_32 */ | ||
2227 | |||
2228 | 16 | static av_cold void rgb2rgb_init_mmxext(void) | |
2229 | { | ||
2230 | 16 | rgb15to16 = rgb15to16_mmxext; | |
2231 | 16 | rgb15tobgr24 = rgb15tobgr24_mmxext; | |
2232 | 16 | rgb15to32 = rgb15to32_mmxext; | |
2233 | 16 | rgb16tobgr24 = rgb16tobgr24_mmxext; | |
2234 | 16 | rgb16to32 = rgb16to32_mmxext; | |
2235 | 16 | rgb16to15 = rgb16to15_mmxext; | |
2236 | 16 | rgb24tobgr16 = rgb24tobgr16_mmxext; | |
2237 | 16 | rgb24tobgr15 = rgb24tobgr15_mmxext; | |
2238 | 16 | rgb24tobgr32 = rgb24tobgr32_mmxext; | |
2239 | 16 | rgb32to16 = rgb32to16_mmxext; | |
2240 | 16 | rgb32to15 = rgb32to15_mmxext; | |
2241 | 16 | rgb32tobgr24 = rgb32tobgr24_mmxext; | |
2242 | 16 | rgb24to15 = rgb24to15_mmxext; | |
2243 | 16 | rgb24to16 = rgb24to16_mmxext; | |
2244 | 16 | rgb24tobgr24 = rgb24tobgr24_mmxext; | |
2245 | 16 | rgb32tobgr16 = rgb32tobgr16_mmxext; | |
2246 | 16 | rgb32tobgr15 = rgb32tobgr15_mmxext; | |
2247 | 16 | yv12toyuy2 = yv12toyuy2_mmxext; | |
2248 | 16 | yv12touyvy = yv12touyvy_mmxext; | |
2249 | 16 | yuv422ptoyuy2 = yuv422ptoyuy2_mmxext; | |
2250 | 16 | yuv422ptouyvy = yuv422ptouyvy_mmxext; | |
2251 | 16 | yuy2toyv12 = yuy2toyv12_mmxext; | |
2252 | 16 | vu9_to_vu12 = vu9_to_vu12_mmxext; | |
2253 | 16 | yvu9_to_yuy2 = yvu9_to_yuy2_mmxext; | |
2254 | #if ARCH_X86_32 | ||
2255 | uyvytoyuv422 = uyvytoyuv422_mmxext; | ||
2256 | #endif | ||
2257 | 16 | yuyvtoyuv422 = yuyvtoyuv422_mmxext; | |
2258 | |||
2259 | 16 | planar2x = planar2x_mmxext; | |
2260 | #if ARCH_X86_32 && HAVE_7REGS | ||
2261 | ff_rgb24toyv12 = rgb24toyv12_mmxext; | ||
2262 | #endif /* ARCH_X86_32 && HAVE_7REGS */ | ||
2263 | |||
2264 | 16 | yuyvtoyuv420 = yuyvtoyuv420_mmxext; | |
2265 | 16 | uyvytoyuv420 = uyvytoyuv420_mmxext; | |
2266 | 16 | } | |
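|||
| /* Editor's note: the assignments above rebind the global function pointers | ||
| * declared in libswscale/rgb2rgb.h, so a later call such as | ||
| *     rgb24tobgr24(src, dst, src_size); | ||
| * transparently dispatches to the MMXEXT version once this init has run; | ||
| * this is a note on the existing dispatch pattern, not new API. */ | ||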
2267 | |||
2268 | // SSE2 versions | ||
2269 | 17 | static void interleave_bytes_sse2(const uint8_t *src1, const uint8_t *src2, uint8_t *dest, | |
2270 | int width, int height, int src1Stride, | ||
2271 | int src2Stride, int dstStride) | ||
2272 | { | ||
2273 | 2/2 ✓ Branch 0 taken 901 times. ✓ Branch 1 taken 17 times. | 918 | for (int h = 0; h < height; h++) { | |
2274 | 2/2 ✓ Branch 0 taken 78 times. ✓ Branch 1 taken 823 times. | 901 | if (width >= 16) { | |
2275 | 1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 78 times. | 78 | if (!((((intptr_t)src1) | ((intptr_t)src2) | ((intptr_t)dest))&15)) { | |
2276 | ✗ | __asm__( | |
2277 | "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" | ||
2278 | "1: \n\t" | ||
2279 | PREFETCH" 64(%1, %%"FF_REG_a") \n\t" | ||
2280 | PREFETCH" 64(%2, %%"FF_REG_a") \n\t" | ||
2281 | "movdqa (%1, %%"FF_REG_a"), %%xmm0 \n\t" | ||
2282 | "movdqa (%1, %%"FF_REG_a"), %%xmm1 \n\t" | ||
2283 | "movdqa (%2, %%"FF_REG_a"), %%xmm2 \n\t" | ||
2284 | "punpcklbw %%xmm2, %%xmm0 \n\t" | ||
2285 | "punpckhbw %%xmm2, %%xmm1 \n\t" | ||
2286 | "movntdq %%xmm0, (%0, %%"FF_REG_a", 2) \n\t" | ||
2287 | "movntdq %%xmm1, 16(%0, %%"FF_REG_a", 2) \n\t" | ||
2288 | "add $16, %%"FF_REG_a" \n\t" | ||
2289 | "cmp %3, %%"FF_REG_a" \n\t" | ||
2290 | " jb 1b \n\t" | ||
2291 | ✗ | ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) | |
2292 | : "memory", XMM_CLOBBERS("xmm0", "xmm1", "xmm2",) "%"FF_REG_a | ||
2293 | ); | ||
2294 | } else | ||
2295 | 78 | __asm__( | |
2296 | "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" | ||
2297 | "1: \n\t" | ||
2298 | PREFETCH" 64(%1, %%"FF_REG_a") \n\t" | ||
2299 | PREFETCH" 64(%2, %%"FF_REG_a") \n\t" | ||
2300 | "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" | ||
2301 | "movq 8(%1, %%"FF_REG_a"), %%mm2 \n\t" | ||
2302 | "movq %%mm0, %%mm1 \n\t" | ||
2303 | "movq %%mm2, %%mm3 \n\t" | ||
2304 | "movq (%2, %%"FF_REG_a"), %%mm4 \n\t" | ||
2305 | "movq 8(%2, %%"FF_REG_a"), %%mm5 \n\t" | ||
2306 | "punpcklbw %%mm4, %%mm0 \n\t" | ||
2307 | "punpckhbw %%mm4, %%mm1 \n\t" | ||
2308 | "punpcklbw %%mm5, %%mm2 \n\t" | ||
2309 | "punpckhbw %%mm5, %%mm3 \n\t" | ||
2310 | MOVNTQ" %%mm0, (%0, %%"FF_REG_a", 2) \n\t" | ||
2311 | MOVNTQ" %%mm1, 8(%0, %%"FF_REG_a", 2) \n\t" | ||
2312 | MOVNTQ" %%mm2, 16(%0, %%"FF_REG_a", 2) \n\t" | ||
2313 | MOVNTQ" %%mm3, 24(%0, %%"FF_REG_a", 2) \n\t" | ||
2314 | "add $16, %%"FF_REG_a" \n\t" | ||
2315 | "cmp %3, %%"FF_REG_a" \n\t" | ||
2316 | " jb 1b \n\t" | ||
2317 | 78 | ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) | |
2318 | : "memory", "%"FF_REG_a | ||
2319 | ); | ||
2320 | |||
2321 | } | ||
2322 | 2/2 ✓ Branch 0 taken 6484 times. ✓ Branch 1 taken 901 times. | 7385 | for (int w = (width & (~15)); w < width; w++) { | |
2323 | 6484 | dest[2*w+0] = src1[w]; | |
2324 | 6484 | dest[2*w+1] = src2[w]; | |
2325 | } | ||
2326 | 901 | dest += dstStride; | |
2327 | 901 | src1 += src1Stride; | |
2328 | 901 | src2 += src2Stride; | |
2329 | } | ||
2330 | 17 | __asm__( | |
2331 | EMMS" \n\t" | ||
2332 | SFENCE" \n\t" | ||
2333 | ::: "memory" | ||
2334 | ); | ||
2335 | 17 | } | |
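|||
| /* Editor's sketch (hypothetical helper): the scalar semantics of the | ||
| * punpcklbw/punpckhbw pipeline above, identical to the function's own tail | ||
| * loop: bytes of src1 and src2 alternate in dest. */ | ||
| static inline void interleave_bytes_sketch(const uint8_t *src1, const uint8_t *src2, | ||
|                                            uint8_t *dest, int width) | ||
| { | ||
|     for (int w = 0; w < width; w++) { | ||
|         dest[2 * w + 0] = src1[w]; | ||
|         dest[2 * w + 1] = src2[w]; | ||
|     } | ||
| } | ||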
2336 | |||
2337 | /* | ||
2338 | RGB15->RGB16 original by Strepto/Astral | ||
2339 | ported to gcc & bugfixed: A'rpi | ||
2340 | MMXEXT, 3DNOW optimization by Nick Kurshev | ||
2341 | 32-bit C version and the and&add trick by Michael Niedermayer | ||
2342 | */ | ||
2343 | |||
2344 | #endif /* HAVE_INLINE_ASM */ | ||
2345 | |||
2346 | void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size); | ||
2347 | void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size); | ||
2348 | void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size); | ||
2349 | void ff_shuffle_bytes_3012_ssse3(const uint8_t *src, uint8_t *dst, int src_size); | ||
2350 | void ff_shuffle_bytes_3210_ssse3(const uint8_t *src, uint8_t *dst, int src_size); | ||
2351 | void ff_shuffle_bytes_3102_ssse3(const uint8_t *src, uint8_t *dst, int src_size); | ||
2352 | void ff_shuffle_bytes_2013_ssse3(const uint8_t *src, uint8_t *dst, int src_size); | ||
2353 | void ff_shuffle_bytes_2130_ssse3(const uint8_t *src, uint8_t *dst, int src_size); | ||
2354 | void ff_shuffle_bytes_1203_ssse3(const uint8_t *src, uint8_t *dst, int src_size); | ||
2355 | |||
2356 | #if ARCH_X86_64 | ||
2357 | void ff_shuffle_bytes_2103_avx2(const uint8_t *src, uint8_t *dst, int src_size); | ||
2358 | void ff_shuffle_bytes_0321_avx2(const uint8_t *src, uint8_t *dst, int src_size); | ||
2359 | void ff_shuffle_bytes_1230_avx2(const uint8_t *src, uint8_t *dst, int src_size); | ||
2360 | void ff_shuffle_bytes_3012_avx2(const uint8_t *src, uint8_t *dst, int src_size); | ||
2361 | void ff_shuffle_bytes_3210_avx2(const uint8_t *src, uint8_t *dst, int src_size); | ||
2362 | void ff_shuffle_bytes_3102_avx2(const uint8_t *src, uint8_t *dst, int src_size); | ||
2363 | void ff_shuffle_bytes_2013_avx2(const uint8_t *src, uint8_t *dst, int src_size); | ||
2364 | void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int src_size); | ||
2365 | void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int src_size); | ||
2366 | |||
2367 | void ff_shuffle_bytes_2103_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); | ||
2368 | void ff_shuffle_bytes_0321_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); | ||
2369 | void ff_shuffle_bytes_1230_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); | ||
2370 | void ff_shuffle_bytes_3012_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); | ||
2371 | void ff_shuffle_bytes_3210_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); | ||
2372 | void ff_shuffle_bytes_3102_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); | ||
2373 | void ff_shuffle_bytes_2013_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); | ||
2374 | void ff_shuffle_bytes_2130_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); | ||
2375 | void ff_shuffle_bytes_1203_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); | ||
2376 | |||
2377 | void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | ||
2378 | const uint8_t *src, int width, int height, | ||
2379 | int lumStride, int chromStride, int srcStride); | ||
2380 | void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | ||
2381 | const uint8_t *src, int width, int height, | ||
2382 | int lumStride, int chromStride, int srcStride); | ||
2383 | void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | ||
2384 | const uint8_t *src, int width, int height, | ||
2385 | int lumStride, int chromStride, int srcStride); | ||
2386 | void ff_uyvytoyuv422_avx512icl(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | ||
2387 | const uint8_t *src, int width, int height, | ||
2388 | int lumStride, int chromStride, int srcStride); | ||
2389 | #endif | ||
2390 | |||
2391 | #define DEINTERLEAVE_BYTES(cpuext) \ | ||
2392 | void ff_nv12ToUV_ ## cpuext(uint8_t *dstU, uint8_t *dstV, \ | ||
2393 | const uint8_t *unused, \ | ||
2394 | const uint8_t *src1, \ | ||
2395 | const uint8_t *src2, \ | ||
2396 | int w, \ | ||
2397 | uint32_t *unused2, \ | ||
2398 | void *opq); \ | ||
2399 | static void deinterleave_bytes_ ## cpuext(const uint8_t *src, uint8_t *dst1, uint8_t *dst2, \ | ||
2400 | int width, int height, int srcStride, \ | ||
2401 | int dst1Stride, int dst2Stride) \ | ||
2402 | { \ | ||
2403 | for (int h = 0; h < height; h++) { \ | ||
2404 | if (width >= 16) \ | ||
2405 | ff_nv12ToUV_ ## cpuext(dst1, dst2, NULL, src, NULL, width - 15, NULL, NULL); \ | ||
2406 | for (int w = (width & (~15)); w < width; w++) { \ | ||
2407 | dst1[w] = src[2*w+0]; \ | ||
2408 | dst2[w] = src[2*w+1]; \ | ||
2409 | } \ | ||
2410 | src += srcStride; \ | ||
2411 | dst1 += dst1Stride; \ | ||
2412 | dst2 += dst2Stride; \ | ||
2413 | } \ | ||
2414 | } | ||
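|||
| /* Editor's note: DEINTERLEAVE_BYTES(cpuext) declares the external asm kernel | ||
| * ff_nv12ToUV_<cpuext> and defines deinterleave_bytes_<cpuext>(), letting the | ||
| * kernel cover the bulk of each row while the scalar tail loop finishes the | ||
| * last width & 15 bytes. */ | ||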
2415 | |||
2416 | #if HAVE_SSE2_EXTERNAL | ||
2417 | 6/6 ✓ Branch 0 taken 188 times. ✓ Branch 1 taken 2090 times. ✓ Branch 3 taken 17926 times. ✓ Branch 4 taken 2278 times. ✓ Branch 5 taken 2278 times. ✓ Branch 6 taken 34 times. | 20238 | DEINTERLEAVE_BYTES(sse2) | |
2418 | #endif | ||
2419 | #if HAVE_AVX_EXTERNAL | ||
2420 | 6/6 ✓ Branch 0 taken 66 times. ✓ Branch 1 taken 1096 times. ✓ Branch 3 taken 9114 times. ✓ Branch 4 taken 1162 times. ✓ Branch 5 taken 1162 times. ✓ Branch 6 taken 17 times. | 10293 | DEINTERLEAVE_BYTES(avx) | |
2421 | #endif | ||
2422 | |||
2423 | 4538 | av_cold void rgb2rgb_init_x86(void) | |
2424 | { | ||
2425 | 4538 | int cpu_flags = av_get_cpu_flags(); | |
2426 | |||
2427 | #if HAVE_INLINE_ASM | ||
2428 | 2/2 ✓ Branch 0 taken 16 times. ✓ Branch 1 taken 4522 times. | 4538 | if (INLINE_MMXEXT(cpu_flags)) | |
2429 | 16 | rgb2rgb_init_mmxext(); | |
2430 | 2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 4524 times. | 4538 | if (INLINE_SSE2(cpu_flags)) | |
2431 | 14 | interleaveBytes = interleave_bytes_sse2; | |
2432 | #endif /* HAVE_INLINE_ASM */ | ||
2433 | |||
2434 | #if HAVE_SSE2_EXTERNAL | ||
2435 | 2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 4524 times. | 4538 | if (EXTERNAL_SSE2(cpu_flags)) { | |
2436 | #if ARCH_X86_64 | ||
2437 | 14 | uyvytoyuv422 = ff_uyvytoyuv422_sse2; | |
2438 | #endif | ||
2439 | 14 | deinterleaveBytes = deinterleave_bytes_sse2; | |
2440 | } | ||
2441 | #endif | ||
2442 | 2/2 ✓ Branch 0 taken 12 times. ✓ Branch 1 taken 4526 times. | 4538 | if (EXTERNAL_SSSE3(cpu_flags)) { | |
2443 | 12 | shuffle_bytes_0321 = ff_shuffle_bytes_0321_ssse3; | |
2444 | 12 | shuffle_bytes_2103 = ff_shuffle_bytes_2103_ssse3; | |
2445 | 12 | shuffle_bytes_1230 = ff_shuffle_bytes_1230_ssse3; | |
2446 | 12 | shuffle_bytes_3012 = ff_shuffle_bytes_3012_ssse3; | |
2447 | 12 | shuffle_bytes_3210 = ff_shuffle_bytes_3210_ssse3; | |
2448 | 12 | shuffle_bytes_3102 = ff_shuffle_bytes_3102_ssse3; | |
2449 | 12 | shuffle_bytes_2013 = ff_shuffle_bytes_2013_ssse3; | |
2450 | 12 | shuffle_bytes_2130 = ff_shuffle_bytes_2130_ssse3; | |
2451 | 12 | shuffle_bytes_1203 = ff_shuffle_bytes_1203_ssse3; | |
2452 | } | ||
2453 | #if HAVE_AVX_EXTERNAL | ||
2454 | 2/2 ✓ Branch 0 taken 8 times. ✓ Branch 1 taken 4530 times. | 4538 | if (EXTERNAL_AVX(cpu_flags)) { | |
2455 | 8 | deinterleaveBytes = deinterleave_bytes_avx; | |
2456 | #if ARCH_X86_64 | ||
2457 | 8 | uyvytoyuv422 = ff_uyvytoyuv422_avx; | |
2458 | } | ||
2459 | 3/4 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 4532 times. ✓ Branch 2 taken 6 times. ✗ Branch 3 not taken. | 4538 | if (EXTERNAL_AVX2_FAST(cpu_flags)) { | |
2460 | 6 | shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx2; | |
2461 | 6 | shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx2; | |
2462 | 6 | shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx2; | |
2463 | 6 | shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx2; | |
2464 | 6 | shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx2; | |
2465 | 6 | shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx2; | |
2466 | 6 | shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx2; | |
2467 | 6 | shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2; | |
2468 | 6 | shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2; | |
2469 | } | ||
2470 | 1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 4538 times. | 4538 | if (EXTERNAL_AVX512ICL(cpu_flags)) { | |
2471 | ✗ | shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx512icl; | |
2472 | ✗ | shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512icl; | |
2473 | ✗ | shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx512icl; | |
2474 | ✗ | shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512icl; | |
2475 | ✗ | shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx512icl; | |
2476 | ✗ | shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx512icl; | |
2477 | ✗ | shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx512icl; | |
2478 | ✗ | shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx512icl; | |
2479 | ✗ | shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx512icl; | |
2480 | } | ||
2481 | 3/4 ✓ Branch 0 taken 6 times. ✓ Branch 1 taken 4532 times. ✓ Branch 2 taken 6 times. ✗ Branch 3 not taken. | 4538 | if (EXTERNAL_AVX2_FAST(cpu_flags)) { | |
2482 | 6 | uyvytoyuv422 = ff_uyvytoyuv422_avx2; | |
2483 | } | ||
2484 | 1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 4538 times. | 4538 | if (EXTERNAL_AVX512ICL(cpu_flags)) { | |
2485 | ✗ | uyvytoyuv422 = ff_uyvytoyuv422_avx512icl; | |
2486 | #endif | ||
2487 | } | ||
2488 | #endif | ||
2489 | 4538 | } | |
2490 |