Line | Branch | Exec | Source |
---|---|---|---|
1 | /* | ||
2 | * software RGB to RGB converter | ||
3 | * pluralize by software PAL8 to RGB converter | ||
4 | * software YUV to YUV converter | ||
5 | * software YUV to RGB converter | ||
6 | * Written by Nick Kurshev. | ||
7 | * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) | ||
8 | * lot of big-endian byte order fixes by Alex Beregszaszi | ||
9 | * | ||
10 | * This file is part of FFmpeg. | ||
11 | * | ||
12 | * FFmpeg is free software; you can redistribute it and/or | ||
13 | * modify it under the terms of the GNU Lesser General Public | ||
14 | * License as published by the Free Software Foundation; either | ||
15 | * version 2.1 of the License, or (at your option) any later version. | ||
16 | * | ||
17 | * FFmpeg is distributed in the hope that it will be useful, | ||
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
20 | * Lesser General Public License for more details. | ||
21 | * | ||
22 | * You should have received a copy of the GNU Lesser General Public | ||
23 | * License along with FFmpeg; if not, write to the Free Software | ||
24 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
25 | */ | ||
26 | |||
27 | #include <stddef.h> | ||
28 | #include <stdint.h> | ||
29 | |||
30 | #include "libavutil/attributes.h" | ||
31 | #include "libavutil/x86/asm.h" | ||
32 | |||
33 | #undef PREFETCH | ||
34 | #undef MOVNTQ | ||
35 | #undef EMMS | ||
36 | #undef SFENCE | ||
37 | #undef PAVGB | ||
38 | |||
39 | #define PREFETCH "prefetchnta" | ||
40 | #define PAVGB "pavgb" | ||
41 | #define MOVNTQ "movntq" | ||
42 | #define SFENCE "sfence" | ||
43 | |||
44 | #define EMMS "emms" | ||
45 | |||
46 | #if !COMPILE_TEMPLATE_SSE2 | ||
47 | |||
/*
 * Convert packed 24-bit pixels to 32-bit pixels, inserting an opaque alpha
 * byte per pixel (OR of mask32a in the MMX path, literal 255 in the scalar
 * tail).  src_size is the source size in bytes.
 */
static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* Last start offset at which a full 24-byte (8-pixel) iteration fits. */
    mm_end = end - 23;
    /* mm7 = alpha mask, ORed into every output pixel. */
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end) {
        __asm__ volatile(
            /* Gather eight 3-byte pixels into four quadwords, widen each
             * to 4 bytes, set alpha, and stream 32 bytes out. */
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "punpckldq 3(%1), %%mm0 \n\t"
            "movd 6(%1), %%mm1 \n\t"
            "punpckldq 9(%1), %%mm1 \n\t"
            "movd 12(%1), %%mm2 \n\t"
            "punpckldq 15(%1), %%mm2 \n\t"
            "movd 18(%1), %%mm3 \n\t"
            "punpckldq 21(%1), %%mm3 \n\t"
            "por %%mm7, %%mm0 \n\t"
            "por %%mm7, %%mm1 \n\t"
            "por %%mm7, %%mm2 \n\t"
            "por %%mm7, %%mm3 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm1, 8(%0) \n\t"
            MOVNTQ" %%mm2, 16(%0) \n\t"
            MOVNTQ" %%mm3, 24(%0)"
            :: "r"(dest), "r"(s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* Scalar tail: copy remaining pixels one byte triplet at a time. */
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
    }
}
91 | |||
/*
 * Repack four quadwords of 32-bit pixels (mm0/mm1/mm4/mm5, with duplicate
 * copies in mm2/mm3/mm6/mm7 set up by the caller) into 24-bit pixels and
 * stream the resulting 24 bytes to (%0).
 * NOTE(review): clobbers mm0-mm7 and requires mask24l/mask24h constants
 * to be reachable via MANGLE() — confirm at each use site.
 */
#define STORE_BGR24_MMX \
    "psrlq $8, %%mm2 \n\t" \
    "psrlq $8, %%mm3 \n\t" \
    "psrlq $8, %%mm6 \n\t" \
    "psrlq $8, %%mm7 \n\t" \
    "pand "MANGLE(mask24l)", %%mm0\n\t" \
    "pand "MANGLE(mask24l)", %%mm1\n\t" \
    "pand "MANGLE(mask24l)", %%mm4\n\t" \
    "pand "MANGLE(mask24l)", %%mm5\n\t" \
    "pand "MANGLE(mask24h)", %%mm2\n\t" \
    "pand "MANGLE(mask24h)", %%mm3\n\t" \
    "pand "MANGLE(mask24h)", %%mm6\n\t" \
    "pand "MANGLE(mask24h)", %%mm7\n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "por %%mm6, %%mm4 \n\t" \
    "por %%mm7, %%mm5 \n\t" \
    \
    "movq %%mm1, %%mm2 \n\t" \
    "movq %%mm4, %%mm3 \n\t" \
    "psllq $48, %%mm2 \n\t" \
    "psllq $32, %%mm3 \n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "psrlq $16, %%mm1 \n\t" \
    "psrlq $32, %%mm4 \n\t" \
    "psllq $16, %%mm5 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "por %%mm5, %%mm4 \n\t" \
    \
    MOVNTQ" %%mm0, (%0) \n\t" \
    MOVNTQ" %%mm1, 8(%0) \n\t" \
    MOVNTQ" %%mm4, 16(%0)"
124 | |||
125 | |||
/*
 * Convert packed 32-bit pixels to 24-bit pixels by dropping the fourth
 * (alpha) byte of every pixel.  src_size is the source size in bytes.
 */
static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* Last start offset at which a full 32-byte (8-pixel) iteration fits. */
    mm_end = end - 31;
    while (s < mm_end) {
        __asm__ volatile(
            /* Load 8 source pixels, duplicate them for STORE_BGR24_MMX,
             * which packs and streams 24 output bytes. */
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 16(%1), %%mm4 \n\t"
            "movq 24(%1), %%mm5 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            STORE_BGR24_MMX
            :: "r"(dest), "r"(s)
              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* Scalar tail: copy 3 bytes, skip the 4th. */
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
    }
}
162 | |||
163 | /* | ||
164 | original by Strepto/Astral | ||
165 | ported to gcc & bugfixed: A'rpi | ||
166 | MMXEXT, 3DNOW optimization by Nick Kurshev | ||
167 | 32-bit C version, and and&add trick by Michael Niedermayer | ||
168 | */ | ||
/*
 * Convert 15-bit (1:5:5:5) pixels to 16-bit (5:6:5) pixels using the
 * and&add trick: (x & 0x7FFF) + (x & 0x7FE0) shifts the red and green
 * fields up one bit while leaving blue in place.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    /* mm4 = mask15s, the "and" half of the and&add trick. */
    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;  /* full 16-byte MMX iterations only */
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "pand %%mm4, %%mm0 \n\t"
            "pand %%mm4, %%mm2 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm2, 8(%0)"
            :: "r"(d), "r"(s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* 32-bit C tail: two pixels at a time. */
    mm_end = end - 3;
    while (s < mm_end) {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    /* At most one 16-bit pixel can remain. */
    if (s < end) {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
211 | |||
/*
 * Convert 16-bit (5:6:5) pixels to 15-bit (1:5:5:5) pixels: red and green
 * are shifted down one bit (dropping green's LSB), blue is kept as-is.
 */
static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    /* mm7 = mask for the shifted red/green fields, mm6 = blue mask. */
    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;  /* full 16-byte MMX iterations only */
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $1, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm3 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm2, 8(%0)"
            :: "r"(d), "r"(s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* 32-bit C tail: two pixels at a time. */
    mm_end = end - 3;
    while (s < mm_end) {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    /* At most one 16-bit pixel can remain. */
    if (s < end) {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}
259 | |||
/*
 * Convert 32-bit pixels to 16-bit (5:6:5) pixels.  The MMX loop isolates
 * the colour fields with mask3216g/mask3216br and uses PMADDWD with
 * mul3216 to shift/combine blue and red in one multiply-add.
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;  /* full 16-byte (4-pixel) iterations only */
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $5, %%mm0 \n\t"
        "pslld $11, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* Scalar tail: pack one 32-bit pixel to 5:6:5 at a time. */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}
309 | |||
/*
 * Convert 32-bit pixels to 16-bit (5:6:5) pixels with red and blue
 * swapped relative to rgb32to16 (shift/mask amounts differ accordingly).
 */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the red/green destination field masks for the loop. */
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;  /* full 16-byte (4-pixel) iterations only */
    while (s < mm_end) {
        __asm__ volatile(
            /* Extract each field into its 5:6:5 position, then merge the
             * two pixel pairs and stream one quadword (4 pixels) out. */
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 4(%1), %%mm3 \n\t"
            "punpckldq 8(%1), %%mm0 \n\t"
            "punpckldq 12(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* Scalar tail: one pixel at a time. */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}
364 | |||
/*
 * Convert 32-bit pixels to 15-bit (1:5:5:5) pixels.  Same PMADDWD scheme
 * as rgb32to16, but with the 15-bit constants (mask3215g/mul3215) and
 * shift counts adjusted for 5-bit green.
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;  /* full 16-byte (4-pixel) iterations only */
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $6, %%mm0 \n\t"
        "pslld $10, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* Scalar tail: pack one 32-bit pixel to 1:5:5:5 at a time. */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}
414 | |||
/*
 * Convert 32-bit pixels to 15-bit (1:5:5:5) pixels with red and blue
 * swapped relative to rgb32to15.
 */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the red/green destination field masks for the loop. */
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;  /* full 16-byte (4-pixel) iterations only */
    while (s < mm_end) {
        __asm__ volatile(
            /* Shift each field to its 1:5:5:5 position, mask, then merge
             * the two pixel pairs and stream one quadword out. */
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 4(%1), %%mm3 \n\t"
            "punpckldq 8(%1), %%mm0 \n\t"
            "punpckldq 12(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* Scalar tail: one pixel at a time. */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}
469 | |||
/*
 * Convert 24-bit pixels to 16-bit (5:6:5) pixels, swapping the red/blue
 * channel order.  The MMX loop handles 4 pixels (12 source bytes) per
 * iteration; the scalar tail packs one byte triplet at a time.
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the red/green destination field masks for the loop. */
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;  /* full 12-byte (4-pixel) iterations only */
    while (s < mm_end) {
        __asm__ volatile(
            /* movd+punpckldq at 3-byte strides gathers the packed
             * 24-bit pixels into 32-bit lanes before masking. */
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* Scalar tail: first source byte goes to the low 5 bits. */
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
526 | |||
/*
 * Convert 24-bit pixels to 16-bit (5:6:5) pixels, preserving the channel
 * order (last source byte of each triplet lands in the low 5 bits).
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the red/green destination field masks for the loop. */
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    /* NOTE(review): loop consumes 12 bytes per iteration but reads up to
     * 16 via punpckldq 9(%1); guard is end-15 accordingly. */
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* Scalar tail: first source byte goes to the high 5 bits. */
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
583 | |||
/*
 * Convert 24-bit pixels to 15-bit (1:5:5:5) pixels, swapping the
 * red/blue channel order.  4 pixels (12 source bytes) per MMX iteration.
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the red/green destination field masks for the loop. */
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;  /* full 12-byte (4-pixel) iterations only */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* Scalar tail: first source byte goes to the low 5 bits. */
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
640 | |||
/*
 * Convert 24-bit pixels to 15-bit (1:5:5:5) pixels, preserving the
 * channel order (last source byte of each triplet lands in the low 5 bits).
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the red/green destination field masks for the loop. */
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    /* NOTE(review): loop consumes 12 bytes per iteration but reads up to
     * 16 via punpckldq 9(%1); guard is end-15 accordingly. */
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* Scalar tail: first source byte goes to the high 5 bits. */
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
697 | |||
/*
 * Convert 15-bit (1:5:5:5) pixels to 24-bit pixels.  The first asm block
 * expands 8 source pixels to 32-bit form (using pmulhw with mul15_mid /
 * mul15_hi to replicate the high bits into the low bits of each 8-bit
 * channel) and deliberately leaves its results in mm0/mm3/mm6/mm7; the
 * second asm block then reuses STORE_BGR24_MMX ("borrowed 32 to 24") to
 * pack them down to 24 bytes.  MMX register state is carried BETWEEN the
 * two asm statements — do not reorder or separate them.
 */
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;  /* full 8-pixel iterations only */
    while (s < mm_end) {
        __asm__ volatile(
            /* First 4 pixels: split into B/G/R planes, scale each 5-bit
             * field to 8 bits, interleave into 32-bit lanes. */
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            /* Park the first 4 expanded pixels in mm6/mm7. */
            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            /* Second 4 pixels: same expansion into mm0/mm3. */
            "movq 8(%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            :"=m"(*d)
            :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
             NAMED_CONSTRAINTS_ADD(mul15_mid,mul15_hi)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            /* Rearrange the carried-over registers into the layout
             * STORE_BGR24_MMX expects, then pack and stream 24 bytes. */
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :: "r"(d), "m"(*s)
              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* Scalar tail: expand each 5-bit field to 8 bits by bit replication. */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
    }
}
803 | |||
/*
 * Convert 16-bit (5:6:5) pixels to 24-bit pixels.  Same structure as
 * rgb15tobgr24, but with the 5:6:5 masks and mul16_mid for the 6-bit
 * green field (red is pre-shifted right by 1 so mul15_hi still applies).
 * As in rgb15tobgr24, MMX register state (mm0/mm3/mm6/mm7) is carried
 * between the two asm statements — do not reorder or separate them.
 */
static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;  /* full 8-pixel iterations only */
    while (s < mm_end) {
        __asm__ volatile(
            /* First 4 pixels: split into B/G/R planes, scale each field
             * to 8 bits, interleave into 32-bit lanes. */
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
            "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            /* Park the first 4 expanded pixels in mm6/mm7. */
            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            /* Second 4 pixels: same expansion into mm0/mm3. */
            "movq 8(%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
            "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            :"=m"(*d)
            :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
             NAMED_CONSTRAINTS_ADD(mul15_mid,mul16_mid,mul15_hi)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            /* Rearrange the carried-over registers into the layout
             * STORE_BGR24_MMX expects, then pack and stream 24 bytes. */
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :: "r"(d), "m"(*s)
              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* Scalar tail: expand 5/6/5-bit fields to 8 bits by bit replication. */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
    }
}
910 | |||
911 | /* | ||
912 | * mm0 = 00 B3 00 B2 00 B1 00 B0 | ||
913 | * mm1 = 00 G3 00 G2 00 G1 00 G0 | ||
914 | * mm2 = 00 R3 00 R2 00 R1 00 R0 | ||
915 | * mm6 = FF FF FF FF FF FF FF FF | ||
916 | * mm7 = 00 00 00 00 00 00 00 00 | ||
917 | */ | ||
918 | #define PACK_RGB32 \ | ||
919 | "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \ | ||
920 | "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \ | ||
921 | "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \ | ||
922 | "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \ | ||
923 | "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \ | ||
924 | "movq %%mm0, %%mm3 \n\t" \ | ||
925 | "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \ | ||
926 | "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \ | ||
927 | MOVNTQ" %%mm0, (%0) \n\t" \ | ||
928 | MOVNTQ" %%mm3, 8(%0) \n\t" \ | ||
929 | |||
/**
 * Convert 15 bpp packed pixels (5:5:5 in a uint16_t each) to 32-bit
 * pixels, widening each 5-bit channel to 8 bits (via the mul15_* fixed
 * point multipliers) and writing 255 as the fourth byte of every pixel.
 * src_size is the input size in bytes.  The MMX loop converts 4 pixels
 * per iteration; the scalar tail finishes the remainder.
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); // mm7 = 0, as required by PACK_RGB32
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); // mm6 = all ones -> FF alpha bytes
    mm_end = end - 3; // leave at least 4 pixels for the last MMX iteration
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "pmulhw %5, %%mm0 \n\t"
            "pmulhw %5, %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            PACK_RGB32
            ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid)
              NAMED_CONSTRAINTS_ADD(mul15_hi)
            :"memory");
        d += 16; // 4 output pixels * 4 bytes
        s += 4;  // 4 input pixels * 2 bytes
    }
    __asm__ volatile(SFENCE:::"memory"); // order the non-temporal stores
    __asm__ volatile(EMMS:::"memory");   // leave MMX state
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        // replicate the top bits of each 5-bit channel into the low bits
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
        *d++ = 255;
    }
}
972 | |||
/**
 * Convert 16 bpp packed pixels (5:6:5 in a uint16_t each) to 32-bit
 * pixels, widening the 5/6-bit channels to 8 bits (mul15_mid/mul16_mid/
 * mul15_hi fixed-point multipliers) and writing 255 as the fourth byte
 * of every pixel.  src_size is the input size in bytes.  The MMX loop
 * converts 4 pixels per iteration; the scalar tail finishes the rest.
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); // mm7 = 0, as required by PACK_RGB32
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); // mm6 = all ones -> FF alpha bytes
    mm_end = end - 3; // leave at least 4 pixels for the last MMX iteration
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pmulhw %5, %%mm0 \n\t"
            "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            PACK_RGB32
            ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
              NAMED_CONSTRAINTS_ADD(mul16_mid,mul15_hi)
            :"memory");
        d += 16; // 4 output pixels * 4 bytes
        s += 4;  // 4 input pixels * 2 bytes
    }
    __asm__ volatile(SFENCE:::"memory"); // order the non-temporal stores
    __asm__ volatile(EMMS:::"memory");   // leave MMX state
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        // replicate the top bits of each channel into the low bits
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
        *d++ = 255;
    }
}
1016 | |||
/**
 * Swap the first and third byte of every packed 24-bit pixel
 * (RGB24 <-> BGR24; the operation is its own inverse).
 * src_size is the buffer size in bytes.
 * The asm loop shuffles 24 bytes (8 pixels) per iteration using a
 * negative byte index that counts up towards zero; the scalar tail
 * swaps whatever pixels remain.
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    unsigned i;
    // negative iteration counter: starts at 23-src_size and the loop
    // adds 24 per iteration while it is still negative ("js 1b")
    x86_reg mmx_size= 23 - src_size;
    __asm__ volatile (
        "test %%"FF_REG_a", %%"FF_REG_a" \n\t"
        "jns 2f \n\t"
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %%"FF_REG_a") \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%"FF_REG_a"), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%"FF_REG_a"), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%"FF_REG_a"), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1,(%2, %%"FF_REG_a") \n\t" // RGB RGB RG
        "movq 8(%1, %%"FF_REG_a"), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%"FF_REG_a"), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%"FF_REG_a"), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%"FF_REG_a")\n\t" // B RGB RGB R
        "movq 16(%1, %%"FF_REG_a"), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%"FF_REG_a"), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%"FF_REG_a") \n\t"
        "add $24, %%"FF_REG_a" \n\t"
        " js 1b \n\t"
        "2: \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
          NAMED_CONSTRAINTS_ADD(mask24r,mask24g,mask24b)
    );

    __asm__ volatile(SFENCE:::"memory"); // order the non-temporal stores
    __asm__ volatile(EMMS:::"memory");   // leave MMX state

    if (mmx_size==23) return; //finished, was multiple of 8

    // rewind to the last (partial) group and finish it in C
    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
    for (i=0; i<src_size; i+=3) {
        register uint8_t x;
        x = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
1084 | |||
/**
 * Interleave planar Y/U/V into packed YUY2 (byte order Y0 U0 Y1 V0 ...),
 * 16 luma pixels (32 output bytes) per inner-loop iteration.
 * vertLumPerChroma is the number of luma lines sharing one chroma line
 * (callers pass 2 for 4:2:0 and 1 for 4:2:2; the mask test below relies
 * on it being a power of two).
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           int width, int height,
                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%"FF_REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"FF_REG_a") \n\t"
            PREFETCH" 32(%3, %%"FF_REG_a") \n\t"
            "movq (%2, %%"FF_REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"FF_REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"FF_REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"FF_REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm3, (%0, %%"FF_REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"FF_REG_a", 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%"FF_REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"FF_REG_a", 4) \n\t"

            "add $8, %%"FF_REG_a" \n\t"
            "cmp %4, %%"FF_REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"FF_REG_a
        );
        // advance chroma only after the last luma line of each chroma group
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
}
1137 | |||
1138 | /** | ||
1139 | * Height should be a multiple of 2 and width should be a multiple of 16. | ||
1140 | * (If this is a problem for anyone then tell me, and I will fix it.) | ||
1141 | */ | ||
1142 | ✗ | static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
1143 | int width, int height, | ||
1144 | int lumStride, int chromStride, int dstStride) | ||
1145 | { | ||
1146 | //FIXME interpolate chroma | ||
1147 | ✗ | RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); | |
1148 | ✗ | } | |
1149 | |||
/**
 * Interleave planar Y/U/V into packed UYVY (byte order U0 Y0 V0 Y1 ...),
 * 16 luma pixels (32 output bytes) per inner-loop iteration.
 * Same structure as yuvPlanartoyuy2 but with the punpck operand order
 * reversed so chroma lands in the even bytes.  vertLumPerChroma is the
 * number of luma lines sharing one chroma line (callers pass 2 for
 * 4:2:0 and 1 for 4:2:2; the mask test relies on a power of two).
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           int width, int height,
                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%"FF_REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"FF_REG_a") \n\t"
            PREFETCH" 32(%3, %%"FF_REG_a") \n\t"
            "movq (%2, %%"FF_REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"FF_REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"FF_REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"FF_REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm0, %%mm4 \n\t" // Y(0)
            "movq %%mm2, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm0, (%0, %%"FF_REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"FF_REG_a", 4) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"FF_REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"FF_REG_a", 4) \n\t"

            "add $8, %%"FF_REG_a" \n\t"
            "cmp %4, %%"FF_REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"FF_REG_a
        );
        // advance chroma only after the last luma line of each chroma group
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
}
1202 | |||
1203 | /** | ||
1204 | * Height should be a multiple of 2 and width should be a multiple of 16 | ||
1205 | * (If this is a problem for anyone then tell me, and I will fix it.) | ||
1206 | */ | ||
1207 | ✗ | static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
1208 | int width, int height, | ||
1209 | int lumStride, int chromStride, int dstStride) | ||
1210 | { | ||
1211 | //FIXME interpolate chroma | ||
1212 | ✗ | RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); | |
1213 | ✗ | } | |
1214 | |||
1215 | /** | ||
1216 | * Width should be a multiple of 16. | ||
1217 | */ | ||
1218 | ✗ | static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
1219 | int width, int height, | ||
1220 | int lumStride, int chromStride, int dstStride) | ||
1221 | { | ||
1222 | ✗ | RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1); | |
1223 | ✗ | } | |
1224 | |||
1225 | /** | ||
1226 | * Width should be a multiple of 16. | ||
1227 | */ | ||
1228 | ✗ | static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | |
1229 | int width, int height, | ||
1230 | int lumStride, int chromStride, int dstStride) | ||
1231 | { | ||
1232 | ✗ | RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1); | |
1233 | ✗ | } | |
1234 | |||
1235 | /** | ||
1236 | * Height should be a multiple of 2 and width should be a multiple of 16. | ||
1237 | * (If this is a problem for anyone then tell me, and I will fix it.) | ||
1238 | */ | ||
1239 | ✗ | static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | |
1240 | int width, int height, | ||
1241 | int lumStride, int chromStride, int srcStride) | ||
1242 | { | ||
1243 | int y; | ||
1244 | ✗ | const x86_reg chromWidth= width>>1; | |
1245 | ✗ | for (y=0; y<height; y+=2) { | |
1246 | ✗ | __asm__ volatile( | |
1247 | "xor %%"FF_REG_a", %%"FF_REG_a"\n\t" | ||
1248 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
1249 | "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... | ||
1250 | ".p2align 4 \n\t" | ||
1251 | "1: \n\t" | ||
1252 | PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t" | ||
1253 | "movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) | ||
1254 | "movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) | ||
1255 | "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0) | ||
1256 | "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4) | ||
1257 | "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0) | ||
1258 | "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4) | ||
1259 | "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0) | ||
1260 | "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4) | ||
1261 | "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | ||
1262 | "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) | ||
1263 | |||
1264 | MOVNTQ" %%mm2, (%1, %%"FF_REG_a", 2) \n\t" | ||
1265 | |||
1266 | "movq 16(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8) | ||
1267 | "movq 24(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12) | ||
1268 | "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8) | ||
1269 | "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12) | ||
1270 | "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8) | ||
1271 | "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12) | ||
1272 | "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8) | ||
1273 | "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12) | ||
1274 | "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) | ||
1275 | "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) | ||
1276 | |||
1277 | MOVNTQ" %%mm3, 8(%1, %%"FF_REG_a", 2) \n\t" | ||
1278 | |||
1279 | "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) | ||
1280 | "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) | ||
1281 | "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) | ||
1282 | "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) | ||
1283 | "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) | ||
1284 | "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) | ||
1285 | "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) | ||
1286 | "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) | ||
1287 | |||
1288 | MOVNTQ" %%mm0, (%3, %%"FF_REG_a") \n\t" | ||
1289 | MOVNTQ" %%mm2, (%2, %%"FF_REG_a") \n\t" | ||
1290 | |||
1291 | "add $8, %%"FF_REG_a" \n\t" | ||
1292 | "cmp %4, %%"FF_REG_a" \n\t" | ||
1293 | " jb 1b \n\t" | ||
1294 | ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | ||
1295 | : "memory", "%"FF_REG_a | ||
1296 | ); | ||
1297 | |||
1298 | ✗ | ydst += lumStride; | |
1299 | ✗ | src += srcStride; | |
1300 | |||
1301 | ✗ | __asm__ volatile( | |
1302 | "xor %%"FF_REG_a", %%"FF_REG_a"\n\t" | ||
1303 | ".p2align 4 \n\t" | ||
1304 | "1: \n\t" | ||
1305 | PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t" | ||
1306 | "movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) | ||
1307 | "movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) | ||
1308 | "movq 16(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8) | ||
1309 | "movq 24(%0, %%"FF_REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12) | ||
1310 | "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0) | ||
1311 | "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4) | ||
1312 | "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8) | ||
1313 | "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12) | ||
1314 | "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) | ||
1315 | "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) | ||
1316 | |||
1317 | MOVNTQ" %%mm0, (%1, %%"FF_REG_a", 2) \n\t" | ||
1318 | MOVNTQ" %%mm2, 8(%1, %%"FF_REG_a", 2) \n\t" | ||
1319 | |||
1320 | "add $8, %%"FF_REG_a"\n\t" | ||
1321 | "cmp %4, %%"FF_REG_a"\n\t" | ||
1322 | " jb 1b \n\t" | ||
1323 | |||
1324 | ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | ||
1325 | : "memory", "%"FF_REG_a | ||
1326 | ); | ||
1327 | ✗ | udst += chromStride; | |
1328 | ✗ | vdst += chromStride; | |
1329 | ✗ | ydst += lumStride; | |
1330 | ✗ | src += srcStride; | |
1331 | } | ||
1332 | ✗ | __asm__ volatile(EMMS" \n\t" | |
1333 | SFENCE" \n\t" | ||
1334 | :::"memory"); | ||
1335 | ✗ | } | |
1336 | |||
/**
 * Upscale one 8-bit plane by a factor of 2 in both directions.
 * Each interior output sample is a 3:1 weighted average of its two
 * nearest source samples ((3*a + b) >> 2); the first/last rows and
 * columns are replicated.  The MMX path handles widths >= 16 via PAVGB
 * (two averages approximate the 3:1 weighting); the scalar path covers
 * the remaining columns and narrow planes.
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
    int x,y;

    dst[0]= src[0];

    // first line
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    dst+= dstStride;

    for (y=1; y<srcHeight; y++) {
        x86_reg mmxSize= srcWidth&~15; // columns handled by the MMX loop (multiple of 16)

        if (mmxSize) {
            __asm__ volatile(
                "mov %4, %%"FF_REG_a" \n\t"
                "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
                "movq (%0, %%"FF_REG_a"), %%mm4 \n\t"
                "movq %%mm4, %%mm2 \n\t"
                "psllq $8, %%mm4 \n\t"
                "pand %%mm0, %%mm2 \n\t"
                "por %%mm2, %%mm4 \n\t"
                "movq (%1, %%"FF_REG_a"), %%mm5 \n\t"
                "movq %%mm5, %%mm3 \n\t"
                "psllq $8, %%mm5 \n\t"
                "pand %%mm0, %%mm3 \n\t"
                "por %%mm3, %%mm5 \n\t"
                "1: \n\t"
                "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
                "movq (%1, %%"FF_REG_a"), %%mm1 \n\t"
                "movq 1(%0, %%"FF_REG_a"), %%mm2 \n\t"
                "movq 1(%1, %%"FF_REG_a"), %%mm3 \n\t"
                PAVGB" %%mm0, %%mm5 \n\t"
                PAVGB" %%mm0, %%mm3 \n\t"
                PAVGB" %%mm0, %%mm5 \n\t"
                PAVGB" %%mm0, %%mm3 \n\t"
                PAVGB" %%mm1, %%mm4 \n\t"
                PAVGB" %%mm1, %%mm2 \n\t"
                PAVGB" %%mm1, %%mm4 \n\t"
                PAVGB" %%mm1, %%mm2 \n\t"
                "movq %%mm5, %%mm7 \n\t"
                "movq %%mm4, %%mm6 \n\t"
                "punpcklbw %%mm3, %%mm5 \n\t"
                "punpckhbw %%mm3, %%mm7 \n\t"
                "punpcklbw %%mm2, %%mm4 \n\t"
                "punpckhbw %%mm2, %%mm6 \n\t"
                MOVNTQ" %%mm5, (%2, %%"FF_REG_a", 2) \n\t"
                MOVNTQ" %%mm7, 8(%2, %%"FF_REG_a", 2) \n\t"
                MOVNTQ" %%mm4, (%3, %%"FF_REG_a", 2) \n\t"
                MOVNTQ" %%mm6, 8(%3, %%"FF_REG_a", 2) \n\t"
                "add $8, %%"FF_REG_a" \n\t"
                "movq -1(%0, %%"FF_REG_a"), %%mm4 \n\t"
                "movq -1(%1, %%"FF_REG_a"), %%mm5 \n\t"
                " js 1b \n\t"
                :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
                   "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
                   "g" (-mmxSize)
                   NAMED_CONSTRAINTS_ADD(mmx_ff)
                : "%"FF_REG_a
            );
        } else {
            // plane too narrow for MMX: emit the left edge and let the
            // scalar loop below (starting at x = 0) do the rest
            mmxSize = 1;
            dst[0]        = (src[0] * 3 + src[srcStride]) >> 2;
            dst[dstStride]= (src[0] +  3 * src[srcStride]) >> 2;
        }

        for (x=mmxSize-1; x<srcWidth-1; x++) {
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line
    dst[0]= src[0];

    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
1434 | |||
1435 | /** | ||
1436 | * Height should be a multiple of 2 and width should be a multiple of 16. | ||
1437 | * (If this is a problem for anyone then tell me, and I will fix it.) | ||
1438 | * Chrominance data is only taken from every second line, others are ignored. | ||
1439 | * FIXME: Write HQ version. | ||
1440 | */ | ||
1441 | static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | ||
1442 | int width, int height, | ||
1443 | int lumStride, int chromStride, int srcStride) | ||
1444 | { | ||
1445 | int y; | ||
1446 | const x86_reg chromWidth= width>>1; | ||
1447 | for (y=0; y<height; y+=2) { | ||
1448 | __asm__ volatile( | ||
1449 | "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" | ||
1450 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
1451 | "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... | ||
1452 | ".p2align 4 \n\t" | ||
1453 | "1: \n\t" | ||
1454 | PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t" | ||
1455 | "movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0) | ||
1456 | "movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4) | ||
1457 | "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0) | ||
1458 | "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4) | ||
1459 | "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0) | ||
1460 | "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4) | ||
1461 | "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0) | ||
1462 | "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4) | ||
1463 | "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | ||
1464 | "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) | ||
1465 | |||
1466 | MOVNTQ" %%mm2, (%1, %%"FF_REG_a", 2) \n\t" | ||
1467 | |||
1468 | "movq 16(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8) | ||
1469 | "movq 24(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12) | ||
1470 | "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8) | ||
1471 | "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12) | ||
1472 | "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8) | ||
1473 | "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12) | ||
1474 | "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8) | ||
1475 | "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12) | ||
1476 | "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) | ||
1477 | "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) | ||
1478 | |||
1479 | MOVNTQ" %%mm3, 8(%1, %%"FF_REG_a", 2) \n\t" | ||
1480 | |||
1481 | "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) | ||
1482 | "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) | ||
1483 | "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) | ||
1484 | "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) | ||
1485 | "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) | ||
1486 | "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) | ||
1487 | "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) | ||
1488 | "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) | ||
1489 | |||
1490 | MOVNTQ" %%mm0, (%3, %%"FF_REG_a") \n\t" | ||
1491 | MOVNTQ" %%mm2, (%2, %%"FF_REG_a") \n\t" | ||
1492 | |||
1493 | "add $8, %%"FF_REG_a" \n\t" | ||
1494 | "cmp %4, %%"FF_REG_a" \n\t" | ||
1495 | " jb 1b \n\t" | ||
1496 | ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | ||
1497 | : "memory", "%"FF_REG_a | ||
1498 | ); | ||
1499 | |||
1500 | ydst += lumStride; | ||
1501 | src += srcStride; | ||
1502 | |||
1503 | __asm__ volatile( | ||
1504 | "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" | ||
1505 | ".p2align 4 \n\t" | ||
1506 | "1: \n\t" | ||
1507 | PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t" | ||
1508 | "movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) | ||
1509 | "movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) | ||
1510 | "movq 16(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8) | ||
1511 | "movq 24(%0, %%"FF_REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12) | ||
1512 | "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0) | ||
1513 | "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4) | ||
1514 | "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8) | ||
1515 | "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12) | ||
1516 | "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) | ||
1517 | "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) | ||
1518 | |||
1519 | MOVNTQ" %%mm0, (%1, %%"FF_REG_a", 2) \n\t" | ||
1520 | MOVNTQ" %%mm2, 8(%1, %%"FF_REG_a", 2) \n\t" | ||
1521 | |||
1522 | "add $8, %%"FF_REG_a" \n\t" | ||
1523 | "cmp %4, %%"FF_REG_a" \n\t" | ||
1524 | " jb 1b \n\t" | ||
1525 | |||
1526 | ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | ||
1527 | : "memory", "%"FF_REG_a | ||
1528 | ); | ||
1529 | udst += chromStride; | ||
1530 | vdst += chromStride; | ||
1531 | ydst += lumStride; | ||
1532 | src += srcStride; | ||
1533 | } | ||
1534 | __asm__ volatile(EMMS" \n\t" | ||
1535 | SFENCE" \n\t" | ||
1536 | :::"memory"); | ||
1537 | } | ||
1538 | |||
1539 | /** | ||
1540 | * Height should be a multiple of 2 and width should be a multiple of 2. | ||
1541 | * (If this is a problem for anyone then tell me, and I will fix it.) | ||
1542 | * Chrominance data is only taken from every second line, | ||
1543 | * others are ignored in the C version. | ||
1544 | * FIXME: Write HQ version. | ||
1545 | */ | ||
1546 | #if HAVE_7REGS | ||
1547 | ✗ | static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | |
1548 | int width, int height, | ||
1549 | int lumStride, int chromStride, int srcStride, | ||
1550 | int32_t *rgb2yuv) | ||
1551 | { | ||
1552 | #define BGR2Y_IDX "16*4+16*32" | ||
1553 | #define BGR2U_IDX "16*4+16*33" | ||
1554 | #define BGR2V_IDX "16*4+16*34" | ||
1555 | int y; | ||
1556 | ✗ | const x86_reg chromWidth= width>>1; | |
1557 | |||
1558 | ✗ | if (height > 2) { | |
1559 | ✗ | ff_rgb24toyv12_c(src, ydst, udst, vdst, width, 2, lumStride, chromStride, srcStride, rgb2yuv); | |
1560 | ✗ | src += 2*srcStride; | |
1561 | ✗ | ydst += 2*lumStride; | |
1562 | ✗ | udst += chromStride; | |
1563 | ✗ | vdst += chromStride; | |
1564 | ✗ | height -= 2; | |
1565 | } | ||
1566 | |||
1567 | ✗ | for (y=0; y<height-2; y+=2) { | |
1568 | int i; | ||
1569 | ✗ | for (i=0; i<2; i++) { | |
1570 | ✗ | __asm__ volatile( | |
1571 | "mov %2, %%"FF_REG_a"\n\t" | ||
1572 | "movq "BGR2Y_IDX"(%3), %%mm6 \n\t" | ||
1573 | "movq "MANGLE(ff_w1111)", %%mm5 \n\t" | ||
1574 | "pxor %%mm7, %%mm7 \n\t" | ||
1575 | "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t" | ||
1576 | ".p2align 4 \n\t" | ||
1577 | "1: \n\t" | ||
1578 | PREFETCH" 64(%0, %%"FF_REG_d") \n\t" | ||
1579 | "movd (%0, %%"FF_REG_d"), %%mm0 \n\t" | ||
1580 | "movd 3(%0, %%"FF_REG_d"), %%mm1 \n\t" | ||
1581 | "punpcklbw %%mm7, %%mm0 \n\t" | ||
1582 | "punpcklbw %%mm7, %%mm1 \n\t" | ||
1583 | "movd 6(%0, %%"FF_REG_d"), %%mm2 \n\t" | ||
1584 | "movd 9(%0, %%"FF_REG_d"), %%mm3 \n\t" | ||
1585 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
1586 | "punpcklbw %%mm7, %%mm3 \n\t" | ||
1587 | "pmaddwd %%mm6, %%mm0 \n\t" | ||
1588 | "pmaddwd %%mm6, %%mm1 \n\t" | ||
1589 | "pmaddwd %%mm6, %%mm2 \n\t" | ||
1590 | "pmaddwd %%mm6, %%mm3 \n\t" | ||
1591 | "psrad $8, %%mm0 \n\t" | ||
1592 | "psrad $8, %%mm1 \n\t" | ||
1593 | "psrad $8, %%mm2 \n\t" | ||
1594 | "psrad $8, %%mm3 \n\t" | ||
1595 | "packssdw %%mm1, %%mm0 \n\t" | ||
1596 | "packssdw %%mm3, %%mm2 \n\t" | ||
1597 | "pmaddwd %%mm5, %%mm0 \n\t" | ||
1598 | "pmaddwd %%mm5, %%mm2 \n\t" | ||
1599 | "packssdw %%mm2, %%mm0 \n\t" | ||
1600 | "psraw $7, %%mm0 \n\t" | ||
1601 | |||
1602 | "movd 12(%0, %%"FF_REG_d"), %%mm4 \n\t" | ||
1603 | "movd 15(%0, %%"FF_REG_d"), %%mm1 \n\t" | ||
1604 | "punpcklbw %%mm7, %%mm4 \n\t" | ||
1605 | "punpcklbw %%mm7, %%mm1 \n\t" | ||
1606 | "movd 18(%0, %%"FF_REG_d"), %%mm2 \n\t" | ||
1607 | "movd 21(%0, %%"FF_REG_d"), %%mm3 \n\t" | ||
1608 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
1609 | "punpcklbw %%mm7, %%mm3 \n\t" | ||
1610 | "pmaddwd %%mm6, %%mm4 \n\t" | ||
1611 | "pmaddwd %%mm6, %%mm1 \n\t" | ||
1612 | "pmaddwd %%mm6, %%mm2 \n\t" | ||
1613 | "pmaddwd %%mm6, %%mm3 \n\t" | ||
1614 | "psrad $8, %%mm4 \n\t" | ||
1615 | "psrad $8, %%mm1 \n\t" | ||
1616 | "psrad $8, %%mm2 \n\t" | ||
1617 | "psrad $8, %%mm3 \n\t" | ||
1618 | "packssdw %%mm1, %%mm4 \n\t" | ||
1619 | "packssdw %%mm3, %%mm2 \n\t" | ||
1620 | "pmaddwd %%mm5, %%mm4 \n\t" | ||
1621 | "pmaddwd %%mm5, %%mm2 \n\t" | ||
1622 | "add $24, %%"FF_REG_d"\n\t" | ||
1623 | "packssdw %%mm2, %%mm4 \n\t" | ||
1624 | "psraw $7, %%mm4 \n\t" | ||
1625 | |||
1626 | "packuswb %%mm4, %%mm0 \n\t" | ||
1627 | "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t" | ||
1628 | |||
1629 | MOVNTQ" %%mm0, (%1, %%"FF_REG_a") \n\t" | ||
1630 | "add $8, %%"FF_REG_a" \n\t" | ||
1631 | " js 1b \n\t" | ||
1632 | ✗ | : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv) | |
1633 | NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2YOffset) | ||
1634 | : "%"FF_REG_a, "%"FF_REG_d | ||
1635 | ); | ||
1636 | ✗ | ydst += lumStride; | |
1637 | ✗ | src += srcStride; | |
1638 | } | ||
1639 | ✗ | src -= srcStride*2; | |
1640 | ✗ | __asm__ volatile( | |
1641 | "mov %4, %%"FF_REG_a"\n\t" | ||
1642 | "movq "MANGLE(ff_w1111)", %%mm5 \n\t" | ||
1643 | "movq "BGR2U_IDX"(%5), %%mm6 \n\t" | ||
1644 | "pxor %%mm7, %%mm7 \n\t" | ||
1645 | "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t" | ||
1646 | "add %%"FF_REG_d", %%"FF_REG_d"\n\t" | ||
1647 | ".p2align 4 \n\t" | ||
1648 | "1: \n\t" | ||
1649 | PREFETCH" 64(%0, %%"FF_REG_d") \n\t" | ||
1650 | PREFETCH" 64(%1, %%"FF_REG_d") \n\t" | ||
1651 | "movq (%0, %%"FF_REG_d"), %%mm0 \n\t" | ||
1652 | "movq (%1, %%"FF_REG_d"), %%mm1 \n\t" | ||
1653 | "movq 6(%0, %%"FF_REG_d"), %%mm2 \n\t" | ||
1654 | "movq 6(%1, %%"FF_REG_d"), %%mm3 \n\t" | ||
1655 | PAVGB" %%mm1, %%mm0 \n\t" | ||
1656 | PAVGB" %%mm3, %%mm2 \n\t" | ||
1657 | "movq %%mm0, %%mm1 \n\t" | ||
1658 | "movq %%mm2, %%mm3 \n\t" | ||
1659 | "psrlq $24, %%mm0 \n\t" | ||
1660 | "psrlq $24, %%mm2 \n\t" | ||
1661 | PAVGB" %%mm1, %%mm0 \n\t" | ||
1662 | PAVGB" %%mm3, %%mm2 \n\t" | ||
1663 | "punpcklbw %%mm7, %%mm0 \n\t" | ||
1664 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
1665 | "movq "BGR2V_IDX"(%5), %%mm1 \n\t" | ||
1666 | "movq "BGR2V_IDX"(%5), %%mm3 \n\t" | ||
1667 | |||
1668 | "pmaddwd %%mm0, %%mm1 \n\t" | ||
1669 | "pmaddwd %%mm2, %%mm3 \n\t" | ||
1670 | "pmaddwd %%mm6, %%mm0 \n\t" | ||
1671 | "pmaddwd %%mm6, %%mm2 \n\t" | ||
1672 | "psrad $8, %%mm0 \n\t" | ||
1673 | "psrad $8, %%mm1 \n\t" | ||
1674 | "psrad $8, %%mm2 \n\t" | ||
1675 | "psrad $8, %%mm3 \n\t" | ||
1676 | "packssdw %%mm2, %%mm0 \n\t" | ||
1677 | "packssdw %%mm3, %%mm1 \n\t" | ||
1678 | "pmaddwd %%mm5, %%mm0 \n\t" | ||
1679 | "pmaddwd %%mm5, %%mm1 \n\t" | ||
1680 | "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 | ||
1681 | "psraw $7, %%mm0 \n\t" | ||
1682 | |||
1683 | "movq 12(%0, %%"FF_REG_d"), %%mm4 \n\t" | ||
1684 | "movq 12(%1, %%"FF_REG_d"), %%mm1 \n\t" | ||
1685 | "movq 18(%0, %%"FF_REG_d"), %%mm2 \n\t" | ||
1686 | "movq 18(%1, %%"FF_REG_d"), %%mm3 \n\t" | ||
1687 | PAVGB" %%mm1, %%mm4 \n\t" | ||
1688 | PAVGB" %%mm3, %%mm2 \n\t" | ||
1689 | "movq %%mm4, %%mm1 \n\t" | ||
1690 | "movq %%mm2, %%mm3 \n\t" | ||
1691 | "psrlq $24, %%mm4 \n\t" | ||
1692 | "psrlq $24, %%mm2 \n\t" | ||
1693 | PAVGB" %%mm1, %%mm4 \n\t" | ||
1694 | PAVGB" %%mm3, %%mm2 \n\t" | ||
1695 | "punpcklbw %%mm7, %%mm4 \n\t" | ||
1696 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
1697 | "movq "BGR2V_IDX"(%5), %%mm1 \n\t" | ||
1698 | "movq "BGR2V_IDX"(%5), %%mm3 \n\t" | ||
1699 | |||
1700 | "pmaddwd %%mm4, %%mm1 \n\t" | ||
1701 | "pmaddwd %%mm2, %%mm3 \n\t" | ||
1702 | "pmaddwd %%mm6, %%mm4 \n\t" | ||
1703 | "pmaddwd %%mm6, %%mm2 \n\t" | ||
1704 | "psrad $8, %%mm4 \n\t" | ||
1705 | "psrad $8, %%mm1 \n\t" | ||
1706 | "psrad $8, %%mm2 \n\t" | ||
1707 | "psrad $8, %%mm3 \n\t" | ||
1708 | "packssdw %%mm2, %%mm4 \n\t" | ||
1709 | "packssdw %%mm3, %%mm1 \n\t" | ||
1710 | "pmaddwd %%mm5, %%mm4 \n\t" | ||
1711 | "pmaddwd %%mm5, %%mm1 \n\t" | ||
1712 | "add $24, %%"FF_REG_d"\n\t" | ||
1713 | "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 | ||
1714 | "psraw $7, %%mm4 \n\t" | ||
1715 | |||
1716 | "movq %%mm0, %%mm1 \n\t" | ||
1717 | "punpckldq %%mm4, %%mm0 \n\t" | ||
1718 | "punpckhdq %%mm4, %%mm1 \n\t" | ||
1719 | "packsswb %%mm1, %%mm0 \n\t" | ||
1720 | "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t" | ||
1721 | "movd %%mm0, (%2, %%"FF_REG_a") \n\t" | ||
1722 | "punpckhdq %%mm0, %%mm0 \n\t" | ||
1723 | "movd %%mm0, (%3, %%"FF_REG_a") \n\t" | ||
1724 | "add $4, %%"FF_REG_a" \n\t" | ||
1725 | " js 1b \n\t" | ||
1726 | ✗ | : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv) | |
1727 | NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2UVOffset) | ||
1728 | : "%"FF_REG_a, "%"FF_REG_d | ||
1729 | ); | ||
1730 | |||
1731 | ✗ | udst += chromStride; | |
1732 | ✗ | vdst += chromStride; | |
1733 | ✗ | src += srcStride*2; | |
1734 | } | ||
1735 | |||
1736 | ✗ | __asm__ volatile(EMMS" \n\t" | |
1737 | SFENCE" \n\t" | ||
1738 | :::"memory"); | ||
1739 | |||
1740 | ✗ | ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv); | |
1741 | ✗ | } | |
1742 | #endif /* HAVE_7REGS */ | ||
1743 | #endif /* !COMPILE_TEMPLATE_SSE2 */ | ||
1744 | |||
1745 | #if !COMPILE_TEMPLATE_AVX && COMPILE_TEMPLATE_SSE2 | ||
/**
 * Interleave two byte planes into one: for each row,
 * dest[2*i] = src1[i] and dest[2*i+1] = src2[i].
 * Typical use: merging separate U and V planes into an interleaved UV plane.
 *
 * @param src1       first input plane (goes to even output bytes)
 * @param src2       second input plane (goes to odd output bytes)
 * @param dest       output buffer, 2*width bytes per row
 * @param width      number of bytes read per input row
 * @param height     number of rows
 * @param src1Stride byte stride of src1
 * @param src2Stride byte stride of src2
 * @param dstStride  byte stride of dest
 */
static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
                                    int width, int height, int src1Stride,
                                    int src2Stride, int dstStride)
{
    int h;

    for (h=0; h < height; h++) {
        int w;

        if (width >= 16) {
            /* SSE2 path only when src1, src2 and dest are all 16-byte
             * aligned (required by movdqa/movntdq). */
            if (!((((intptr_t)src1) | ((intptr_t)src2) | ((intptr_t)dest))&15)) {
                __asm__(
                    "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
                    "1: \n\t"
                    PREFETCH" 64(%1, %%"FF_REG_a") \n\t"
                    PREFETCH" 64(%2, %%"FF_REG_a") \n\t"
                    /* load the same 16 src1 bytes twice, then unpack
                     * low/high halves against 16 src2 bytes */
                    "movdqa (%1, %%"FF_REG_a"), %%xmm0 \n\t"
                    "movdqa (%1, %%"FF_REG_a"), %%xmm1 \n\t"
                    "movdqa (%2, %%"FF_REG_a"), %%xmm2 \n\t"
                    "punpcklbw %%xmm2, %%xmm0 \n\t"
                    "punpckhbw %%xmm2, %%xmm1 \n\t"
                    /* non-temporal stores; SFENCE at function end orders them */
                    "movntdq %%xmm0, (%0, %%"FF_REG_a", 2) \n\t"
                    "movntdq %%xmm1, 16(%0, %%"FF_REG_a", 2) \n\t"
                    "add $16, %%"FF_REG_a" \n\t"
                    "cmp %3, %%"FF_REG_a" \n\t"
                    " jb 1b \n\t"
                    ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
                    : "memory", XMM_CLOBBERS("xmm0", "xmm1", "xmm2",) "%"FF_REG_a
                );
            } else
                /* unaligned fallback: MMX, 16 input bytes per iteration */
                __asm__(
                    "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
                    "1: \n\t"
                    PREFETCH" 64(%1, %%"FF_REG_a") \n\t"
                    PREFETCH" 64(%2, %%"FF_REG_a") \n\t"
                    "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
                    "movq 8(%1, %%"FF_REG_a"), %%mm2 \n\t"
                    "movq %%mm0, %%mm1 \n\t"
                    "movq %%mm2, %%mm3 \n\t"
                    "movq (%2, %%"FF_REG_a"), %%mm4 \n\t"
                    "movq 8(%2, %%"FF_REG_a"), %%mm5 \n\t"
                    "punpcklbw %%mm4, %%mm0 \n\t"
                    "punpckhbw %%mm4, %%mm1 \n\t"
                    "punpcklbw %%mm5, %%mm2 \n\t"
                    "punpckhbw %%mm5, %%mm3 \n\t"
                    MOVNTQ" %%mm0, (%0, %%"FF_REG_a", 2) \n\t"
                    MOVNTQ" %%mm1, 8(%0, %%"FF_REG_a", 2) \n\t"
                    MOVNTQ" %%mm2, 16(%0, %%"FF_REG_a", 2) \n\t"
                    MOVNTQ" %%mm3, 24(%0, %%"FF_REG_a", 2) \n\t"
                    "add $16, %%"FF_REG_a" \n\t"
                    "cmp %3, %%"FF_REG_a" \n\t"
                    " jb 1b \n\t"
                    ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
                    : "memory", "%"FF_REG_a
                );

        }
        /* scalar tail: remaining width&15 bytes (or the whole row when
         * width < 16, since the SIMD paths were skipped entirely) */
        for (w= (width&(~15)); w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
    /* leave MMX state clean and order the non-temporal stores */
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
1817 | #endif /* !COMPILE_TEMPLATE_AVX && COMPILE_TEMPLATE_SSE2 */ | ||
1818 | |||
1819 | #if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL | ||
1820 | #if COMPILE_TEMPLATE_SSE2 && HAVE_X86ASM | ||
1821 | void RENAME(ff_nv12ToUV)(uint8_t *dstU, uint8_t *dstV, | ||
1822 | const uint8_t *unused, | ||
1823 | const uint8_t *src1, | ||
1824 | const uint8_t *src2, | ||
1825 | int w, | ||
1826 | uint32_t *unused2, | ||
1827 | void *opq); | ||
1828 | ✗ | static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2, | |
1829 | int width, int height, int srcStride, | ||
1830 | int dst1Stride, int dst2Stride) | ||
1831 | { | ||
1832 | int h; | ||
1833 | |||
1834 | ✗ | for (h = 0; h < height; h++) { | |
1835 | ✗ | RENAME(ff_nv12ToUV)(dst1, dst2, NULL, src, NULL, width, NULL, NULL); | |
1836 | ✗ | src += srcStride; | |
1837 | ✗ | dst1 += dst1Stride; | |
1838 | ✗ | dst2 += dst2Stride; | |
1839 | } | ||
1840 | ✗ | __asm__( | |
1841 | SFENCE" \n\t" | ||
1842 | ::: "memory" | ||
1843 | ); | ||
1844 | ✗ | } | |
1845 | #endif /* COMPILE_TEMPLATE_SSE2 && HAVE_X86ASM */ | ||
1846 | #endif /* !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL */ | ||
1847 | |||
1848 | #if !COMPILE_TEMPLATE_SSE2 | ||
/**
 * Upscale two chroma planes by 2x in both directions:
 * each source byte is duplicated horizontally, and each source row is
 * used for two consecutive destination rows (dst row y reads src row y>>1).
 * Width/height given are for the *output* at full resolution; only
 * width/2 x height/2 destination pixels per plane are produced.
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       int width, int height,
                                       int srcStride1, int srcStride2,
                                       int dstStride1, int dstStride2)
{
    x86_reg x, y;
    int w,h;
    w=width/2; h=height/2;
    /* warm the cache with the second row of both sources */
    __asm__ volatile(
        PREFETCH" %0 \n\t"
        PREFETCH" %1 \n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
    for (y=0;y<h;y++) {
        const uint8_t* s1=src1+srcStride1*(y>>1);
        uint8_t* d=dst1+dstStride1*y;
        x=0;
        /* MMX: 32 input bytes -> 64 output bytes per iteration;
         * punpcklbw/punpckhbw with a register against itself doubles
         * each byte */
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH" 32(%1,%2) \n\t"
                "movq (%1,%2), %%mm0 \n\t"
                "movq 8(%1,%2), %%mm2 \n\t"
                "movq 16(%1,%2), %%mm4 \n\t"
                "movq 24(%1,%2), %%mm6 \n\t"
                "movq %%mm0, %%mm1 \n\t"
                "movq %%mm2, %%mm3 \n\t"
                "movq %%mm4, %%mm5 \n\t"
                "movq %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ" %%mm0, (%0,%2,2) \n\t"
                MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
                MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
                MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
                MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
                MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
                MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
                MOVNTQ" %%mm7, 56(%0,%2,2)"
                :: "r"(d), "r"(s1), "r"(x)
                :"memory");
        }
        /* scalar tail: duplicate each remaining byte */
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    /* same processing for the second plane */
    for (y=0;y<h;y++) {
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH" 32(%1,%2) \n\t"
                "movq (%1,%2), %%mm0 \n\t"
                "movq 8(%1,%2), %%mm2 \n\t"
                "movq 16(%1,%2), %%mm4 \n\t"
                "movq 24(%1,%2), %%mm6 \n\t"
                "movq %%mm0, %%mm1 \n\t"
                "movq %%mm2, %%mm3 \n\t"
                "movq %%mm4, %%mm5 \n\t"
                "movq %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ" %%mm0, (%0,%2,2) \n\t"
                MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
                MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
                MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
                MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
                MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
                MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
                MOVNTQ" %%mm7, 56(%0,%2,2)"
                :: "r"(d), "r"(s2), "r"(x)
                :"memory");
        }
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
1940 | |||
/**
 * Convert planar YVU9-style input (chroma subsampled 4x vertically via
 * the y>>2 row selection; each chroma byte reused for 4 horizontal luma
 * pixels) to packed YUY2 (Y0 U0 Y1 V0 ...).
 * src1 = luma plane, src2 = U plane, src3 = V plane.
 */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        int width, int height,
                                        int srcStride1, int srcStride2,
                                        int srcStride3, int dstStride)
{
    x86_reg x;
    int y,w,h;
    w=width/2; h=height;
    for (y=0;y<h;y++) {
        const uint8_t* yp=src1+srcStride1*y;
        /* chroma rows advance once every 4 luma rows */
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
        /* MMX: 32 luma + 8 U + 8 V bytes -> 64 output bytes per iteration */
        for (;x<w-7;x+=8) {
            __asm__ volatile(
                PREFETCH" 32(%1, %0) \n\t"
                PREFETCH" 32(%2, %0) \n\t"
                PREFETCH" 32(%3, %0) \n\t"
                "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
                "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
                "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */

                "movq %%mm1, %%mm6 \n\t"
                "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"

                "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq 8(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"

                "movq %%mm4, %%mm6 \n\t"
                "movq 16(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm5, %%mm4 \n\t"
                "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"

                "punpckhbw %%mm5, %%mm6 \n\t"
                "movq 24(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"

                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
        /* scalar tail: emit 4 luma pixels sharing one U and one V byte */
        for (; x<w; x++) {
            const int x2 = x<<2;
            d[8*x+0] = yp[x2];
            d[8*x+1] = up[x];
            d[8*x+2] = yp[x2+1];
            d[8*x+3] = vp[x];
            d[8*x+4] = yp[x2+2];
            d[8*x+5] = up[x];
            d[8*x+6] = yp[x2+3];
            d[8*x+7] = vp[x];
        }
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
2026 | |||
/**
 * Extract every even-indexed byte: dst[i] = src[2*i] for i in [0, count).
 * Pointers are biased to the end of the data and a negative index counts
 * up to zero, which lets the asm loop test with a single "js".
 */
static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    dst += count;
    src += 2*count;
    count= - count;

    if(count <= -16) {
        /* +15 bias: the loop handles 16 outputs per iteration with
         * stores at offsets -15/-7; the bias is undone afterwards */
        count += 15;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" /* mm7 = 0x00FF00FF... low-byte mask */
            "1: \n\t"
            "movq -30(%1, %0, 2), %%mm0 \n\t"
            "movq -22(%1, %0, 2), %%mm1 \n\t"
            "movq -14(%1, %0, 2), %%mm2 \n\t"
            "movq -6(%1, %0, 2), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0,-15(%2, %0) \n\t"
            MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
            "add $16, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 15;
    }
    /* scalar tail for the remaining (count & 15) bytes */
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}
2063 | |||
/**
 * Extract every odd-indexed byte: dst[i] = src[2*i+1] for i in [0, count).
 * Implemented as extract_even on src+1; same negative-index loop scheme.
 */
static void RENAME(extract_odd)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    /* shift to the odd bytes before biasing the pointers */
    src ++;
    dst += count;
    src += 2*count;
    count= - count;

    /* NOTE(review): strict '<' here (extract_even uses '<='), so exactly
     * 16 elements fall through to the scalar loop — intentional per the
     * -32..-8 load offsets below */
    if(count < -16) {
        count += 16;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" /* mm7 = 0x00FF00FF... low-byte mask */
            "1: \n\t"
            "movq -32(%1, %0, 2), %%mm0 \n\t"
            "movq -24(%1, %0, 2), %%mm1 \n\t"
            "movq -16(%1, %0, 2), %%mm2 \n\t"
            "movq -8(%1, %0, 2), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0,-16(%2, %0) \n\t"
            MOVNTQ" %%mm2,- 8(%2, %0) \n\t"
            "add $16, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 16;
    }
    /* scalar tail */
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}
2101 | |||
2102 | #if ARCH_X86_32 | ||
/**
 * From 4-byte groups, split the two even bytes into separate planes:
 * dst0[i] = src[4*i+0], dst1[i] = src[4*i+2] (e.g. UYVY -> U and V).
 * x86_32 only. Negative-index loop scheme as in extract_even.
 */
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
    if(count <= -8) {
        /* +7 bias matches the -7 store offsets; undone below */
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" /* mm7 = 0x00FF00FF... low-byte mask */
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            /* deinterleave the packed even bytes into the two planes */
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
    /* scalar tail */
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
2148 | #endif /* ARCH_X86_32 */ | ||
2149 | |||
/**
 * Like extract_even2, but averages two source rows first:
 * dst0[i] = avg(src0,src1)[4*i+0], dst1[i] = avg(src0,src1)[4*i+2].
 * SIMD path uses PAVGB (rounds up); the scalar tail uses (a+b)>>1
 * (rounds down), so the last up-to-7 elements may differ by 1 — this
 * matches the established behavior of this template.
 */
static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" /* mm7 = 0x00FF00FF... low-byte mask */
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            /* average with the second source row */
            PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    /* scalar tail */
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
2202 | |||
/**
 * From 4-byte groups, split the two odd bytes into separate planes:
 * dst0[i] = src[4*i+1], dst1[i] = src[4*i+3] (e.g. YUYV -> U and V).
 * The asm selects odd bytes via psrlw $8 instead of offsetting src;
 * src is only incremented before the scalar tail.
 */
static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" /* mm7 = 0x00FF00FF... low-byte mask */
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            /* keep only the odd bytes of each word */
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm1 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "psrlw $8, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
    /* scalar tail reads the odd bytes via the +1 offset */
    src++;
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
2249 | |||
/**
 * Like extract_odd2, but averages two source rows first:
 * dst0[i] = avg(src0,src1)[4*i+1], dst1[i] = avg(src0,src1)[4*i+3].
 * SIMD path uses PAVGB (rounds up); scalar tail uses (a+b)>>1 (rounds
 * down) — established behavior of this template.
 */
static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" /* mm7 = 0x00FF00FF... low-byte mask */
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            /* average with the second source row */
            PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
            /* keep only the odd bytes of each word */
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm1 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "psrlw $8, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    /* scalar tail reads the odd bytes via the +1 offsets */
    src0++;
    src1++;
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
2304 | |||
2305 | ✗ | static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, | |
2306 | int width, int height, | ||
2307 | int lumStride, int chromStride, int srcStride) | ||
2308 | { | ||
2309 | int y; | ||
2310 | ✗ | const int chromWidth = AV_CEIL_RSHIFT(width, 1); | |
2311 | |||
2312 | ✗ | for (y=0; y<height; y++) { | |
2313 | ✗ | RENAME(extract_even)(src, ydst, width); | |
2314 | ✗ | if(y&1) { | |
2315 | ✗ | RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth); | |
2316 | ✗ | udst+= chromStride; | |
2317 | ✗ | vdst+= chromStride; | |
2318 | } | ||
2319 | |||
2320 | ✗ | src += srcStride; | |
2321 | ✗ | ydst+= lumStride; | |
2322 | } | ||
2323 | ✗ | __asm__( | |
2324 | EMMS" \n\t" | ||
2325 | SFENCE" \n\t" | ||
2326 | ::: "memory" | ||
2327 | ); | ||
2328 | ✗ | } | |
2329 | |||
2330 | ✗ | static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, | |
2331 | int width, int height, | ||
2332 | int lumStride, int chromStride, int srcStride) | ||
2333 | { | ||
2334 | int y; | ||
2335 | ✗ | const int chromWidth = AV_CEIL_RSHIFT(width, 1); | |
2336 | |||
2337 | ✗ | for (y=0; y<height; y++) { | |
2338 | ✗ | RENAME(extract_even)(src, ydst, width); | |
2339 | ✗ | RENAME(extract_odd2)(src, udst, vdst, chromWidth); | |
2340 | |||
2341 | ✗ | src += srcStride; | |
2342 | ✗ | ydst+= lumStride; | |
2343 | ✗ | udst+= chromStride; | |
2344 | ✗ | vdst+= chromStride; | |
2345 | } | ||
2346 | ✗ | __asm__( | |
2347 | EMMS" \n\t" | ||
2348 | SFENCE" \n\t" | ||
2349 | ::: "memory" | ||
2350 | ); | ||
2351 | ✗ | } | |
2352 | |||
2353 | ✗ | static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, | |
2354 | int width, int height, | ||
2355 | int lumStride, int chromStride, int srcStride) | ||
2356 | { | ||
2357 | int y; | ||
2358 | ✗ | const int chromWidth = AV_CEIL_RSHIFT(width, 1); | |
2359 | |||
2360 | ✗ | for (y=0; y<height; y++) { | |
2361 | ✗ | RENAME(extract_odd)(src, ydst, width); | |
2362 | ✗ | if(y&1) { | |
2363 | ✗ | RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth); | |
2364 | ✗ | udst+= chromStride; | |
2365 | ✗ | vdst+= chromStride; | |
2366 | } | ||
2367 | |||
2368 | ✗ | src += srcStride; | |
2369 | ✗ | ydst+= lumStride; | |
2370 | } | ||
2371 | ✗ | __asm__( | |
2372 | EMMS" \n\t" | ||
2373 | SFENCE" \n\t" | ||
2374 | ::: "memory" | ||
2375 | ); | ||
2376 | ✗ | } | |
2377 | |||
2378 | #if ARCH_X86_32 | ||
2379 | static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, | ||
2380 | int width, int height, | ||
2381 | int lumStride, int chromStride, int srcStride) | ||
2382 | { | ||
2383 | int y; | ||
2384 | const int chromWidth = AV_CEIL_RSHIFT(width, 1); | ||
2385 | |||
2386 | for (y=0; y<height; y++) { | ||
2387 | RENAME(extract_odd)(src, ydst, width); | ||
2388 | RENAME(extract_even2)(src, udst, vdst, chromWidth); | ||
2389 | |||
2390 | src += srcStride; | ||
2391 | ydst+= lumStride; | ||
2392 | udst+= chromStride; | ||
2393 | vdst+= chromStride; | ||
2394 | } | ||
2395 | __asm__( | ||
2396 | EMMS" \n\t" | ||
2397 | SFENCE" \n\t" | ||
2398 | ::: "memory" | ||
2399 | ); | ||
2400 | } | ||
2401 | #endif /* ARCH_X86_32 */ | ||
2402 | #endif /* !COMPILE_TEMPLATE_SSE2 */ | ||
2403 | |||
/**
 * Install the function pointers for this template instantiation
 * (MMX/MMXEXT when !COMPILE_TEMPLATE_SSE2, SSE2/AVX otherwise).
 * Called once at init; each pointer overrides the C fallback.
 */
static av_cold void RENAME(rgb2rgb_init)(void)
{
#if !COMPILE_TEMPLATE_SSE2
    /* packed RGB <-> RGB format converters */
    rgb15to16          = RENAME(rgb15to16);
    rgb15tobgr24       = RENAME(rgb15tobgr24);
    rgb15to32          = RENAME(rgb15to32);
    rgb16tobgr24       = RENAME(rgb16tobgr24);
    rgb16to32          = RENAME(rgb16to32);
    rgb16to15          = RENAME(rgb16to15);
    rgb24tobgr16       = RENAME(rgb24tobgr16);
    rgb24tobgr15       = RENAME(rgb24tobgr15);
    rgb24tobgr32       = RENAME(rgb24tobgr32);
    rgb32to16          = RENAME(rgb32to16);
    rgb32to15          = RENAME(rgb32to15);
    rgb32tobgr24       = RENAME(rgb32tobgr24);
    rgb24to15          = RENAME(rgb24to15);
    rgb24to16          = RENAME(rgb24to16);
    rgb24tobgr24       = RENAME(rgb24tobgr24);
    rgb32tobgr16       = RENAME(rgb32tobgr16);
    rgb32tobgr15       = RENAME(rgb32tobgr15);
    /* planar YUV <-> packed YUV converters */
    yv12toyuy2         = RENAME(yv12toyuy2);
    yv12touyvy         = RENAME(yv12touyvy);
    yuv422ptoyuy2      = RENAME(yuv422ptoyuy2);
    yuv422ptouyvy      = RENAME(yuv422ptouyvy);
    yuy2toyv12         = RENAME(yuy2toyv12);
    vu9_to_vu12        = RENAME(vu9_to_vu12);
    yvu9_to_yuy2       = RENAME(yvu9_to_yuy2);
#if ARCH_X86_32
    /* uses extract_even2, which is only compiled on 32-bit x86 */
    uyvytoyuv422       = RENAME(uyvytoyuv422);
#endif
    yuyvtoyuv422       = RENAME(yuyvtoyuv422);

    planar2x           = RENAME(planar2x);
#if HAVE_7REGS
    /* the rgb24toyv12 asm needs 7 general-purpose registers */
    ff_rgb24toyv12     = RENAME(rgb24toyv12);
#endif /* HAVE_7REGS */

    yuyvtoyuv420       = RENAME(yuyvtoyuv420);
    uyvytoyuv420       = RENAME(uyvytoyuv420);
#endif /* !COMPILE_TEMPLATE_SSE2 */

#if !COMPILE_TEMPLATE_AVX && COMPILE_TEMPLATE_SSE2
    interleaveBytes    = RENAME(interleaveBytes);
#endif /* !COMPILE_TEMPLATE_AVX && COMPILE_TEMPLATE_SSE2 */
#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL
#if COMPILE_TEMPLATE_SSE2 && HAVE_X86ASM
    deinterleaveBytes  = RENAME(deinterleaveBytes);
#endif
#endif
}
2454 |