FFmpeg coverage


Directory: ../../../ffmpeg/
File: src/libswscale/x86/swscale_template.c
Date: 2025-01-20 09:27:23
Exec Total Coverage
Lines: 23 237 9.7%
Functions: 1 22 4.5%
Branches: 25 69 36.2%

Line Branch Exec Source
1 /*
2 * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include <stdint.h>
22
23 #include "libavutil/x86/asm.h"
24 #include "libswscale/swscale_internal.h"
25
/* Drop any earlier definitions of the store/prefetch helper macros, then
 * define MOVNTQ as an asm string fragment that emits a "movntq" instruction.
 * MOVNTQ forwards to REAL_MOVNTQ so that its arguments are macro-expanded
 * before REAL_MOVNTQ stringifies them with '#'.
 * PREFETCH is only undefined here, not redefined. */
26 #undef REAL_MOVNTQ
27 #undef MOVNTQ
28 #undef MOVNTQ2
29 #undef PREFETCH
30
31
32 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
33 #define MOVNTQ2 "movntq "
34 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
35
/* Opens an __asm__ volatile statement (it is left unterminated; the operand
 * list, e.g. YSCALEYUV2PACKEDX_END, closes it) and emits the chroma part of
 * the vertical filter.
 * FF_REG_a is zeroed and used as the index; label 1 is the outer loop head
 * that the WRITE* macros branch back to with "jb 1b".
 * mm3 (U) and mm4 (V) are seeded from VROUNDER_OFFSET(%0). Loop 2 walks
 * 16-byte entries starting at CHR_MMX_FILTER_OFFSET(%0) (source pointer at
 * +0, coefficient at +8) and accumulates pmulhw(coeff, src) into mm3/mm4
 * until a zero source pointer is read. V data is read from the U pointer
 * advanced by operand %6. */
36 #define YSCALEYUV2PACKEDX_UV \
37 __asm__ volatile(\
38 "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
39 ".p2align 4 \n\t"\
40 "nop \n\t"\
41 "1: \n\t"\
42 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\
43 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
44 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
45 "movq %%mm3, %%mm4 \n\t"\
46 ".p2align 4 \n\t"\
47 "2: \n\t"\
48 "movq 8(%%"FF_REG_d"), %%mm0 \n\t" /* filterCoeff */\
49 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" /* UsrcData */\
50 "add %6, %%"FF_REG_S" \n\t" \
51 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm5 \n\t" /* VsrcData */\
52 "add $16, %%"FF_REG_d" \n\t"\
53 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
54 "pmulhw %%mm0, %%mm2 \n\t"\
55 "pmulhw %%mm0, %%mm5 \n\t"\
56 "paddw %%mm2, %%mm3 \n\t"\
57 "paddw %%mm5, %%mm4 \n\t"\
58 "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
59 " jnz 2b \n\t"\
60
/* Asm fragment: vertical filter for one plane (luma or alpha), producing two
 * registers of results per call.
 *   offset     - string constant; offset from %0 of the filter entry list
 *   coeff      - MMX register that receives each filter coefficient
 *   src1, src2 - MMX scratch registers for the two source loads
 *   dst1, dst2 - MMX accumulators; both start from VROUNDER_OFFSET(%0)
 * Loop 2 walks 16-byte entries (source pointer at +0, coefficient at +8),
 * reads 8 bytes at (src, FF_REG_a, 2) and at 8(src, FF_REG_a, 2), and adds
 * pmulhw(coeff, data) to dst1/dst2 until a zero source pointer is read.
 * Clobbers FF_REG_d and FF_REG_S. */
61 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
62 "lea "offset"(%0), %%"FF_REG_d" \n\t"\
63 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
64 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
65 "movq "#dst1", "#dst2" \n\t"\
66 ".p2align 4 \n\t"\
67 "2: \n\t"\
68 "movq 8(%%"FF_REG_d"), "#coeff" \n\t" /* filterCoeff */\
69 "movq (%%"FF_REG_S", %%"FF_REG_a", 2), "#src1" \n\t" /* Y1srcData */\
70 "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), "#src2" \n\t" /* Y2srcData */\
71 "add $16, %%"FF_REG_d" \n\t"\
72 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
73 "pmulhw "#coeff", "#src1" \n\t"\
74 "pmulhw "#coeff", "#src2" \n\t"\
75 "paddw "#src1", "#dst1" \n\t"\
76 "paddw "#src2", "#dst2" \n\t"\
77 "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
78 " jnz 2b \n\t"\
79
/* Chroma pass followed by the luma pass (LUM_MMX_FILTER_OFFSET).
 * On exit: mm3 = U, mm4 = V, mm1 = first luma group, mm7 = second luma group
 * (mm0, mm2 and mm5 are used as scratch). */
80 #define YSCALEYUV2PACKEDX \
81 YSCALEYUV2PACKEDX_UV \
82 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
83
/* Operand list and terminator for the asm statement opened by
 * YSCALEYUV2PACKEDX_UV / YSCALEYUV2PACKEDX_ACCURATE_UV.
 * Input operands: %0 = &c->redDither (base for all the *_OFFSET accesses),
 * %1..%3 = dummy, %4 = dest, %5 = dstW_reg, %6 = uv_off.
 * Declares FF_REG_a, FF_REG_d and FF_REG_S as clobbered.
 * Requires c, dummy, dest, dstW_reg and uv_off to be in scope at the point of
 * use. NAMED_CONSTRAINTS_ADD is defined elsewhere; presumably it makes the
 * bF8/bFC constants referenced through MANGLE() available to the asm. */
84 #define YSCALEYUV2PACKEDX_END \
85 :: "r" (&c->redDither), \
86 "m" (dummy), "m" (dummy), "m" (dummy),\
87 "r" (dest), "m" (dstW_reg), "m"(uv_off) \
88 NAMED_CONSTRAINTS_ADD(bF8,bFC) \
89 : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_S \
90 );
91
/* Opens an __asm__ volatile statement (closed later by the operand list) and
 * emits the chroma vertical filter with 32-bit accumulation.
 * FF_REG_a is zeroed and used as the index; label 1 is the outer loop head.
 * Each iteration of loop 2 consumes two source rows: the row whose pointer is
 * at (FF_REG_d) and the row whose pointer is at APCK_PTR2(FF_REG_d). The two
 * rows are interleaved with punpck{l,h}wd and multiplied by the coefficient
 * quadword at APCK_COEF(FF_REG_d) using pmaddwd. Sums are kept as dwords in
 * mm4/mm5 (U) and mm6/mm7 (V). The next entry is APCK_SIZE bytes further on;
 * the loop ends when its first pointer is zero.
 * After the loop the sums are shifted right by 16, packed to signed words,
 * offset by VROUNDER_OFFSET(%0) and stored to U_TEMP(%0) and V_TEMP(%0). */
92 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
93 __asm__ volatile(\
94 "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
95 ".p2align 4 \n\t"\
96 "nop \n\t"\
97 "1: \n\t"\
98 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\
99 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
100 "pxor %%mm4, %%mm4 \n\t"\
101 "pxor %%mm5, %%mm5 \n\t"\
102 "pxor %%mm6, %%mm6 \n\t"\
103 "pxor %%mm7, %%mm7 \n\t"\
104 ".p2align 4 \n\t"\
105 "2: \n\t"\
106 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm0 \n\t" /* UsrcData */\
107 "add %6, %%"FF_REG_S" \n\t" \
108 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" /* VsrcData */\
109 "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
110 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm1 \n\t" /* UsrcData */\
111 "movq %%mm0, %%mm3 \n\t"\
112 "punpcklwd %%mm1, %%mm0 \n\t"\
113 "punpckhwd %%mm1, %%mm3 \n\t"\
114 "movq "STR(APCK_COEF)"(%%"FF_REG_d"),%%mm1 \n\t" /* filterCoeff */\
115 "pmaddwd %%mm1, %%mm0 \n\t"\
116 "pmaddwd %%mm1, %%mm3 \n\t"\
117 "paddd %%mm0, %%mm4 \n\t"\
118 "paddd %%mm3, %%mm5 \n\t"\
119 "add %6, %%"FF_REG_S" \n\t" \
120 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm3 \n\t" /* VsrcData */\
121 "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
122 "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\
123 "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
124 "movq %%mm2, %%mm0 \n\t"\
125 "punpcklwd %%mm3, %%mm2 \n\t"\
126 "punpckhwd %%mm3, %%mm0 \n\t"\
127 "pmaddwd %%mm1, %%mm2 \n\t"\
128 "pmaddwd %%mm1, %%mm0 \n\t"\
129 "paddd %%mm2, %%mm6 \n\t"\
130 "paddd %%mm0, %%mm7 \n\t"\
131 " jnz 2b \n\t"\
132 "psrad $16, %%mm4 \n\t"\
133 "psrad $16, %%mm5 \n\t"\
134 "psrad $16, %%mm6 \n\t"\
135 "psrad $16, %%mm7 \n\t"\
136 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
137 "packssdw %%mm5, %%mm4 \n\t"\
138 "packssdw %%mm7, %%mm6 \n\t"\
139 "paddw %%mm0, %%mm4 \n\t"\
140 "paddw %%mm0, %%mm6 \n\t"\
141 "movq %%mm4, "U_TEMP"(%0) \n\t"\
142 "movq %%mm6, "V_TEMP"(%0) \n\t"\
143
/* Asm fragment: 32-bit-accumulating vertical filter for one plane (luma or
 * alpha). "offset" is a string constant giving the offset from %0 of the
 * filter entry list.
 * Each iteration of loop 2 consumes two source rows (pointers at
 * (FF_REG_d) and APCK_PTR2(FF_REG_d)), interleaves them and applies the
 * coefficient quadword at APCK_COEF(FF_REG_d) with pmaddwd. Dword sums are
 * kept in mm1/mm5 (first 8 bytes of source) and mm7/mm6 (next 8 bytes).
 * After the loop the sums are shifted right by 16, packed to signed words and
 * offset by VROUNDER_OFFSET(%0), leaving the results in mm1 and mm7.
 * Finally mm3 and mm4 are reloaded from U_TEMP(%0) and V_TEMP(%0).
 * Clobbers mm0-mm7, FF_REG_d and FF_REG_S. */
144 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
145 "lea "offset"(%0), %%"FF_REG_d" \n\t"\
146 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
147 "pxor %%mm1, %%mm1 \n\t"\
148 "pxor %%mm5, %%mm5 \n\t"\
149 "pxor %%mm7, %%mm7 \n\t"\
150 "pxor %%mm6, %%mm6 \n\t"\
151 ".p2align 4 \n\t"\
152 "2: \n\t"\
153 "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
154 "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
155 "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
156 "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
157 "movq %%mm0, %%mm3 \n\t"\
158 "punpcklwd %%mm4, %%mm0 \n\t"\
159 "punpckhwd %%mm4, %%mm3 \n\t"\
160 "movq "STR(APCK_COEF)"(%%"FF_REG_d"), %%mm4 \n\t" /* filterCoeff */\
161 "pmaddwd %%mm4, %%mm0 \n\t"\
162 "pmaddwd %%mm4, %%mm3 \n\t"\
163 "paddd %%mm0, %%mm1 \n\t"\
164 "paddd %%mm3, %%mm5 \n\t"\
165 "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
166 "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
167 "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\
168 "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
169 "movq %%mm2, %%mm0 \n\t"\
170 "punpcklwd %%mm3, %%mm2 \n\t"\
171 "punpckhwd %%mm3, %%mm0 \n\t"\
172 "pmaddwd %%mm4, %%mm2 \n\t"\
173 "pmaddwd %%mm4, %%mm0 \n\t"\
174 "paddd %%mm2, %%mm7 \n\t"\
175 "paddd %%mm0, %%mm6 \n\t"\
176 " jnz 2b \n\t"\
177 "psrad $16, %%mm1 \n\t"\
178 "psrad $16, %%mm5 \n\t"\
179 "psrad $16, %%mm7 \n\t"\
180 "psrad $16, %%mm6 \n\t"\
181 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
182 "packssdw %%mm5, %%mm1 \n\t"\
183 "packssdw %%mm6, %%mm7 \n\t"\
184 "paddw %%mm0, %%mm1 \n\t"\
185 "paddw %%mm0, %%mm7 \n\t"\
186 "movq "U_TEMP"(%0), %%mm3 \n\t"\
187 "movq "V_TEMP"(%0), %%mm4 \n\t"\
188
/* 32-bit-accumulating chroma pass followed by the matching luma pass
 * (LUM_MMX_FILTER_OFFSET).
 * On exit: mm3 = U, mm4 = V (reloaded from U_TEMP/V_TEMP), mm1 and mm7 hold
 * the two luma groups. */
189 #define YSCALEYUV2PACKEDX_ACCURATE \
190 YSCALEYUV2PACKEDX_ACCURATE_UV \
191 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
192
/* Asm fragment: YUV -> RGB conversion on the filter output.
 * Input:  mm3 = U, mm4 = V, mm1 and mm7 = the two luma groups (16-bit words).
 * Offsets and coefficients are read relative to %0 (U_OFFSET, V_OFFSET,
 * Y_OFFSET, UG/VG/UB/VR_COEFF, Y_COEFF).
 * Each chroma term is duplicated with punpck{l,h}wd so that it is added to
 * two neighbouring luma words.
 * Output: mm2 = B, mm4 = G, mm5 = R, each packed to unsigned saturated bytes
 * by packuswb. Clobbers mm0-mm7. */
193 #define YSCALEYUV2RGBX \
194 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
195 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
196 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
197 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
198 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
199 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
200 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
201 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
202 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
203 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
204 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
205 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
206 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
207 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
208 "paddw %%mm3, %%mm4 \n\t"\
209 "movq %%mm2, %%mm0 \n\t"\
210 "movq %%mm5, %%mm6 \n\t"\
211 "movq %%mm4, %%mm3 \n\t"\
212 "punpcklwd %%mm2, %%mm2 \n\t"\
213 "punpcklwd %%mm5, %%mm5 \n\t"\
214 "punpcklwd %%mm4, %%mm4 \n\t"\
215 "paddw %%mm1, %%mm2 \n\t"\
216 "paddw %%mm1, %%mm5 \n\t"\
217 "paddw %%mm1, %%mm4 \n\t"\
218 "punpckhwd %%mm0, %%mm0 \n\t"\
219 "punpckhwd %%mm6, %%mm6 \n\t"\
220 "punpckhwd %%mm3, %%mm3 \n\t"\
221 "paddw %%mm7, %%mm0 \n\t"\
222 "paddw %%mm7, %%mm6 \n\t"\
223 "paddw %%mm7, %%mm3 \n\t"\
224 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
225 "packuswb %%mm0, %%mm2 \n\t"\
226 "packuswb %%mm6, %%mm5 \n\t"\
227 "packuswb %%mm3, %%mm4 \n\t"\
228
/* Asm fragment: interleave four byte-packed channel registers into 8 four-byte
 * pixels and store them, then close the outer loop.
 *   dst, index   - register names; stores go to dst + index*4 (+0/8/16/24)
 *   dstw         - string operand compared against index
 *   b, g, r, a   - MMX registers holding the four channels (all are modified
 *                  except g and a, which are only read)
 *   q0, q2, q3, t - MMX scratch registers
 * After the four MOVNTQ stores, index is advanced by 8 and control returns to
 * label 1 while index is (unsigned) below dstw.
 * WRITEBGR32 is a forwarding wrapper so that arguments are macro-expanded
 * before REAL_WRITEBGR32 stringifies them. */
229 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
230 "movq "#b", "#q2" \n\t" /* B */\
231 "movq "#r", "#t" \n\t" /* R */\
232 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
233 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
234 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
235 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
236 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
237 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
238 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
239 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
240 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
241 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
242 \
243 MOVNTQ( q0, (dst, index, 4))\
244 MOVNTQ( b, 8(dst, index, 4))\
245 MOVNTQ( q2, 16(dst, index, 4))\
246 MOVNTQ( q3, 24(dst, index, 4))\
247 \
248 "add $8, "#index" \n\t"\
249 "cmp "dstw", "#index" \n\t"\
250 " jb 1b \n\t"
251 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
252
/**
 * Vertically filter and convert one output line to 32-bit packed pixels,
 * using the ACCURATE (pmaddwd, 32-bit accumulation) filter macros.
 *
 * The whole body is a single inline-asm statement assembled from the macros
 * above. The asm reads the filter lists, constants and temporaries through
 * offsets from &c->redDither (operand %0); the filter/source pointer
 * arguments are not referenced directly in this function.
 *
 * When CONFIG_SWSCALE_ALPHA is set and c->needAlpha is non-zero, a second
 * filter pass over ALP_MMX_FILTER_OFFSET provides the alpha channel;
 * otherwise alpha is all-ones.
 *
 * @param c    scaler context
 * @param dest output line; written with MOVNTQ stores
 * @param dstW number of output pixels; the loop writes 8 pixels per iteration
 */
253 static void RENAME(yuv2rgb32_X_ar)(SwsInternal *c, const int16_t *lumFilter,
254 const int16_t **lumSrc, int lumFilterSize,
255 const int16_t *chrFilter, const int16_t **chrUSrc,
256 const int16_t **chrVSrc,
257 int chrFilterSize, const int16_t **alpSrc,
258 uint8_t *dest, int dstW, int dstY)
259 {
260 x86_reg dummy=0;
261 x86_reg dstW_reg = dstW;
262 x86_reg uv_off = c->uv_offx2;
263
264 if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
265 YSCALEYUV2PACKEDX_ACCURATE
266 YSCALEYUV2RGBX
/* Spill B, G, R: the alpha filter pass below clobbers every MMX register. */
267 "movq %%mm2, "U_TEMP"(%0) \n\t"
268 "movq %%mm4, "V_TEMP"(%0) \n\t"
269 "movq %%mm5, "Y_TEMP"(%0) \n\t"
/* The pass ends by reloading U_TEMP into mm3 and V_TEMP into mm4, so B comes
 * back in mm3 and G in mm4; R is restored explicitly below. */
270 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
271 "movq "Y_TEMP"(%0), %%mm5 \n\t"
272 "psraw $3, %%mm1 \n\t"
273 "psraw $3, %%mm7 \n\t"
274 "packuswb %%mm7, %%mm1 \n\t"
275 WRITEBGR32(%4, "%5", %%FF_REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
276 YSCALEYUV2PACKEDX_END
277 } else {
278 YSCALEYUV2PACKEDX_ACCURATE
279 YSCALEYUV2RGBX
/* No alpha plane: set every alpha byte to 0xFF. */
280 "pcmpeqd %%mm7, %%mm7 \n\t"
281 WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
282 YSCALEYUV2PACKEDX_END
283 }
284 }
285
/**
 * Vertically filter and convert one output line to 32-bit packed pixels,
 * using the pmulhw-based (16-bit accumulation) filter macros.
 *
 * The whole body is a single inline-asm statement assembled from the macros
 * above; all filter data is reached through offsets from &c->redDither
 * (operand %0), so the filter/source pointer arguments are not referenced
 * directly here.
 *
 * When CONFIG_SWSCALE_ALPHA is set and c->needAlpha is non-zero, an extra
 * filter pass over ALP_MMX_FILTER_OFFSET provides the alpha channel;
 * otherwise alpha is all-ones.
 *
 * @param c    scaler context
 * @param dest output line; written with MOVNTQ stores
 * @param dstW number of output pixels; the loop writes 8 pixels per iteration
 */
286 static void RENAME(yuv2rgb32_X)(SwsInternal *c, const int16_t *lumFilter,
287 const int16_t **lumSrc, int lumFilterSize,
288 const int16_t *chrFilter, const int16_t **chrUSrc,
289 const int16_t **chrVSrc,
290 int chrFilterSize, const int16_t **alpSrc,
291 uint8_t *dest, int dstW, int dstY)
292 {
293 x86_reg dummy=0;
294 x86_reg dstW_reg = dstW;
295 x86_reg uv_off = c->uv_offx2;
296
297 if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
298 YSCALEYUV2PACKEDX
299 YSCALEYUV2RGBX
/* Alpha pass uses only mm0, mm3, mm6 as scratch and mm1/mm7 as results, so
 * B (mm2), G (mm4) and R (mm5) survive without spilling. */
300 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
301 "psraw $3, %%mm1 \n\t"
302 "psraw $3, %%mm7 \n\t"
303 "packuswb %%mm7, %%mm1 \n\t"
304 WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
305 YSCALEYUV2PACKEDX_END
306 } else {
307 YSCALEYUV2PACKEDX
308 YSCALEYUV2RGBX
/* No alpha plane: set every alpha byte to 0xFF. */
309 "pcmpeqd %%mm7, %%mm7 \n\t"
310 WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
311 YSCALEYUV2PACKEDX_END
312 }
313 }
314
/**
 * Same as yuv2rgb32_X, except that the R and B registers are swapped in the
 * WRITEBGR32 invocation (mm5 is passed as "b", mm2 as "r"), which swaps the
 * first and third byte of every stored pixel.
 *
 * The whole body is a single inline-asm statement; all filter data is reached
 * through offsets from &c->redDither (operand %0).
 *
 * @param c    scaler context
 * @param dest output line; written with MOVNTQ stores
 * @param dstW number of output pixels; the loop writes 8 pixels per iteration
 */
315 static void RENAME(yuv2bgr32_X)(SwsInternal *c, const int16_t *lumFilter,
316 const int16_t **lumSrc, int lumFilterSize,
317 const int16_t *chrFilter, const int16_t **chrUSrc,
318 const int16_t **chrVSrc,
319 int chrFilterSize, const int16_t **alpSrc,
320 uint8_t *dest, int dstW, int dstY)
321 {
322 x86_reg dummy=0;
323 x86_reg dstW_reg = dstW;
324 x86_reg uv_off = c->uv_offx2;
325
326 if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
327 YSCALEYUV2PACKEDX
328 YSCALEYUV2RGBX
/* Alpha pass: results in mm1/mm7; mm2, mm4, mm5 (B, G, R) are left intact. */
329 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
330 "psraw $3, %%mm1 \n\t"
331 "psraw $3, %%mm7 \n\t"
332 "packuswb %%mm7, %%mm1 \n\t"
333 WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
334 YSCALEYUV2PACKEDX_END
335 } else {
336 YSCALEYUV2PACKEDX
337 YSCALEYUV2RGBX
/* No alpha plane: set every alpha byte to 0xFF. */
338 "pcmpeqd %%mm7, %%mm7 \n\t"
339 WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
340 YSCALEYUV2PACKEDX_END
341 }
342 }
343
/* Asm fragment used by the rgb565 writers: pack byte-wide B (mm2), G (mm4)
 * and R (mm5) into 8 16-bit pixels, store them, then close the outer loop.
 * Expects mm7 == 0 (it is used to zero-extend the G bytes).
 * B and R are masked with bF8 and G with bFC; B is shifted right by 3, the
 * widened G is shifted left by 3, and B is byte-interleaved with R before the
 * two halves are OR'ed together.
 * Stores 16 bytes at dst + index*2, advances index by 8 and returns to label
 * 1 while index is (unsigned) below dstw. Clobbers mm1-mm5.
 * WRITERGB16 is a forwarding wrapper so that arguments are macro-expanded
 * before stringification. */
344 #define REAL_WRITERGB16(dst, dstw, index) \
345 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
346 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
347 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
348 "psrlq $3, %%mm2 \n\t"\
349 \
350 "movq %%mm2, %%mm1 \n\t"\
351 "movq %%mm4, %%mm3 \n\t"\
352 \
353 "punpcklbw %%mm7, %%mm3 \n\t"\
354 "punpcklbw %%mm5, %%mm2 \n\t"\
355 "punpckhbw %%mm7, %%mm4 \n\t"\
356 "punpckhbw %%mm5, %%mm1 \n\t"\
357 \
358 "psllq $3, %%mm3 \n\t"\
359 "psllq $3, %%mm4 \n\t"\
360 \
361 "por %%mm3, %%mm2 \n\t"\
362 "por %%mm4, %%mm1 \n\t"\
363 \
364 MOVNTQ(%%mm2, (dst, index, 2))\
365 MOVNTQ(%%mm1, 8(dst, index, 2))\
366 \
367 "add $8, "#index" \n\t"\
368 "cmp "dstw", "#index" \n\t"\
369 " jb 1b \n\t"
370 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
371
/**
 * Vertically filter and convert one output line to 16-bit pixels via
 * WRITERGB16, using the ACCURATE (pmaddwd, 32-bit accumulation) filter macros.
 *
 * The whole body is a single inline-asm statement; all filter data and the
 * dither values are reached through offsets from &c->redDither (operand %0),
 * so the filter/source pointer arguments are not referenced directly here.
 *
 * @param c    scaler context
 * @param dest output line; written with MOVNTQ stores
 * @param dstW number of output pixels; the loop writes 8 pixels per iteration
 */
372 static void RENAME(yuv2rgb565_X_ar)(SwsInternal *c, const int16_t *lumFilter,
373 const int16_t **lumSrc, int lumFilterSize,
374 const int16_t *chrFilter, const int16_t **chrUSrc,
375 const int16_t **chrVSrc,
376 int chrFilterSize, const int16_t **alpSrc,
377 uint8_t *dest, int dstW, int dstY)
378 {
379 x86_reg dummy=0;
380 x86_reg dstW_reg = dstW;
381 x86_reg uv_off = c->uv_offx2;
382
383 YSCALEYUV2PACKEDX_ACCURATE
384 YSCALEYUV2RGBX
385 "pxor %%mm7, %%mm7 \n\t"
386 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
/* Add the per-channel dither bytes with unsigned saturation before the
 * low bits are masked off in WRITERGB16. */
387 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
388 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
389 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
390 WRITERGB16(%4, "%5", %%FF_REGa)
391 YSCALEYUV2PACKEDX_END
392 }
393
/**
 * Vertically filter and convert one output line to 16-bit pixels via
 * WRITERGB16, using the pmulhw-based (16-bit accumulation) filter macros.
 *
 * The whole body is a single inline-asm statement; all filter data and the
 * dither values are reached through offsets from &c->redDither (operand %0),
 * so the filter/source pointer arguments are not referenced directly here.
 *
 * @param c    scaler context
 * @param dest output line; written with MOVNTQ stores
 * @param dstW number of output pixels; the loop writes 8 pixels per iteration
 */
394 static void RENAME(yuv2rgb565_X)(SwsInternal *c, const int16_t *lumFilter,
395 const int16_t **lumSrc, int lumFilterSize,
396 const int16_t *chrFilter, const int16_t **chrUSrc,
397 const int16_t **chrVSrc,
398 int chrFilterSize, const int16_t **alpSrc,
399 uint8_t *dest, int dstW, int dstY)
400 {
401 x86_reg dummy=0;
402 x86_reg dstW_reg = dstW;
403 x86_reg uv_off = c->uv_offx2;
404
405 YSCALEYUV2PACKEDX
406 YSCALEYUV2RGBX
407 "pxor %%mm7, %%mm7 \n\t"
408 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
/* Add the per-channel dither bytes with unsigned saturation before the
 * low bits are masked off in WRITERGB16. */
409 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
410 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
411 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
412 WRITERGB16(%4, "%5", %%FF_REGa)
413 YSCALEYUV2PACKEDX_END
414 }
415
/* Asm fragment used by the rgb555 writers: pack byte-wide B (mm2), G (mm4)
 * and R (mm5) into 8 16-bit pixels, store them, then close the outer loop.
 * Expects mm7 == 0 (it is used to zero-extend the G bytes).
 * Differs from REAL_WRITERGB16 in that all three channels are masked with
 * bF8, R is additionally shifted right by 1, and the widened G is shifted
 * left by 2 instead of 3.
 * Stores 16 bytes at dst + index*2, advances index by 8 and returns to label
 * 1 while index is (unsigned) below dstw. Clobbers mm1-mm5.
 * WRITERGB15 is a forwarding wrapper so that arguments are macro-expanded
 * before stringification. */
416 #define REAL_WRITERGB15(dst, dstw, index) \
417 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
418 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
419 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
420 "psrlq $3, %%mm2 \n\t"\
421 "psrlq $1, %%mm5 \n\t"\
422 \
423 "movq %%mm2, %%mm1 \n\t"\
424 "movq %%mm4, %%mm3 \n\t"\
425 \
426 "punpcklbw %%mm7, %%mm3 \n\t"\
427 "punpcklbw %%mm5, %%mm2 \n\t"\
428 "punpckhbw %%mm7, %%mm4 \n\t"\
429 "punpckhbw %%mm5, %%mm1 \n\t"\
430 \
431 "psllq $2, %%mm3 \n\t"\
432 "psllq $2, %%mm4 \n\t"\
433 \
434 "por %%mm3, %%mm2 \n\t"\
435 "por %%mm4, %%mm1 \n\t"\
436 \
437 MOVNTQ(%%mm2, (dst, index, 2))\
438 MOVNTQ(%%mm1, 8(dst, index, 2))\
439 \
440 "add $8, "#index" \n\t"\
441 "cmp "dstw", "#index" \n\t"\
442 " jb 1b \n\t"
443 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
444
/**
 * Vertically filter and convert one output line to 16-bit pixels via
 * WRITERGB15, using the ACCURATE (pmaddwd, 32-bit accumulation) filter macros.
 *
 * The whole body is a single inline-asm statement; all filter data and the
 * dither values are reached through offsets from &c->redDither (operand %0),
 * so the filter/source pointer arguments are not referenced directly here.
 *
 * @param c    scaler context
 * @param dest output line; written with MOVNTQ stores
 * @param dstW number of output pixels; the loop writes 8 pixels per iteration
 */
445 static void RENAME(yuv2rgb555_X_ar)(SwsInternal *c, const int16_t *lumFilter,
446 const int16_t **lumSrc, int lumFilterSize,
447 const int16_t *chrFilter, const int16_t **chrUSrc,
448 const int16_t **chrVSrc,
449 int chrFilterSize, const int16_t **alpSrc,
450 uint8_t *dest, int dstW, int dstY)
451 {
452 x86_reg dummy=0;
453 x86_reg dstW_reg = dstW;
454 x86_reg uv_off = c->uv_offx2;
455
456 YSCALEYUV2PACKEDX_ACCURATE
457 YSCALEYUV2RGBX
458 "pxor %%mm7, %%mm7 \n\t"
459 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
/* Add the per-channel dither bytes with unsigned saturation before the
 * low bits are masked off in WRITERGB15. */
460 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
461 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
462 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
463 WRITERGB15(%4, "%5", %%FF_REGa)
464 YSCALEYUV2PACKEDX_END
465 }
466
/**
 * Vertically filter and convert one output line to 16-bit pixels via
 * WRITERGB15, using the pmulhw-based (16-bit accumulation) filter macros.
 *
 * The whole body is a single inline-asm statement; all filter data and the
 * dither values are reached through offsets from &c->redDither (operand %0),
 * so the filter/source pointer arguments are not referenced directly here.
 *
 * @param c    scaler context
 * @param dest output line; written with MOVNTQ stores
 * @param dstW number of output pixels; the loop writes 8 pixels per iteration
 */
467 static void RENAME(yuv2rgb555_X)(SwsInternal *c, const int16_t *lumFilter,
468 const int16_t **lumSrc, int lumFilterSize,
469 const int16_t *chrFilter, const int16_t **chrUSrc,
470 const int16_t **chrVSrc,
471 int chrFilterSize, const int16_t **alpSrc,
472 uint8_t *dest, int dstW, int dstY)
473 {
474 x86_reg dummy=0;
475 x86_reg dstW_reg = dstW;
476 x86_reg uv_off = c->uv_offx2;
477
478 YSCALEYUV2PACKEDX
479 YSCALEYUV2RGBX
480 "pxor %%mm7, %%mm7 \n\t"
481 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
/* Add the per-channel dither bytes with unsigned saturation before the
 * low bits are masked off in WRITERGB15. */
482 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
483 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
484 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
485 WRITERGB15(%4, "%5", %%FF_REGa)
486 YSCALEYUV2PACKEDX_END
487 }
488
/* Asm fragment (plain MMX variant): pack byte-wide B (mm2), G (mm4), R (mm5)
 * into 8 three-byte pixels using unpack/shift/OR only.
 * Expects mm7 == 0 on entry. The channels are first expanded to four
 * "0RGB0RGB" quadwords, which are then squeezed into three output quadwords.
 * Stores 24 bytes at (dst), 8(dst), 16(dst) with MOVNTQ, advances dst by 24
 * and index by 8, and returns to label 1 while index is (unsigned) below
 * dstw. Clobbers mm0-mm7.
 * Note: in the part of the file visible here, WRITEBGR24 is mapped to the
 * MMXEXT variant below, not to this macro. */
489 #define WRITEBGR24MMX(dst, dstw, index) \
490 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
491 "movq %%mm2, %%mm1 \n\t" /* B */\
492 "movq %%mm5, %%mm6 \n\t" /* R */\
493 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
494 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
495 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
496 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
497 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
498 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
499 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
500 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
501 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
502 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
503 \
504 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
505 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
506 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
507 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
508 \
509 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
510 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
511 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
512 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
513 \
514 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
515 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
516 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
517 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
518 \
519 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
520 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
521 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
522 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
523 MOVNTQ(%%mm0, (dst))\
524 \
525 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
526 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
527 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
528 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
529 MOVNTQ(%%mm6, 8(dst))\
530 \
531 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
532 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
533 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
534 MOVNTQ(%%mm5, 16(dst))\
535 \
536 "add $24, "#dst" \n\t"\
537 \
538 "add $8, "#index" \n\t"\
539 "cmp "dstw", "#index" \n\t"\
540 " jb 1b \n\t"
541
/* Asm fragment (MMXEXT variant, uses pshufw): pack byte-wide B (mm2), G (mm4),
 * R (mm5) into 8 three-byte pixels.
 * Each of the three output quadwords is built by shuffling the needed words
 * of every channel into place with pshufw, masking them with the ff_M24A /
 * ff_M24B / ff_M24C constants and OR-ing the results. mm0 and mm7 are
 * overwritten with ff_M24A and ff_M24C, so mm7 need not be zero on entry.
 * mm4 (G) is shifted right by 8 after the first store.
 * Stores 24 bytes at (dst), 8(dst), 16(dst) with MOVNTQ, advances dst by 24
 * and index by 8, and returns to label 1 while index is (unsigned) below
 * dstw. Clobbers mm0, mm1, mm3, mm4, mm6, mm7. */
542 #define WRITEBGR24MMXEXT(dst, dstw, index) \
543 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
544 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
545 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
546 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
547 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
548 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
549 \
550 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
551 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
552 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
553 \
554 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
555 "por %%mm1, %%mm6 \n\t"\
556 "por %%mm3, %%mm6 \n\t"\
557 MOVNTQ(%%mm6, (dst))\
558 \
559 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
560 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
561 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
562 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
563 \
564 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
565 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
566 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
567 \
568 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
569 "por %%mm3, %%mm6 \n\t"\
570 MOVNTQ(%%mm6, 8(dst))\
571 \
572 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
573 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
574 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
575 \
576 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
577 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
578 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
579 \
580 "por %%mm1, %%mm3 \n\t"\
581 "por %%mm3, %%mm6 \n\t"\
582 MOVNTQ(%%mm6, 16(dst))\
583 \
584 "add $24, "#dst" \n\t"\
585 \
586 "add $8, "#index" \n\t"\
587 "cmp "dstw", "#index" \n\t"\
588 " jb 1b \n\t"
589
/* Select the 24-bit writer used by the functions below: the MMXEXT
 * (pshufw-based) variant. */
590 #undef WRITEBGR24
591 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index)
592
593 #if HAVE_6REGS
/**
 * Vertically filter and convert one output line to 24-bit packed pixels via
 * WRITEBGR24, using the ACCURATE (pmaddwd, 32-bit accumulation) filter macros.
 * Only compiled when HAVE_6REGS is set, since FF_REG_c is needed as an extra
 * running destination pointer.
 *
 * The whole body is a single inline-asm statement; all filter data is reached
 * through offsets from &c->redDither (operand %0), so the filter/source
 * pointer arguments are not referenced directly here. The operand list is
 * written out explicitly (instead of YSCALEYUV2PACKEDX_END) because it names
 * the ff_M24* constants and additionally clobbers FF_REG_c.
 *
 * @param c    scaler context
 * @param dest output line; written with MOVNTQ stores
 * @param dstW number of output pixels; the loop writes 8 pixels per iteration
 */
594 static void RENAME(yuv2bgr24_X_ar)(SwsInternal *c, const int16_t *lumFilter,
595 const int16_t **lumSrc, int lumFilterSize,
596 const int16_t *chrFilter, const int16_t **chrUSrc,
597 const int16_t **chrVSrc,
598 int chrFilterSize, const int16_t **alpSrc,
599 uint8_t *dest, int dstW, int dstY)
600 {
601 x86_reg dummy=0;
602 x86_reg dstW_reg = dstW;
603 x86_reg uv_off = c->uv_offx2;
604
605 YSCALEYUV2PACKEDX_ACCURATE
606 YSCALEYUV2RGBX
607 "pxor %%mm7, %%mm7 \n\t"
/* FF_REG_c = dest + 3 * index: byte address of the current output pixel. */
608 "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c"\n\t" //FIXME optimize
609 "add %4, %%"FF_REG_c" \n\t"
610 WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa)
611 :: "r" (&c->redDither),
612 "m" (dummy), "m" (dummy), "m" (dummy),
613 "r" (dest), "m" (dstW_reg), "m"(uv_off)
614 NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
615 : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
616 );
617 }
618
/**
 * Vertically filter and convert one output line to 24-bit packed pixels via
 * WRITEBGR24, using the pmulhw-based (16-bit accumulation) filter macros.
 * Only compiled when HAVE_6REGS is set, since FF_REG_c is needed as an extra
 * running destination pointer.
 *
 * The whole body is a single inline-asm statement; all filter data is reached
 * through offsets from &c->redDither (operand %0), so the filter/source
 * pointer arguments are not referenced directly here. The operand list is
 * written out explicitly (instead of YSCALEYUV2PACKEDX_END) because it names
 * the ff_M24* constants and additionally clobbers FF_REG_c.
 *
 * @param c    scaler context
 * @param dest output line; written with MOVNTQ stores
 * @param dstW number of output pixels; the loop writes 8 pixels per iteration
 */
619 static void RENAME(yuv2bgr24_X)(SwsInternal *c, const int16_t *lumFilter,
620 const int16_t **lumSrc, int lumFilterSize,
621 const int16_t *chrFilter, const int16_t **chrUSrc,
622 const int16_t **chrVSrc,
623 int chrFilterSize, const int16_t **alpSrc,
624 uint8_t *dest, int dstW, int dstY)
625 {
626 x86_reg dummy=0;
627 x86_reg dstW_reg = dstW;
628 x86_reg uv_off = c->uv_offx2;
629
630 YSCALEYUV2PACKEDX
631 YSCALEYUV2RGBX
632 "pxor %%mm7, %%mm7 \n\t"
/* FF_REG_c = dest + 3 * index: byte address of the current output pixel. */
633 "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c" \n\t" //FIXME optimize
634 "add %4, %%"FF_REG_c" \n\t"
635 WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa)
636 :: "r" (&c->redDither),
637 "m" (dummy), "m" (dummy), "m" (dummy),
638 "r" (dest), "m" (dstW_reg), "m"(uv_off)
639 NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
640 : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
641 );
642 }
643 #endif /* HAVE_6REGS */
644
/* Asm fragment used by the yuyv422 writers: interleave luma and chroma bytes
 * and store them, then close the outer loop.
 * Input (16-bit words): mm1 and mm7 = the two luma groups, mm3 = U, mm4 = V.
 * U and V are packed to bytes and byte-interleaved (V into the odd bytes of
 * mm3); the two luma groups are packed into mm1 and then byte-interleaved with
 * that chroma register.
 * Stores 16 bytes at dst + index*2, advances index by 8 and returns to label
 * 1 while index is (unsigned) below dstw. Clobbers mm1, mm3, mm4, mm7.
 * WRITEYUY2 is a forwarding wrapper so that arguments are macro-expanded
 * before stringification. */
645 #define REAL_WRITEYUY2(dst, dstw, index) \
646 "packuswb %%mm3, %%mm3 \n\t"\
647 "packuswb %%mm4, %%mm4 \n\t"\
648 "packuswb %%mm7, %%mm1 \n\t"\
649 "punpcklbw %%mm4, %%mm3 \n\t"\
650 "movq %%mm1, %%mm7 \n\t"\
651 "punpcklbw %%mm3, %%mm1 \n\t"\
652 "punpckhbw %%mm3, %%mm7 \n\t"\
653 \
654 MOVNTQ(%%mm1, (dst, index, 2))\
655 MOVNTQ(%%mm7, 8(dst, index, 2))\
656 \
657 "add $8, "#index" \n\t"\
658 "cmp "dstw", "#index" \n\t"\
659 " jb 1b \n\t"
660 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
661
/**
 * Vertically filter one output line and store it as interleaved luma/chroma
 * bytes via WRITEYUY2, using the ACCURATE (pmaddwd, 32-bit accumulation)
 * filter macros. No colour conversion is performed.
 *
 * The whole body is a single inline-asm statement; all filter data is reached
 * through offsets from &c->redDither (operand %0), so the filter/source
 * pointer arguments are not referenced directly here.
 *
 * @param c    scaler context
 * @param dest output line; written with MOVNTQ stores
 * @param dstW number of output pixels; the loop writes 8 pixels per iteration
 */
662 static void RENAME(yuv2yuyv422_X_ar)(SwsInternal *c, const int16_t *lumFilter,
663 const int16_t **lumSrc, int lumFilterSize,
664 const int16_t *chrFilter, const int16_t **chrUSrc,
665 const int16_t **chrVSrc,
666 int chrFilterSize, const int16_t **alpSrc,
667 uint8_t *dest, int dstW, int dstY)
668 {
669 x86_reg dummy=0;
670 x86_reg dstW_reg = dstW;
671 x86_reg uv_off = c->uv_offx2;
672
673 YSCALEYUV2PACKEDX_ACCURATE
674 /* mm1=Y1, mm7=Y2, mm3=U, mm4=V (16-bit words); scale down by 3 bits before packing to bytes */
675 "psraw $3, %%mm3 \n\t"
676 "psraw $3, %%mm4 \n\t"
677 "psraw $3, %%mm1 \n\t"
678 "psraw $3, %%mm7 \n\t"
679 WRITEYUY2(%4, "%5", %%FF_REGa)
680 YSCALEYUV2PACKEDX_END
681 }
682
/**
 * Vertically filter one output line and store it as interleaved luma/chroma
 * bytes via WRITEYUY2, using the pmulhw-based (16-bit accumulation) filter
 * macros. No colour conversion is performed.
 *
 * The whole body is a single inline-asm statement; all filter data is reached
 * through offsets from &c->redDither (operand %0), so the filter/source
 * pointer arguments are not referenced directly here.
 *
 * @param c    scaler context
 * @param dest output line; written with MOVNTQ stores
 * @param dstW number of output pixels; the loop writes 8 pixels per iteration
 */
683 static void RENAME(yuv2yuyv422_X)(SwsInternal *c, const int16_t *lumFilter,
684 const int16_t **lumSrc, int lumFilterSize,
685 const int16_t *chrFilter, const int16_t **chrUSrc,
686 const int16_t **chrVSrc,
687 int chrFilterSize, const int16_t **alpSrc,
688 uint8_t *dest, int dstW, int dstY)
689 {
690 x86_reg dummy=0;
691 x86_reg dstW_reg = dstW;
692 x86_reg uv_off = c->uv_offx2;
693
694 YSCALEYUV2PACKEDX
695 /* mm1=Y1, mm7=Y2, mm3=U, mm4=V (16-bit words); scale down by 3 bits before packing to bytes */
696 "psraw $3, %%mm3 \n\t"
697 "psraw $3, %%mm4 \n\t"
698 "psraw $3, %%mm1 \n\t"
699 "psraw $3, %%mm7 \n\t"
700 WRITEYUY2(%4, "%5", %%FF_REGa)
701 YSCALEYUV2PACKEDX_END
702 }
703
/* Asm fragment: chroma part of the two-line (bilinear) vertical blend,
 * followed by the first step of the RGB conversion.
 *   index - register name used as the loop index (zeroed here)
 *   c     - register name holding the base for the *_OFFSET accesses
 * Label 1 is the outer loop head that the WRITE* macros branch back to.
 * Operands %2 and %3 are the two chroma line buffers. U is read at index and
 * V at index + UV_OFF_BYTE(c); index is restored afterwards.
 * For each of U and V the blend is
 *     pmulhw(buf0 - buf1, coefficient at CHR_MMX_FILTER_OFFSET+8(c)) + (buf1 >> 4).
 * U_OFFSET / V_OFFSET are then subtracted, unscaled copies are kept in mm2
 * (U) and mm5 (V), and mm3 / mm4 are multiplied by UG_COEFF / VG_COEFF. */
704 #define REAL_YSCALEYUV2RGB_UV(index, c) \
705 "xor "#index", "#index" \n\t"\
706 ".p2align 4 \n\t"\
707 "1: \n\t"\
708 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
709 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
710 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
711 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
712 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
713 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
714 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
715 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
716 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
717 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
718 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
719 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
720 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
721 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
722 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
723 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
724 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
725 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
726 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
727 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
728 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
729 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
730
/* Asm fragment: two-line (bilinear) vertical blend of one plane (luma or
 * alpha) for 8 samples.
 *   index  - register name of the loop index (scaled by 2 in the addressing)
 *   c      - register name holding the base for the *_OFFSET accesses
 *   b1, b2 - register/operand names of the two line buffers
 * Result: mm1 (first 4 samples) and mm7 (next 4 samples), each computed as
 *     pmulhw(b1 - b2, coefficient at LUM_MMX_FILTER_OFFSET+8(c)) + (b2 >> 4).
 * Clobbers mm0 and mm6. */
731 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
732 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
733 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
734 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
735 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
736 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
737 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
738 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
739 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
740 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
741 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
742 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
743 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
744
/* Asm fragment: second half of the YUV -> RGB conversion, with the constant
 * base register passed as "c" instead of being fixed to %0.
 * Input:  mm2 = U - offset, mm5 = V - offset, mm3 = ug, mm4 = vg,
 *         mm1 and mm7 = the two luma groups.
 * Each chroma term is duplicated with punpck{l,h}wd so that it is added to
 * two neighbouring luma words.
 * Output: mm2 = B, mm4 = G, mm5 = R, each packed to unsigned saturated bytes
 * by packuswb. Clobbers mm0-mm7. */
745 #define REAL_YSCALEYUV2RGB_COEFF(c) \
746 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
747 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
748 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
749 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
750 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
751 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
752 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
753 "paddw %%mm3, %%mm4 \n\t"\
754 "movq %%mm2, %%mm0 \n\t"\
755 "movq %%mm5, %%mm6 \n\t"\
756 "movq %%mm4, %%mm3 \n\t"\
757 "punpcklwd %%mm2, %%mm2 \n\t"\
758 "punpcklwd %%mm5, %%mm5 \n\t"\
759 "punpcklwd %%mm4, %%mm4 \n\t"\
760 "paddw %%mm1, %%mm2 \n\t"\
761 "paddw %%mm1, %%mm5 \n\t"\
762 "paddw %%mm1, %%mm4 \n\t"\
763 "punpckhwd %%mm0, %%mm0 \n\t"\
764 "punpckhwd %%mm6, %%mm6 \n\t"\
765 "punpckhwd %%mm3, %%mm3 \n\t"\
766 "paddw %%mm7, %%mm0 \n\t"\
767 "paddw %%mm7, %%mm6 \n\t"\
768 "paddw %%mm7, %%mm3 \n\t"\
769 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
770 "packuswb %%mm0, %%mm2 \n\t"\
771 "packuswb %%mm6, %%mm5 \n\t"\
772 "packuswb %%mm3, %%mm4 \n\t"\
773
/* YSCALEYUV2RGB_YA forwards to REAL_YSCALEYUV2RGB_YA so that its arguments
 * are macro-expanded before being stringified.
 * YSCALEYUV2RGB chains the full two-line pipeline: chroma blend, luma blend
 * from operands %0 / %1, then RGB conversion. It leaves mm2 = B, mm4 = G,
 * mm5 = R as packed bytes and label 1 as the loop head. */
774 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
775
776 #define YSCALEYUV2RGB(index, c) \
777 REAL_YSCALEYUV2RGB_UV(index, c) \
778 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
779 REAL_YSCALEYUV2RGB_COEFF(c)
780
781 /**
782 * vertical bilinear scale YV12 to RGB
 *
 * Blends two input lines per plane and writes one line of 32-bit packed
 * pixels with WRITEBGR32.
 *
 * Only buf, ubuf, abuf (when alpha is needed), dest and c are referenced
 * directly. V is fetched from the ubuf pointers at an additional offset of
 * UV_OFF_BYTE, the output width is read through DSTW_OFFSET, and the blend
 * coefficients through the *_MMX_FILTER_OFFSET slots, all relative to
 * &c->redDither (operand %5). vbuf, dstW, yalpha, uvalpha and y are not
 * referenced in this function body.
 *
 * When CONFIG_SWSCALE_ALPHA is set and c->needAlpha is non-zero, the alpha
 * lines in abuf are blended as well; otherwise alpha is all-ones.
 *
 * @param c    scaler context
 * @param buf  the two luma input lines
 * @param ubuf the two chroma input lines
 * @param abuf the two alpha input lines (read only when alpha is needed)
 * @param dest output line; written with MOVNTQ stores
783 */
784 static void RENAME(yuv2rgb32_2)(SwsInternal *c, const int16_t *buf[2],
785 const int16_t *ubuf[2], const int16_t *vbuf[2],
786 const int16_t *abuf[2], uint8_t *dest,
787 int dstW, int yalpha, int uvalpha, int y)
788 {
789 const int16_t *buf0 = buf[0], *buf1 = buf[1],
790 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
791
792 if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
793 const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
794 #if ARCH_X86_64
/* x86-64: r8 is used as the loop index and declared as clobbered; the alpha
 * line pointers are passed as extra register operands %6 and %7. */
795 __asm__ volatile(
796 YSCALEYUV2RGB(%%r8, %5)
797 YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
798 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
799 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
800 "packuswb %%mm7, %%mm1 \n\t"
801 WRITEBGR32(%4, DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
802 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
803 "a" (&c->redDither),
804 "r" (abuf0), "r" (abuf1)
805 : "%r8"
806 );
807 #else
/* Other builds: the alpha line pointers are parked in the context's
 * u_temp / v_temp slots and reloaded inside the asm via U_TEMP / V_TEMP. */
808 c->u_temp=(intptr_t)abuf0;
809 c->v_temp=(intptr_t)abuf1;
810 __asm__ volatile(
/* Save FF_REG_b in the context and use it as the destination pointer;
 * save FF_REG_BP on the stack and use it as the loop index. */
811 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
812 "mov %4, %%"FF_REG_b" \n\t"
813 "push %%"FF_REG_BP" \n\t"
814 YSCALEYUV2RGB(%%FF_REGBP, %5)
/* Temporarily replace the luma line pointers in %0 / %1 with the alpha line
 * pointers for the alpha blend, then restore them. */
815 "push %0 \n\t"
816 "push %1 \n\t"
817 "mov "U_TEMP"(%5), %0 \n\t"
818 "mov "V_TEMP"(%5), %1 \n\t"
819 YSCALEYUV2RGB_YA(%%FF_REGBP, %5, %0, %1)
820 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
821 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
822 "packuswb %%mm7, %%mm1 \n\t"
823 "pop %1 \n\t"
824 "pop %0 \n\t"
825 WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
826 "pop %%"FF_REG_BP" \n\t"
827 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
828 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
829 "a" (&c->redDither)
830 );
831 #endif
832 } else {
833 __asm__ volatile(
/* Save FF_REG_b in the context and use it as the destination pointer;
 * save FF_REG_BP on the stack and use it as the loop index. */
834 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
835 "mov %4, %%"FF_REG_b" \n\t"
836 "push %%"FF_REG_BP" \n\t"
837 YSCALEYUV2RGB(%%FF_REGBP, %5)
/* No alpha plane: set every alpha byte to 0xFF. */
838 "pcmpeqd %%mm7, %%mm7 \n\t"
839 WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
840 "pop %%"FF_REG_BP" \n\t"
841 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
842 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
843 "a" (&c->redDither)
844 );
845 }
846 }
847
/**
 * Blend two input lines per plane, convert to RGB and write one line of
 * 24-bit packed pixels with WRITEBGR24.
 *
 * Only buf, ubuf, dest and c are referenced directly. V is fetched from the
 * ubuf pointers at an additional offset of UV_OFF_BYTE and the output width
 * is read through DSTW_OFFSET, both relative to &c->redDither (operand %5).
 * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced in this function
 * body.
 *
 * @param c    scaler context
 * @param buf  the two luma input lines
 * @param ubuf the two chroma input lines
 * @param dest output line; written with MOVNTQ stores
 */
848 static void RENAME(yuv2bgr24_2)(SwsInternal *c, const int16_t *buf[2],
849 const int16_t *ubuf[2], const int16_t *vbuf[2],
850 const int16_t *abuf[2], uint8_t *dest,
851 int dstW, int yalpha, int uvalpha, int y)
852 {
853 const int16_t *buf0 = buf[0], *buf1 = buf[1],
854 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
855
856 __asm__ volatile(
/* Save FF_REG_b in the context and use it as the running destination
 * pointer; save FF_REG_BP on the stack and use it as the loop index. */
857 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
858 "mov %4, %%"FF_REG_b" \n\t"
859 "push %%"FF_REG_BP" \n\t"
860 YSCALEYUV2RGB(%%FF_REGBP, %5)
861 "pxor %%mm7, %%mm7 \n\t"
862 WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
863 "pop %%"FF_REG_BP" \n\t"
864 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
865 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
866 "a" (&c->redDither)
867 NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
868 );
869 }
870
/**
 * Blend two input lines per plane, convert to RGB, add dither and write one
 * line of 16-bit pixels with WRITERGB15.
 *
 * Only buf, ubuf, dest and c are referenced directly. V is fetched from the
 * ubuf pointers at an additional offset of UV_OFF_BYTE, and the output width
 * and dither values are read relative to &c->redDither (operand %5).
 * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced in this function
 * body.
 *
 * @param c    scaler context
 * @param buf  the two luma input lines
 * @param ubuf the two chroma input lines
 * @param dest output line; written with MOVNTQ stores
 */
871 static void RENAME(yuv2rgb555_2)(SwsInternal *c, const int16_t *buf[2],
872 const int16_t *ubuf[2], const int16_t *vbuf[2],
873 const int16_t *abuf[2], uint8_t *dest,
874 int dstW, int yalpha, int uvalpha, int y)
875 {
876 const int16_t *buf0 = buf[0], *buf1 = buf[1],
877 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
878
879 __asm__ volatile(
/* Save FF_REG_b in the context and use it as the destination pointer;
 * save FF_REG_BP on the stack and use it as the loop index. */
880 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
881 "mov %4, %%"FF_REG_b" \n\t"
882 "push %%"FF_REG_BP" \n\t"
883 YSCALEYUV2RGB(%%FF_REGBP, %5)
884 "pxor %%mm7, %%mm7 \n\t"
885 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
886 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
887 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
888 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
889 WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
890 "pop %%"FF_REG_BP" \n\t"
891 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
892 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
893 "a" (&c->redDither)
894 NAMED_CONSTRAINTS_ADD(bF8)
895 );
896 }
897
/**
 * Two-input-line ("_2") packed writer: runs YSCALEYUV2RGB, adds the per-channel
 * dither values with unsigned saturation, and stores through WRITERGB16.
 *
 * Only buf[0], buf[1], ubuf[0], ubuf[1], dest and &c->redDither are bound to
 * asm operands. vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced in
 * this C body; the asm reads the dither values and the width relative to
 * &c->redDither (BLUE_DITHER, GREEN_DITHER, RED_DITHER, DSTW_OFFSET).
 *
 * NOTE(review): YSCALEYUV2RGB and WRITERGB16 are defined outside this section;
 * confirm their register contract against their definitions.
 */
898 static void RENAME(yuv2rgb565_2)(SwsInternal *c, const int16_t *buf[2],
899 const int16_t *ubuf[2], const int16_t *vbuf[2],
900 const int16_t *abuf[2], uint8_t *dest,
901 int dstW, int yalpha, int uvalpha, int y)
902 {
903 const int16_t *buf0 = buf[0], *buf1 = buf[1],
904 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
905
906 __asm__ volatile(
907 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* stash REG_b in a slot addressed from &c->redDither */
908 "mov %4, %%"FF_REG_b" \n\t" /* REG_b = dest */
909 "push %%"FF_REG_BP" \n\t" /* REG_BP is handed to the macros as the loop index */
910 YSCALEYUV2RGB(%%FF_REGBP, %5)
911 "pxor %%mm7, %%mm7 \n\t"
912 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
913 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" /* saturating byte add of the dither value */
914 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
915 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
916 WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
917 "pop %%"FF_REG_BP" \n\t"
918 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore REG_b from the slot */
919 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
920 "a" (&c->redDither)
921 NAMED_CONSTRAINTS_ADD(bF8,bFC)
922 );
923 }
924
/*
 * Asm fragment: blend two luma lines (operands %0/%1) and two chroma lines
 * (operands %2/%3) without colour conversion.
 *
 * index - register used as the loop counter; it is zeroed here.
 * c     - base register for the context-relative offsets.
 *
 * Before the loop the 64-bit values at CHR_MMX_FILTER_OFFSET+8 and
 * LUM_MMX_FILTER_OFFSET+8 are shifted right by 3 and written back, i.e. the
 * context memory is modified in place each time the fragment runs.
 *
 * Per iteration, each output is (line1 >> 7) + high16((line0 - line1) * coeff):
 *   mm1, mm7 - eight luma words (two groups of four)
 *   mm3      - chroma read at index
 *   mm4      - chroma read at index + UV_OFF_BYTE(c)
 *
 * The fragment opens label "1:" but contains no branch back to it.
 * NOTE(review): presumably the WRITE* macro that follows closes the loop --
 * confirm against its definition.
 */
925 #define REAL_YSCALEYUV2PACKED(index, c) \
926 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
927 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
928 "psraw $3, %%mm0 \n\t"\
929 "psraw $3, %%mm1 \n\t"\
930 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
931 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
932 "xor "#index", "#index" \n\t"\
933 ".p2align 4 \n\t"\
934 "1: \n\t"\
935 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[index] */\
936 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[index] */\
937 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
938 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[index+UV_OFF_BYTE] */\
939 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[index+UV_OFF_BYTE] */\
940 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
941 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[index] - uvbuf1[index] */\
942 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[index+UV_OFF_BYTE] - uvbuf1[index+UV_OFF_BYTE] */\
943 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
944 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[index] - uvbuf1[index]) * coeff >> 16 */\
945 "pmulhw %%mm0, %%mm5 \n\t" /* same for the index+UV_OFF_BYTE pair */\
946 "psraw $7, %%mm3 \n\t" /* uvbuf1[index] >> 7 */\
947 "psraw $7, %%mm4 \n\t" /* uvbuf1[index+UV_OFF_BYTE] >> 7 */\
948 "paddw %%mm2, %%mm3 \n\t" /* mm3 = (uvbuf1 >> 7) + scaled difference */\
949 "paddw %%mm5, %%mm4 \n\t" /* mm4 = same for the index+UV_OFF_BYTE pair */\
950 "movq (%0, "#index", 2), %%mm0 \n\t" /* buf0, first four words */\
951 "movq (%1, "#index", 2), %%mm1 \n\t" /* buf1, first four words */\
952 "movq 8(%0, "#index", 2), %%mm6 \n\t" /* buf0, next four words */\
953 "movq 8(%1, "#index", 2), %%mm7 \n\t" /* buf1, next four words */\
954 "psubw %%mm1, %%mm0 \n\t" /* buf0 - buf1 */\
955 "psubw %%mm7, %%mm6 \n\t" /* buf0 - buf1 */\
956 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0 - buf1) * coeff >> 16 */\
957 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0 - buf1) * coeff >> 16 */\
958 "psraw $7, %%mm1 \n\t" /* buf1 >> 7 */\
959 "psraw $7, %%mm7 \n\t" /* buf1 >> 7 */\
960 "paddw %%mm0, %%mm1 \n\t" /* mm1 = (buf1 >> 7) + scaled difference */\
961 "paddw %%mm6, %%mm7 \n\t" /* mm7 = (buf1 >> 7) + scaled difference */\
962
963 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
964
/**
 * Two-input-line ("_2") packed writer: runs YSCALEYUV2PACKED on the two luma
 * and two chroma lines and stores the result through WRITEYUY2.
 *
 * Only buf[0], buf[1], ubuf[0], ubuf[1], dest and &c->redDither are bound to
 * asm operands. vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced in
 * this C body; YSCALEYUV2PACKED reads its second chroma component at
 * UV_OFF_BYTE past the ubuf pointers and its coefficients relative to
 * &c->redDither, and the width is passed as DSTW_OFFSET.
 *
 * NOTE(review): WRITEYUY2 is defined outside this section; confirm its
 * register contract against its definition.
 */
965 static void RENAME(yuv2yuyv422_2)(SwsInternal *c, const int16_t *buf[2],
966 const int16_t *ubuf[2], const int16_t *vbuf[2],
967 const int16_t *abuf[2], uint8_t *dest,
968 int dstW, int yalpha, int uvalpha, int y)
969 {
970 const int16_t *buf0 = buf[0], *buf1 = buf[1],
971 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
972
973 __asm__ volatile(
974 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* stash REG_b in a slot addressed from &c->redDither */
975 "mov %4, %%"FF_REG_b" \n\t" /* REG_b = dest */
976 "push %%"FF_REG_BP" \n\t" /* REG_BP is handed to the macros as the loop index */
977 YSCALEYUV2PACKED(%%FF_REGBP, %5)
978 WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
979 "pop %%"FF_REG_BP" \n\t"
980 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore REG_b from the slot */
981 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
982 "a" (&c->redDither)
983 );
984 }
985
/*
 * Asm fragment: convert one luma line (operand %0) and one chroma line
 * (operand %2) to packed 8-bit B/G/R. Operands %1 and %3 are not read.
 *
 * index - register used as the loop counter; it is zeroed here.
 * c     - base register for the context-relative offsets and coefficients.
 *
 * Per iteration:
 *   - chroma is read at index (U) and at index + UV_OFF_BYTE(c) (V), shifted
 *     right by 4, and has U_OFFSET / V_OFFSET subtracted;
 *   - eight luma words are read, shifted right by 4, have Y_OFFSET subtracted
 *     and are multiplied by Y_COEFF;
 *   - the chroma contributions (UB/UG/VG/VR coefficients) are added to luma
 *     and packed with unsigned saturation.
 * Result registers after the final packuswb: mm2 = B, mm4 = G, mm5 = R.
 *
 * The fragment opens label "1:" but contains no branch back to it.
 * NOTE(review): presumably the WRITE* macro that follows closes the loop --
 * confirm against its definition.
 */
986 #define REAL_YSCALEYUV2RGB1(index, c) \
987 "xor "#index", "#index" \n\t"\
988 ".p2align 4 \n\t"\
989 "1: \n\t"\
990 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[index] */\
991 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
992 "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[index+UV_OFF_BYTE] */\
993 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
994 "psraw $4, %%mm3 \n\t" /* uvbuf0[index] >> 4 */\
995 "psraw $4, %%mm4 \n\t" /* uvbuf0[index+UV_OFF_BYTE] >> 4 */\
996 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
997 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
998 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
999 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
1000 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
1001 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
1002 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
1003 "movq (%0, "#index", 2), %%mm1 \n\t" /* buf0, first four words */\
1004 "movq 8(%0, "#index", 2), %%mm7 \n\t" /* buf0, next four words */\
1005 "psraw $4, %%mm1 \n\t" /* buf0 >> 4 */\
1006 "psraw $4, %%mm7 \n\t" /* buf0 >> 4 */\
1007 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
1008 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
1009 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
1010 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
1011 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
1012 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
1013 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
1014 "paddw %%mm3, %%mm4 \n\t"\
1015 "movq %%mm2, %%mm0 \n\t"\
1016 "movq %%mm5, %%mm6 \n\t"\
1017 "movq %%mm4, %%mm3 \n\t"\
1018 "punpcklwd %%mm2, %%mm2 \n\t"\
1019 "punpcklwd %%mm5, %%mm5 \n\t"\
1020 "punpcklwd %%mm4, %%mm4 \n\t"\
1021 "paddw %%mm1, %%mm2 \n\t"\
1022 "paddw %%mm1, %%mm5 \n\t"\
1023 "paddw %%mm1, %%mm4 \n\t"\
1024 "punpckhwd %%mm0, %%mm0 \n\t"\
1025 "punpckhwd %%mm6, %%mm6 \n\t"\
1026 "punpckhwd %%mm3, %%mm3 \n\t"\
1027 "paddw %%mm7, %%mm0 \n\t"\
1028 "paddw %%mm7, %%mm6 \n\t"\
1029 "paddw %%mm7, %%mm3 \n\t"\
1030 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
1031 "packuswb %%mm0, %%mm2 \n\t"\
1032 "packuswb %%mm6, %%mm5 \n\t"\
1033 "packuswb %%mm3, %%mm4 \n\t"\
1034
1035 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
1036
1037 // do vertical chrominance interpolation
/*
 * Asm fragment: convert one luma line (operand %0) and the sum of two chroma
 * lines (operands %2 and %3) to packed 8-bit B/G/R. Operand %1 is not read.
 *
 * index - register used as the loop counter; it is zeroed here.
 * c     - base register for the context-relative offsets and coefficients.
 *
 * The two chroma lines are added and then shifted right logically by 5, i.e.
 * one bit more than the single-line shift of 4, which halves the sum. The 16-bit
 * addition is what the FIXME below refers to. The remaining steps are: subtract
 * U_OFFSET / V_OFFSET, scale luma by Y_COEFF after subtracting Y_OFFSET, add
 * the chroma contributions and pack with unsigned saturation.
 * Result registers after the final packuswb: mm2 = B, mm4 = G, mm5 = R.
 *
 * The fragment opens label "1:" but contains no branch back to it.
 * NOTE(review): presumably the WRITE* macro that follows closes the loop --
 * confirm against its definition.
 */
1038 #define REAL_YSCALEYUV2RGB1b(index, c) \
1039 "xor "#index", "#index" \n\t"\
1040 ".p2align 4 \n\t"\
1041 "1: \n\t"\
1042 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[index] */\
1043 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[index] */\
1044 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1045 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[index+UV_OFF_BYTE] */\
1046 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[index+UV_OFF_BYTE] */\
1047 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1048 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[index] + uvbuf1[index] */\
1049 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[index+UV_OFF_BYTE] + uvbuf1[index+UV_OFF_BYTE] */\
1050 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
1051 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
1052 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
1053 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
1054 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
1055 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
1056 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
1057 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
1058 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
1059 "movq (%0, "#index", 2), %%mm1 \n\t" /* buf0, first four words */\
1060 "movq 8(%0, "#index", 2), %%mm7 \n\t" /* buf0, next four words */\
1061 "psraw $4, %%mm1 \n\t" /* buf0 >> 4 */\
1062 "psraw $4, %%mm7 \n\t" /* buf0 >> 4 */\
1063 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
1064 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
1065 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
1066 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
1067 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
1068 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
1069 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
1070 "paddw %%mm3, %%mm4 \n\t"\
1071 "movq %%mm2, %%mm0 \n\t"\
1072 "movq %%mm5, %%mm6 \n\t"\
1073 "movq %%mm4, %%mm3 \n\t"\
1074 "punpcklwd %%mm2, %%mm2 \n\t"\
1075 "punpcklwd %%mm5, %%mm5 \n\t"\
1076 "punpcklwd %%mm4, %%mm4 \n\t"\
1077 "paddw %%mm1, %%mm2 \n\t"\
1078 "paddw %%mm1, %%mm5 \n\t"\
1079 "paddw %%mm1, %%mm4 \n\t"\
1080 "punpckhwd %%mm0, %%mm0 \n\t"\
1081 "punpckhwd %%mm6, %%mm6 \n\t"\
1082 "punpckhwd %%mm3, %%mm3 \n\t"\
1083 "paddw %%mm7, %%mm0 \n\t"\
1084 "paddw %%mm7, %%mm6 \n\t"\
1085 "paddw %%mm7, %%mm3 \n\t"\
1086 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
1087 "packuswb %%mm0, %%mm2 \n\t"\
1088 "packuswb %%mm6, %%mm5 \n\t"\
1089 "packuswb %%mm3, %%mm4 \n\t"\
1090
1091 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
1092
/*
 * Asm fragment: load eight 16-bit words from operand %1 at (index * 2),
 * shift each right arithmetically by 7, and pack them with unsigned
 * saturation into eight bytes in mm7. mm1 is used as scratch.
 * The yuv2rgb32_1 call sites in this file bind operand %1 to abuf0 when they
 * use this fragment.
 */
1093 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
1094 "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
1095 "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
1096 "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
1097 "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
1098 "packuswb %%mm1, %%mm7 \n\t"
1099 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
1100
1101 /**
1102 * YV12 to RGB without scaling or interpolating
 *
 * Single-luma-line ("_1") writer that stores through WRITEBGR32.
 *
 * Chroma handling depends on uvalpha:
 *   - uvalpha < 2048: YSCALEYUV2RGB1 is used, which reads only ubuf[0];
 *   - otherwise: YSCALEYUV2RGB1b is used, which sums ubuf[0] and ubuf[1].
 * Alpha handling depends on CONFIG_SWSCALE_ALPHA && c->needAlpha:
 *   - true: operand "d" is abuf0 and mm7 is filled by YSCALEYUV2RGB1_ALPHA;
 *   - false: operand "d" is buf1 (an alias of buf0) and mm7 is set to all
 *     ones with pcmpeqd.
 *
 * vbuf, dstW and y are not referenced in this C body; the asm reads the width
 * as DSTW_OFFSET relative to &c->redDither.
 * NOTE(review): WRITEBGR32 is defined outside this section; confirm its
 * register contract against its definition.
1103 */
1104 static void RENAME(yuv2rgb32_1)(SwsInternal *c, const int16_t *buf0,
1105 const int16_t *ubuf[2], const int16_t *vbuf[2],
1106 const int16_t *abuf0, uint8_t *dest,
1107 int dstW, int uvalpha, int y)
1108 {
1109 const int16_t *ubuf0 = ubuf[0];
1110 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1111
1112 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1113 const int16_t *ubuf1 = ubuf[0];
1114 if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
1115 __asm__ volatile(
1116 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* stash REG_b in a slot addressed from &c->redDither */
1117 "mov %4, %%"FF_REG_b" \n\t" /* REG_b = dest */
1118 "push %%"FF_REG_BP" \n\t" /* REG_BP is handed to the macros as the loop index */
1119 YSCALEYUV2RGB1(%%FF_REGBP, %5)
1120 YSCALEYUV2RGB1_ALPHA(%%FF_REGBP)
1121 WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1122 "pop %%"FF_REG_BP" \n\t"
1123 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore REG_b from the slot */
1124 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1125 "a" (&c->redDither)
1126 );
1127 } else {
1128 __asm__ volatile(
1129 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1130 "mov %4, %%"FF_REG_b" \n\t"
1131 "push %%"FF_REG_BP" \n\t"
1132 YSCALEYUV2RGB1(%%FF_REGBP, %5)
1133 "pcmpeqd %%mm7, %%mm7 \n\t" /* mm7 = all ones */
1134 WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1135 "pop %%"FF_REG_BP" \n\t"
1136 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1137 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1138 "a" (&c->redDither)
1139 );
1140 }
1141 } else {
1142 const int16_t *ubuf1 = ubuf[1];
1143 if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
1144 __asm__ volatile(
1145 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1146 "mov %4, %%"FF_REG_b" \n\t"
1147 "push %%"FF_REG_BP" \n\t"
1148 YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1149 YSCALEYUV2RGB1_ALPHA(%%FF_REGBP)
1150 WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1151 "pop %%"FF_REG_BP" \n\t"
1152 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1153 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1154 "a" (&c->redDither)
1155 );
1156 } else {
1157 __asm__ volatile(
1158 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1159 "mov %4, %%"FF_REG_b" \n\t"
1160 "push %%"FF_REG_BP" \n\t"
1161 YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1162 "pcmpeqd %%mm7, %%mm7 \n\t" /* mm7 = all ones */
1163 WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1164 "pop %%"FF_REG_BP" \n\t"
1165 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1166 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1167 "a" (&c->redDither)
1168 );
1169 }
1170 }
1171 }
1172
/**
 * Single-luma-line ("_1") writer that stores through WRITEBGR24.
 *
 * Chroma handling depends on uvalpha:
 *   - uvalpha < 2048: YSCALEYUV2RGB1 is used, which reads only ubuf[0]
 *     (ubuf1 is bound to ubuf[0] as well);
 *   - otherwise: YSCALEYUV2RGB1b is used, which sums ubuf[0] and ubuf[1].
 *
 * vbuf, abuf0, dstW and y are not referenced in this C body; the asm reads
 * the width as DSTW_OFFSET relative to &c->redDither.
 * NOTE(review): WRITEBGR24 is defined outside this section; confirm its
 * register contract against its definition.
 */
1173 static void RENAME(yuv2bgr24_1)(SwsInternal *c, const int16_t *buf0,
1174 const int16_t *ubuf[2], const int16_t *vbuf[2],
1175 const int16_t *abuf0, uint8_t *dest,
1176 int dstW, int uvalpha, int y)
1177 {
1178 const int16_t *ubuf0 = ubuf[0];
1179 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1180
1181 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1182 const int16_t *ubuf1 = ubuf[0];
1183 __asm__ volatile(
1184 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* stash REG_b in a slot addressed from &c->redDither */
1185 "mov %4, %%"FF_REG_b" \n\t" /* REG_b = dest */
1186 "push %%"FF_REG_BP" \n\t" /* REG_BP is handed to the macros as the loop index */
1187 YSCALEYUV2RGB1(%%FF_REGBP, %5)
1188 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0 */
1189 WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1190 "pop %%"FF_REG_BP" \n\t"
1191 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore REG_b from the slot */
1192 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1193 "a" (&c->redDither)
1194 NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
1195 );
1196 } else {
1197 const int16_t *ubuf1 = ubuf[1];
1198 __asm__ volatile(
1199 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1200 "mov %4, %%"FF_REG_b" \n\t"
1201 "push %%"FF_REG_BP" \n\t"
1202 YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1203 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0 */
1204 WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1205 "pop %%"FF_REG_BP" \n\t"
1206 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1207 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1208 "a" (&c->redDither)
1209 NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
1210 );
1211 }
1212 }
1213
/**
 * Single-luma-line ("_1") writer that adds the per-channel dither values with
 * unsigned saturation and stores through WRITERGB15.
 *
 * Chroma handling depends on uvalpha:
 *   - uvalpha < 2048: YSCALEYUV2RGB1 is used, which reads only ubuf[0]
 *     (ubuf1 is bound to ubuf[0] as well);
 *   - otherwise: YSCALEYUV2RGB1b is used, which sums ubuf[0] and ubuf[1].
 *
 * vbuf, abuf0, dstW and y are not referenced in this C body; the asm reads
 * the dither values and the width relative to &c->redDither.
 * NOTE(review): WRITERGB15 is defined outside this section; confirm its
 * register contract against its definition.
 */
1214 static void RENAME(yuv2rgb555_1)(SwsInternal *c, const int16_t *buf0,
1215 const int16_t *ubuf[2], const int16_t *vbuf[2],
1216 const int16_t *abuf0, uint8_t *dest,
1217 int dstW, int uvalpha, int y)
1218 {
1219 const int16_t *ubuf0 = ubuf[0];
1220 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1221
1222 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1223 const int16_t *ubuf1 = ubuf[0];
1224 __asm__ volatile(
1225 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* stash REG_b in a slot addressed from &c->redDither */
1226 "mov %4, %%"FF_REG_b" \n\t" /* REG_b = dest */
1227 "push %%"FF_REG_BP" \n\t" /* REG_BP is handed to the macros as the loop index */
1228 YSCALEYUV2RGB1(%%FF_REGBP, %5)
1229 "pxor %%mm7, %%mm7 \n\t"
1230 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1231 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" /* saturating byte add of the dither value */
1232 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1233 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1234 WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1235 "pop %%"FF_REG_BP" \n\t"
1236 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore REG_b from the slot */
1237 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1238 "a" (&c->redDither)
1239 NAMED_CONSTRAINTS_ADD(bF8)
1240 );
1241 } else {
1242 const int16_t *ubuf1 = ubuf[1];
1243 __asm__ volatile(
1244 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1245 "mov %4, %%"FF_REG_b" \n\t"
1246 "push %%"FF_REG_BP" \n\t"
1247 YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1248 "pxor %%mm7, %%mm7 \n\t"
1249 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1250 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1251 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1252 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1253 WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1254 "pop %%"FF_REG_BP" \n\t"
1255 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1256 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1257 "a" (&c->redDither)
1258 NAMED_CONSTRAINTS_ADD(bF8)
1259 );
1260 }
1261 }
1262
/**
 * Single-luma-line ("_1") writer that adds the per-channel dither values with
 * unsigned saturation and stores through WRITERGB16.
 *
 * Chroma handling depends on uvalpha:
 *   - uvalpha < 2048: YSCALEYUV2RGB1 is used, which reads only ubuf[0]
 *     (ubuf1 is bound to ubuf[0] as well);
 *   - otherwise: YSCALEYUV2RGB1b is used, which sums ubuf[0] and ubuf[1].
 *
 * vbuf, abuf0, dstW and y are not referenced in this C body; the asm reads
 * the dither values and the width relative to &c->redDither.
 * NOTE(review): WRITERGB16 is defined outside this section; confirm its
 * register contract against its definition.
 */
1263 static void RENAME(yuv2rgb565_1)(SwsInternal *c, const int16_t *buf0,
1264 const int16_t *ubuf[2], const int16_t *vbuf[2],
1265 const int16_t *abuf0, uint8_t *dest,
1266 int dstW, int uvalpha, int y)
1267 {
1268 const int16_t *ubuf0 = ubuf[0];
1269 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1270
1271 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1272 const int16_t *ubuf1 = ubuf[0];
1273 __asm__ volatile(
1274 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* stash REG_b in a slot addressed from &c->redDither */
1275 "mov %4, %%"FF_REG_b" \n\t" /* REG_b = dest */
1276 "push %%"FF_REG_BP" \n\t" /* REG_BP is handed to the macros as the loop index */
1277 YSCALEYUV2RGB1(%%FF_REGBP, %5)
1278 "pxor %%mm7, %%mm7 \n\t"
1279 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1280 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" /* saturating byte add of the dither value */
1281 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1282 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1283 WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1284 "pop %%"FF_REG_BP" \n\t"
1285 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore REG_b from the slot */
1286 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1287 "a" (&c->redDither)
1288 NAMED_CONSTRAINTS_ADD(bF8,bFC)
1289 );
1290 } else {
1291 const int16_t *ubuf1 = ubuf[1];
1292 __asm__ volatile(
1293 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1294 "mov %4, %%"FF_REG_b" \n\t"
1295 "push %%"FF_REG_BP" \n\t"
1296 YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1297 "pxor %%mm7, %%mm7 \n\t"
1298 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1299 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1300 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1301 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1302 WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1303 "pop %%"FF_REG_BP" \n\t"
1304 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1305 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1306 "a" (&c->redDither)
1307 NAMED_CONSTRAINTS_ADD(bF8,bFC)
1308 );
1309 }
1310 }
1311
/*
 * Asm fragment: load one luma line (operand %0) and one chroma line
 * (operand %2) and reduce both by an arithmetic right shift of 7, without
 * colour conversion. Operands %1 and %3 are not read.
 *
 * index - register used as the loop counter; it is zeroed here.
 * c     - base register for the context-relative UV_OFF_BYTE value.
 *
 * Result registers per iteration:
 *   mm1, mm7 - eight luma words (two groups of four)
 *   mm3      - chroma read at index
 *   mm4      - chroma read at index + UV_OFF_BYTE(c)
 *
 * The fragment opens label "1:" but contains no branch back to it.
 * NOTE(review): presumably the WRITE* macro that follows closes the loop --
 * confirm against its definition.
 */
1312 #define REAL_YSCALEYUV2PACKED1(index, c) \
1313 "xor "#index", "#index" \n\t"\
1314 ".p2align 4 \n\t"\
1315 "1: \n\t"\
1316 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[index] */\
1317 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1318 "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[index+UV_OFF_BYTE] */\
1319 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1320 "psraw $7, %%mm3 \n\t" \
1321 "psraw $7, %%mm4 \n\t" \
1322 "movq (%0, "#index", 2), %%mm1 \n\t" /* buf0, first four words */\
1323 "movq 8(%0, "#index", 2), %%mm7 \n\t" /* buf0, next four words */\
1324 "psraw $7, %%mm1 \n\t" \
1325 "psraw $7, %%mm7 \n\t" \
1326
1327 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
1328
/*
 * Asm fragment: load one luma line (operand %0) and the sum of two chroma
 * lines (operands %2 and %3), without colour conversion. Operand %1 is not
 * read.
 *
 * index - register used as the loop counter; it is zeroed here.
 * c     - base register for the context-relative UV_OFF_BYTE value.
 *
 * The chroma sum is shifted right logically by 8, one bit more than the
 * luma shift of 7, which halves the sum of the two lines.
 *
 * Result registers per iteration:
 *   mm1, mm7 - eight luma words, each >> 7
 *   mm3      - summed chroma read at index, >> 8
 *   mm4      - summed chroma read at index + UV_OFF_BYTE(c), >> 8
 *
 * The fragment opens label "1:" but contains no branch back to it.
 * NOTE(review): presumably the WRITE* macro that follows closes the loop --
 * confirm against its definition.
 */
1329 #define REAL_YSCALEYUV2PACKED1b(index, c) \
1330 "xor "#index", "#index" \n\t"\
1331 ".p2align 4 \n\t"\
1332 "1: \n\t"\
1333 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[index] */\
1334 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[index] */\
1335 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1336 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[index+UV_OFF_BYTE] */\
1337 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[index+UV_OFF_BYTE] */\
1338 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1339 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[index] + uvbuf1[index] */\
1340 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[index+UV_OFF_BYTE] + uvbuf1[index+UV_OFF_BYTE] */\
1341 "psrlw $8, %%mm3 \n\t" \
1342 "psrlw $8, %%mm4 \n\t" \
1343 "movq (%0, "#index", 2), %%mm1 \n\t" /* buf0, first four words */\
1344 "movq 8(%0, "#index", 2), %%mm7 \n\t" /* buf0, next four words */\
1345 "psraw $7, %%mm1 \n\t" \
1346 "psraw $7, %%mm7 \n\t"
1347 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
1348
/**
 * Single-luma-line ("_1") writer that stores through WRITEYUY2 without
 * colour conversion.
 *
 * Chroma handling depends on uvalpha:
 *   - uvalpha < 2048: YSCALEYUV2PACKED1 is used, which reads only ubuf[0]
 *     (ubuf1 is bound to ubuf[0] as well);
 *   - otherwise: YSCALEYUV2PACKED1b is used, which sums ubuf[0] and ubuf[1].
 *
 * vbuf, abuf0, dstW and y are not referenced in this C body; the asm reads
 * the width as DSTW_OFFSET relative to &c->redDither.
 * NOTE(review): WRITEYUY2 is defined outside this section; confirm its
 * register contract against its definition.
 */
1349 static void RENAME(yuv2yuyv422_1)(SwsInternal *c, const int16_t *buf0,
1350 const int16_t *ubuf[2], const int16_t *vbuf[2],
1351 const int16_t *abuf0, uint8_t *dest,
1352 int dstW, int uvalpha, int y)
1353 {
1354 const int16_t *ubuf0 = ubuf[0];
1355 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1356
1357 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1358 const int16_t *ubuf1 = ubuf[0];
1359 __asm__ volatile(
1360 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* stash REG_b in a slot addressed from &c->redDither */
1361 "mov %4, %%"FF_REG_b" \n\t" /* REG_b = dest */
1362 "push %%"FF_REG_BP" \n\t" /* REG_BP is handed to the macros as the loop index */
1363 YSCALEYUV2PACKED1(%%FF_REGBP, %5)
1364 WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1365 "pop %%"FF_REG_BP" \n\t"
1366 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore REG_b from the slot */
1367 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1368 "a" (&c->redDither)
1369 );
1370 } else {
1371 const int16_t *ubuf1 = ubuf[1];
1372 __asm__ volatile(
1373 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1374 "mov %4, %%"FF_REG_b" \n\t"
1375 "push %%"FF_REG_BP" \n\t"
1376 YSCALEYUV2PACKED1b(%%FF_REGBP, %5)
1377 WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1378 "pop %%"FF_REG_BP" \n\t"
1379 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1380 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1381 "a" (&c->redDither)
1382 );
1383 }
1384 }
/**
 * Install this template's inline-asm output writers and fast horizontal
 * scalers into the context.
 *
 * - c->use_mmx_vfilter is cleared first and only set to 1 on the path where
 *   SWS_ACCURATE_RND is not set.
 * - The packed-writer section is skipped entirely when the destination format
 *   is accepted by is16BPS(), isNBPS() or isSemiPlanarYUV(), is
 *   AV_PIX_FMT_GRAYF32BE/LE, or when SWS_BITEXACT is set.
 * - yuv2packedX, yuv2packed1 and yuv2packed2 are assigned only when
 *   SWS_FULL_CHR_H_INT is not set, and only for the destination formats
 *   listed in the switches; any other format leaves them untouched.
 * - When srcBpc == 8 and dstBpc <= 14, hyscale_fast / hcscale_fast are set to
 *   the MMXEXT routines if SWS_FAST_BILINEAR is set and c->canMMXEXTBeUsed is
 *   non-zero, and to NULL otherwise.
 */
1385 11799 static av_cold void RENAME(sws_init_swscale)(SwsInternal *c)
1386 {
1387 11799 enum AVPixelFormat dstFormat = c->opts.dst_format;
1388
1389 11799 c->use_mmx_vfilter= 0;
1390
6/6
✓ Branch 1 taken 6929 times.
✓ Branch 2 taken 4870 times.
✓ Branch 4 taken 3575 times.
✓ Branch 5 taken 3354 times.
✓ Branch 7 taken 3552 times.
✓ Branch 8 taken 23 times.
11799 if (!is16BPS(dstFormat) && !isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat)
1391
2/4
✓ Branch 0 taken 3552 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 3552 times.
✗ Branch 3 not taken.
3552 && dstFormat != AV_PIX_FMT_GRAYF32BE && dstFormat != AV_PIX_FMT_GRAYF32LE
1392
2/2
✓ Branch 0 taken 2507 times.
✓ Branch 1 taken 1045 times.
3552 && !(c->opts.flags & SWS_BITEXACT)) {
1393
2/2
✓ Branch 0 taken 22 times.
✓ Branch 1 taken 2485 times.
2507 if (c->opts.flags & SWS_ACCURATE_RND) {
1394
1/2
✓ Branch 0 taken 22 times.
✗ Branch 1 not taken.
22 if (!(c->opts.flags & SWS_FULL_CHR_H_INT)) {
1395
1/6
✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 22 times.
/* Accurate-rounding ("_ar") vertical writers. */
/* NOTE(review): there is no AV_PIX_FMT_BGR32 case here, unlike the switch on the non-accurate path below -- confirm this is intentional. */
22 switch (c->opts.dst_format) {
1396 case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;
1397 #if HAVE_6REGS
1398 case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break;
1399 #endif
1400 case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break;
1401 case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break;
1402 case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
1403 22 default: break;
1404 }
1405 }
1406 } else {
1407 2485 c->use_mmx_vfilter= 1;
1408
2/2
✓ Branch 0 taken 859 times.
✓ Branch 1 taken 1626 times.
2485 if (!(c->opts.flags & SWS_FULL_CHR_H_INT)) {
1409
1/7
✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 859 times.
/* Vertical writers used when SWS_ACCURATE_RND is not set. */
859 switch (c->opts.dst_format) {
1410 case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
1411 case AV_PIX_FMT_BGR32: c->yuv2packedX = RENAME(yuv2bgr32_X); break;
1412 #if HAVE_6REGS
1413 case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break;
1414 #endif
1415 case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break;
1416 case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break;
1417 case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
1418 859 default: break;
1419 }
1420 }
1421 }
1422
2/2
✓ Branch 0 taken 881 times.
✓ Branch 1 taken 1626 times.
2507 if (!(c->opts.flags & SWS_FULL_CHR_H_INT)) {
1423
1/6
✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 881 times.
/* One-line and two-line writers; assigned on both the accurate and non-accurate paths. */
/* NOTE(review): the AV_PIX_FMT_BGR24 case below is not wrapped in HAVE_6REGS, unlike the yuv2packedX switches above -- confirm this is intentional. */
881 switch (c->opts.dst_format) {
1424 case AV_PIX_FMT_RGB32:
1425 c->yuv2packed1 = RENAME(yuv2rgb32_1);
1426 c->yuv2packed2 = RENAME(yuv2rgb32_2);
1427 break;
1428 case AV_PIX_FMT_BGR24:
1429 c->yuv2packed1 = RENAME(yuv2bgr24_1);
1430 c->yuv2packed2 = RENAME(yuv2bgr24_2);
1431 break;
1432 case AV_PIX_FMT_RGB555:
1433 c->yuv2packed1 = RENAME(yuv2rgb555_1);
1434 c->yuv2packed2 = RENAME(yuv2rgb555_2);
1435 break;
1436 case AV_PIX_FMT_RGB565:
1437 c->yuv2packed1 = RENAME(yuv2rgb565_1);
1438 c->yuv2packed2 = RENAME(yuv2rgb565_2);
1439 break;
1440 case AV_PIX_FMT_YUYV422:
1441 c->yuv2packed1 = RENAME(yuv2yuyv422_1);
1442 c->yuv2packed2 = RENAME(yuv2yuyv422_2);
1443 break;
1444 881 default:
1445 881 break;
1446 }
1447 }
1448 }
1449
1450
4/4
✓ Branch 0 taken 11485 times.
✓ Branch 1 taken 314 times.
✓ Branch 2 taken 10780 times.
✓ Branch 3 taken 705 times.
11799 if (c->srcBpc == 8 && c->dstBpc <= 14) {
1451 // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
1452
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 10780 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
10780 if (c->opts.flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
1453 c->hyscale_fast = ff_hyscale_fast_mmxext;
1454 c->hcscale_fast = ff_hcscale_fast_mmxext;
1455 } else {
/* Clearing the pointers means no fast horizontal scaler is installed by this function. */
1456 10780 c->hyscale_fast = NULL;
1457 10780 c->hcscale_fast = NULL;
1458 }
1459 }
1460 11799 }
1461