FFmpeg coverage


Directory: ../../../ffmpeg/
File: src/libswscale/x86/ops.c
Date: 2025-09-25 04:14:09
Exec Total Coverage
Lines: 128 137 93.4%
Functions: 10 10 100.0%
Branches: 106 150 70.7%

Line Branch Exec Source
1 /**
2 * Copyright (C) 2025 Niklas Haas
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include <float.h>
22
23 #include "libavutil/avassert.h"
24 #include "libavutil/mem.h"
25
26 #include "../ops_chain.h"
27
28 #define DECL_ENTRY(TYPE, NAME, ...) \
29 static const SwsOpEntry op_##NAME = { \
30 .type = SWS_PIXEL_##TYPE, \
31 __VA_ARGS__ \
32 }
33
34 #define DECL_ASM(TYPE, NAME, ...) \
35 void ff_##NAME(void); \
36 DECL_ENTRY(TYPE, NAME, \
37 .func = ff_##NAME, \
38 __VA_ARGS__)
39
40 #define DECL_PATTERN(TYPE, NAME, X, Y, Z, W, ...) \
41 DECL_ASM(TYPE, p##X##Y##Z##W##_##NAME, \
42 .unused = { !X, !Y, !Z, !W }, \
43 __VA_ARGS__ \
44 )
45
46 #define REF_PATTERN(NAME, X, Y, Z, W) \
47 &op_p##X##Y##Z##W##_##NAME
48
49 #define DECL_COMMON_PATTERNS(TYPE, NAME, ...) \
50 DECL_PATTERN(TYPE, NAME, 1, 0, 0, 0, __VA_ARGS__); \
51 DECL_PATTERN(TYPE, NAME, 1, 0, 0, 1, __VA_ARGS__); \
52 DECL_PATTERN(TYPE, NAME, 1, 1, 1, 0, __VA_ARGS__); \
53 DECL_PATTERN(TYPE, NAME, 1, 1, 1, 1, __VA_ARGS__) \
54
55 #define REF_COMMON_PATTERNS(NAME) \
56 REF_PATTERN(NAME, 1, 0, 0, 0), \
57 REF_PATTERN(NAME, 1, 0, 0, 1), \
58 REF_PATTERN(NAME, 1, 1, 1, 0), \
59 REF_PATTERN(NAME, 1, 1, 1, 1)
60
61 #define DECL_RW(EXT, TYPE, NAME, OP, ELEMS, PACKED, FRAC) \
62 DECL_ASM(TYPE, NAME##ELEMS##EXT, \
63 .op = SWS_OP_##OP, \
64 .rw = { .elems = ELEMS, .packed = PACKED, .frac = FRAC }, \
65 );
66
67 #define DECL_PACKED_RW(EXT, DEPTH) \
68 DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 2, true, 0) \
69 DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 3, true, 0) \
70 DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 4, true, 0) \
71 DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 2, true, 0) \
72 DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 3, true, 0) \
73 DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 4, true, 0) \
74
75 #define DECL_PACK_UNPACK(EXT, TYPE, X, Y, Z, W) \
76 DECL_ASM(TYPE, pack_##X##Y##Z##W##EXT, \
77 .op = SWS_OP_PACK, \
78 .pack.pattern = {X, Y, Z, W}, \
79 ); \
80 \
81 DECL_ASM(TYPE, unpack_##X##Y##Z##W##EXT, \
82 .op = SWS_OP_UNPACK, \
83 .pack.pattern = {X, Y, Z, W}, \
84 ); \
85
86 6 static int setup_swap_bytes(const SwsOp *op, SwsOpPriv *out)
87 {
88 6 const int mask = ff_sws_pixel_type_size(op->type) - 1;
89
2/2
✓ Branch 0 taken 96 times.
✓ Branch 1 taken 6 times.
102 for (int i = 0; i < 16; i++)
90 96 out->u8[i] = (i & ~mask) | (mask - (i & mask));
91 6 return 0;
92 }
93
94 #define DECL_SWAP_BYTES(EXT, TYPE, X, Y, Z, W) \
95 DECL_ENTRY(TYPE, p##X##Y##Z##W##_swap_bytes_##TYPE##EXT, \
96 .op = SWS_OP_SWAP_BYTES, \
97 .unused = { !X, !Y, !Z, !W }, \
98 .func = ff_p##X##Y##Z##W##_shuffle##EXT, \
99 .setup = setup_swap_bytes, \
100 );
101
102 #define DECL_CLEAR_ALPHA(EXT, IDX) \
103 DECL_ASM(U8, clear_alpha##IDX##EXT, \
104 .op = SWS_OP_CLEAR, \
105 .clear_value = -1, \
106 .unused[IDX] = true, \
107 ); \
108
109 #define DECL_CLEAR_ZERO(EXT, IDX) \
110 DECL_ASM(U8, clear_zero##IDX##EXT, \
111 .op = SWS_OP_CLEAR, \
112 .clear_value = 0, \
113 .unused[IDX] = true, \
114 );
115
116 84 static int setup_clear(const SwsOp *op, SwsOpPriv *out)
117 {
118
2/2
✓ Branch 0 taken 336 times.
✓ Branch 1 taken 84 times.
420 for (int i = 0; i < 4; i++)
119 336 out->u32[i] = (uint32_t) op->c.q4[i].num;
120 84 return 0;
121 }
122
123 #define DECL_CLEAR(EXT, X, Y, Z, W) \
124 DECL_PATTERN(U8, clear##EXT, X, Y, Z, W, \
125 .op = SWS_OP_CLEAR, \
126 .setup = setup_clear, \
127 .flexible = true, \
128 );
129
130 #define DECL_SWIZZLE(EXT, X, Y, Z, W) \
131 DECL_ASM(U8, swizzle_##X##Y##Z##W##EXT, \
132 .op = SWS_OP_SWIZZLE, \
133 .swizzle.in = {X, Y, Z, W}, \
134 );
135
136 #define DECL_CONVERT(EXT, FROM, TO) \
137 DECL_COMMON_PATTERNS(FROM, convert_##FROM##_##TO##EXT, \
138 .op = SWS_OP_CONVERT, \
139 .convert.to = SWS_PIXEL_##TO, \
140 );
141
142 #define DECL_EXPAND(EXT, FROM, TO) \
143 DECL_COMMON_PATTERNS(FROM, expand_##FROM##_##TO##EXT, \
144 .op = SWS_OP_CONVERT, \
145 .convert.to = SWS_PIXEL_##TO, \
146 .convert.expand = true, \
147 );
148
149 64 static int setup_shift(const SwsOp *op, SwsOpPriv *out)
150 {
151 64 out->u16[0] = op->c.u;
152 64 return 0;
153 }
154
155 #define DECL_SHIFT16(EXT) \
156 DECL_COMMON_PATTERNS(U16, lshift16##EXT, \
157 .op = SWS_OP_LSHIFT, \
158 .setup = setup_shift, \
159 .flexible = true, \
160 ); \
161 \
162 DECL_COMMON_PATTERNS(U16, rshift16##EXT, \
163 .op = SWS_OP_RSHIFT, \
164 .setup = setup_shift, \
165 .flexible = true, \
166 );
167
168 #define DECL_MIN_MAX(EXT) \
169 DECL_COMMON_PATTERNS(F32, min##EXT, \
170 .op = SWS_OP_MIN, \
171 .setup = ff_sws_setup_q4, \
172 .flexible = true, \
173 ); \
174 \
175 DECL_COMMON_PATTERNS(F32, max##EXT, \
176 .op = SWS_OP_MAX, \
177 .setup = ff_sws_setup_q4, \
178 .flexible = true, \
179 );
180
181 #define DECL_SCALE(EXT) \
182 DECL_COMMON_PATTERNS(F32, scale##EXT, \
183 .op = SWS_OP_SCALE, \
184 .setup = ff_sws_setup_q, \
185 );
186
187 /* 2x2 matrix fits inside SwsOpPriv directly; save an indirection in this case */
188 static_assert(sizeof(SwsOpPriv) >= sizeof(float[2][2]), "2x2 dither matrix too large");
189 36 static int setup_dither(const SwsOp *op, SwsOpPriv *out)
190 {
191 36 const int size = 1 << op->dither.size_log2;
192 36 float *matrix = out->f32;
193
2/2
✓ Branch 0 taken 28 times.
✓ Branch 1 taken 8 times.
36 if (size > 2) {
194 28 matrix = out->ptr = av_mallocz(size * size * sizeof(*matrix));
195
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 28 times.
28 if (!matrix)
196 return AVERROR(ENOMEM);
197 }
198
199
2/2
✓ Branch 0 taken 349524 times.
✓ Branch 1 taken 36 times.
349560 for (int i = 0; i < size * size; i++)
200 349524 matrix[i] = (float) op->dither.matrix[i].num / op->dither.matrix[i].den;
201
202 36 return 0;
203 }
204
205 #define DECL_DITHER(EXT, SIZE) \
206 DECL_COMMON_PATTERNS(F32, dither##SIZE##EXT, \
207 .op = SWS_OP_DITHER, \
208 .setup = setup_dither, \
209 .free = (1 << SIZE) > 2 ? av_free : NULL, \
210 .dither_size = SIZE, \
211 );
212
213 21 static int setup_linear(const SwsOp *op, SwsOpPriv *out)
214 {
215 21 float *matrix = out->ptr = av_mallocz(sizeof(float[4][5]));
216
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 21 times.
21 if (!matrix)
217 return AVERROR(ENOMEM);
218
219
2/2
✓ Branch 0 taken 84 times.
✓ Branch 1 taken 21 times.
105 for (int y = 0; y < 4; y++) {
220
2/2
✓ Branch 0 taken 420 times.
✓ Branch 1 taken 84 times.
504 for (int x = 0; x < 5; x++)
221 420 matrix[y * 5 + x] = (float) op->lin.m[y][x].num / op->lin.m[y][x].den;
222 }
223
224 21 return 0;
225 }
226
227 #define DECL_LINEAR(EXT, NAME, MASK) \
228 DECL_ASM(F32, NAME##EXT, \
229 .op = SWS_OP_LINEAR, \
230 .setup = setup_linear, \
231 .free = av_free, \
232 .linear_mask = (MASK), \
233 );
234
235 #define DECL_FUNCS_8(SIZE, EXT, FLAG) \
236 DECL_RW(EXT, U8, read_planar, READ, 1, false, 0) \
237 DECL_RW(EXT, U8, read_planar, READ, 2, false, 0) \
238 DECL_RW(EXT, U8, read_planar, READ, 3, false, 0) \
239 DECL_RW(EXT, U8, read_planar, READ, 4, false, 0) \
240 DECL_RW(EXT, U8, write_planar, WRITE, 1, false, 0) \
241 DECL_RW(EXT, U8, write_planar, WRITE, 2, false, 0) \
242 DECL_RW(EXT, U8, write_planar, WRITE, 3, false, 0) \
243 DECL_RW(EXT, U8, write_planar, WRITE, 4, false, 0) \
244 DECL_RW(EXT, U8, read_nibbles, READ, 1, false, 1) \
245 DECL_RW(EXT, U8, read_bits, READ, 1, false, 3) \
246 DECL_RW(EXT, U8, write_bits, WRITE, 1, false, 3) \
247 DECL_PACKED_RW(EXT, 8) \
248 DECL_PACK_UNPACK(EXT, U8, 1, 2, 1, 0) \
249 DECL_PACK_UNPACK(EXT, U8, 3, 3, 2, 0) \
250 DECL_PACK_UNPACK(EXT, U8, 2, 3, 3, 0) \
251 void ff_p1000_shuffle##EXT(void); \
252 void ff_p1001_shuffle##EXT(void); \
253 void ff_p1110_shuffle##EXT(void); \
254 void ff_p1111_shuffle##EXT(void); \
255 DECL_SWIZZLE(EXT, 3, 0, 1, 2) \
256 DECL_SWIZZLE(EXT, 3, 0, 2, 1) \
257 DECL_SWIZZLE(EXT, 2, 1, 0, 3) \
258 DECL_SWIZZLE(EXT, 3, 2, 1, 0) \
259 DECL_SWIZZLE(EXT, 3, 1, 0, 2) \
260 DECL_SWIZZLE(EXT, 3, 2, 0, 1) \
261 DECL_SWIZZLE(EXT, 1, 2, 0, 3) \
262 DECL_SWIZZLE(EXT, 1, 0, 2, 3) \
263 DECL_SWIZZLE(EXT, 2, 0, 1, 3) \
264 DECL_SWIZZLE(EXT, 2, 3, 1, 0) \
265 DECL_SWIZZLE(EXT, 2, 1, 3, 0) \
266 DECL_SWIZZLE(EXT, 1, 2, 3, 0) \
267 DECL_SWIZZLE(EXT, 1, 3, 2, 0) \
268 DECL_SWIZZLE(EXT, 0, 2, 1, 3) \
269 DECL_SWIZZLE(EXT, 0, 2, 3, 1) \
270 DECL_SWIZZLE(EXT, 0, 3, 1, 2) \
271 DECL_SWIZZLE(EXT, 3, 1, 2, 0) \
272 DECL_SWIZZLE(EXT, 0, 3, 2, 1) \
273 DECL_SWIZZLE(EXT, 0, 0, 0, 3) \
274 DECL_SWIZZLE(EXT, 3, 0, 0, 0) \
275 DECL_SWIZZLE(EXT, 0, 0, 0, 1) \
276 DECL_SWIZZLE(EXT, 1, 0, 0, 0) \
277 DECL_CLEAR_ALPHA(EXT, 0) \
278 DECL_CLEAR_ALPHA(EXT, 1) \
279 DECL_CLEAR_ALPHA(EXT, 3) \
280 DECL_CLEAR_ZERO(EXT, 0) \
281 DECL_CLEAR_ZERO(EXT, 1) \
282 DECL_CLEAR_ZERO(EXT, 3) \
283 DECL_CLEAR(EXT, 1, 1, 1, 0) \
284 DECL_CLEAR(EXT, 0, 1, 1, 1) \
285 DECL_CLEAR(EXT, 0, 0, 1, 1) \
286 DECL_CLEAR(EXT, 1, 0, 0, 1) \
287 DECL_CLEAR(EXT, 1, 1, 0, 0) \
288 DECL_CLEAR(EXT, 0, 1, 0, 1) \
289 DECL_CLEAR(EXT, 1, 0, 1, 0) \
290 DECL_CLEAR(EXT, 1, 0, 0, 0) \
291 DECL_CLEAR(EXT, 0, 1, 0, 0) \
292 DECL_CLEAR(EXT, 0, 0, 1, 0) \
293 \
294 static const SwsOpTable ops8##EXT = { \
295 .cpu_flags = AV_CPU_FLAG_##FLAG, \
296 .block_size = SIZE, \
297 .entries = { \
298 &op_read_planar1##EXT, \
299 &op_read_planar2##EXT, \
300 &op_read_planar3##EXT, \
301 &op_read_planar4##EXT, \
302 &op_write_planar1##EXT, \
303 &op_write_planar2##EXT, \
304 &op_write_planar3##EXT, \
305 &op_write_planar4##EXT, \
306 &op_read8_packed2##EXT, \
307 &op_read8_packed3##EXT, \
308 &op_read8_packed4##EXT, \
309 &op_write8_packed2##EXT, \
310 &op_write8_packed3##EXT, \
311 &op_write8_packed4##EXT, \
312 &op_read_nibbles1##EXT, \
313 &op_read_bits1##EXT, \
314 &op_write_bits1##EXT, \
315 &op_pack_1210##EXT, \
316 &op_pack_3320##EXT, \
317 &op_pack_2330##EXT, \
318 &op_unpack_1210##EXT, \
319 &op_unpack_3320##EXT, \
320 &op_unpack_2330##EXT, \
321 &op_swizzle_3012##EXT, \
322 &op_swizzle_3021##EXT, \
323 &op_swizzle_2103##EXT, \
324 &op_swizzle_3210##EXT, \
325 &op_swizzle_3102##EXT, \
326 &op_swizzle_3201##EXT, \
327 &op_swizzle_1203##EXT, \
328 &op_swizzle_1023##EXT, \
329 &op_swizzle_2013##EXT, \
330 &op_swizzle_2310##EXT, \
331 &op_swizzle_2130##EXT, \
332 &op_swizzle_1230##EXT, \
333 &op_swizzle_1320##EXT, \
334 &op_swizzle_0213##EXT, \
335 &op_swizzle_0231##EXT, \
336 &op_swizzle_0312##EXT, \
337 &op_swizzle_3120##EXT, \
338 &op_swizzle_0321##EXT, \
339 &op_swizzle_0003##EXT, \
340 &op_swizzle_0001##EXT, \
341 &op_swizzle_3000##EXT, \
342 &op_swizzle_1000##EXT, \
343 &op_clear_alpha0##EXT, \
344 &op_clear_alpha1##EXT, \
345 &op_clear_alpha3##EXT, \
346 &op_clear_zero0##EXT, \
347 &op_clear_zero1##EXT, \
348 &op_clear_zero3##EXT, \
349 REF_PATTERN(clear##EXT, 1, 1, 1, 0), \
350 REF_PATTERN(clear##EXT, 0, 1, 1, 1), \
351 REF_PATTERN(clear##EXT, 0, 0, 1, 1), \
352 REF_PATTERN(clear##EXT, 1, 0, 0, 1), \
353 REF_PATTERN(clear##EXT, 1, 1, 0, 0), \
354 REF_PATTERN(clear##EXT, 0, 1, 0, 1), \
355 REF_PATTERN(clear##EXT, 1, 0, 1, 0), \
356 REF_PATTERN(clear##EXT, 1, 0, 0, 0), \
357 REF_PATTERN(clear##EXT, 0, 1, 0, 0), \
358 REF_PATTERN(clear##EXT, 0, 0, 1, 0), \
359 NULL \
360 }, \
361 };
362
363 #define DECL_FUNCS_16(SIZE, EXT, FLAG) \
364 DECL_PACKED_RW(EXT, 16) \
365 DECL_PACK_UNPACK(EXT, U16, 4, 4, 4, 0) \
366 DECL_PACK_UNPACK(EXT, U16, 5, 5, 5, 0) \
367 DECL_PACK_UNPACK(EXT, U16, 5, 6, 5, 0) \
368 DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 0) \
369 DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 1) \
370 DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 0) \
371 DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 1) \
372 DECL_SHIFT16(EXT) \
373 DECL_CONVERT(EXT, U8, U16) \
374 DECL_CONVERT(EXT, U16, U8) \
375 DECL_EXPAND(EXT, U8, U16) \
376 \
377 static const SwsOpTable ops16##EXT = { \
378 .cpu_flags = AV_CPU_FLAG_##FLAG, \
379 .block_size = SIZE, \
380 .entries = { \
381 &op_read16_packed2##EXT, \
382 &op_read16_packed3##EXT, \
383 &op_read16_packed4##EXT, \
384 &op_write16_packed2##EXT, \
385 &op_write16_packed3##EXT, \
386 &op_write16_packed4##EXT, \
387 &op_pack_4440##EXT, \
388 &op_pack_5550##EXT, \
389 &op_pack_5650##EXT, \
390 &op_unpack_4440##EXT, \
391 &op_unpack_5550##EXT, \
392 &op_unpack_5650##EXT, \
393 REF_COMMON_PATTERNS(swap_bytes_U16##EXT), \
394 REF_COMMON_PATTERNS(convert_U8_U16##EXT), \
395 REF_COMMON_PATTERNS(convert_U16_U8##EXT), \
396 REF_COMMON_PATTERNS(expand_U8_U16##EXT), \
397 REF_COMMON_PATTERNS(lshift16##EXT), \
398 REF_COMMON_PATTERNS(rshift16##EXT), \
399 NULL \
400 }, \
401 };
402
403 #define DECL_FUNCS_32(SIZE, EXT, FLAG) \
404 DECL_PACKED_RW(_m2##EXT, 32) \
405 DECL_PACK_UNPACK(_m2##EXT, U32, 10, 10, 10, 2) \
406 DECL_PACK_UNPACK(_m2##EXT, U32, 2, 10, 10, 10) \
407 DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 0) \
408 DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 1) \
409 DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 0) \
410 DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 1) \
411 DECL_CONVERT(EXT, U8, U32) \
412 DECL_CONVERT(EXT, U32, U8) \
413 DECL_CONVERT(EXT, U16, U32) \
414 DECL_CONVERT(EXT, U32, U16) \
415 DECL_CONVERT(EXT, U8, F32) \
416 DECL_CONVERT(EXT, F32, U8) \
417 DECL_CONVERT(EXT, U16, F32) \
418 DECL_CONVERT(EXT, F32, U16) \
419 DECL_EXPAND(EXT, U8, U32) \
420 DECL_MIN_MAX(EXT) \
421 DECL_SCALE(EXT) \
422 DECL_DITHER(EXT, 0) \
423 DECL_DITHER(EXT, 1) \
424 DECL_DITHER(EXT, 2) \
425 DECL_DITHER(EXT, 3) \
426 DECL_DITHER(EXT, 4) \
427 DECL_DITHER(EXT, 5) \
428 DECL_DITHER(EXT, 6) \
429 DECL_DITHER(EXT, 7) \
430 DECL_DITHER(EXT, 8) \
431 DECL_LINEAR(EXT, luma, SWS_MASK_LUMA) \
432 DECL_LINEAR(EXT, alpha, SWS_MASK_ALPHA) \
433 DECL_LINEAR(EXT, lumalpha, SWS_MASK_LUMA | SWS_MASK_ALPHA) \
434 DECL_LINEAR(EXT, dot3, 0x7) \
435 DECL_LINEAR(EXT, row0, SWS_MASK_ROW(0)) \
436 DECL_LINEAR(EXT, row0a, SWS_MASK_ROW(0) | SWS_MASK_ALPHA) \
437 DECL_LINEAR(EXT, diag3, SWS_MASK_DIAG3) \
438 DECL_LINEAR(EXT, diag4, SWS_MASK_DIAG4) \
439 DECL_LINEAR(EXT, diagoff3, SWS_MASK_DIAG3 | SWS_MASK_OFF3) \
440 DECL_LINEAR(EXT, matrix3, SWS_MASK_MAT3) \
441 DECL_LINEAR(EXT, affine3, SWS_MASK_MAT3 | SWS_MASK_OFF3) \
442 DECL_LINEAR(EXT, affine3a, SWS_MASK_MAT3 | SWS_MASK_OFF3 | SWS_MASK_ALPHA) \
443 DECL_LINEAR(EXT, matrix4, SWS_MASK_MAT4) \
444 DECL_LINEAR(EXT, affine4, SWS_MASK_MAT4 | SWS_MASK_OFF4) \
445 \
446 static const SwsOpTable ops32##EXT = { \
447 .cpu_flags = AV_CPU_FLAG_##FLAG, \
448 .block_size = SIZE, \
449 .entries = { \
450 &op_read32_packed2_m2##EXT, \
451 &op_read32_packed3_m2##EXT, \
452 &op_read32_packed4_m2##EXT, \
453 &op_write32_packed2_m2##EXT, \
454 &op_write32_packed3_m2##EXT, \
455 &op_write32_packed4_m2##EXT, \
456 &op_pack_1010102_m2##EXT, \
457 &op_pack_2101010_m2##EXT, \
458 &op_unpack_1010102_m2##EXT, \
459 &op_unpack_2101010_m2##EXT, \
460 REF_COMMON_PATTERNS(swap_bytes_U32_m2##EXT), \
461 REF_COMMON_PATTERNS(convert_U8_U32##EXT), \
462 REF_COMMON_PATTERNS(convert_U32_U8##EXT), \
463 REF_COMMON_PATTERNS(convert_U16_U32##EXT), \
464 REF_COMMON_PATTERNS(convert_U32_U16##EXT), \
465 REF_COMMON_PATTERNS(convert_U8_F32##EXT), \
466 REF_COMMON_PATTERNS(convert_F32_U8##EXT), \
467 REF_COMMON_PATTERNS(convert_U16_F32##EXT), \
468 REF_COMMON_PATTERNS(convert_F32_U16##EXT), \
469 REF_COMMON_PATTERNS(expand_U8_U32##EXT), \
470 REF_COMMON_PATTERNS(min##EXT), \
471 REF_COMMON_PATTERNS(max##EXT), \
472 REF_COMMON_PATTERNS(scale##EXT), \
473 REF_COMMON_PATTERNS(dither0##EXT), \
474 REF_COMMON_PATTERNS(dither1##EXT), \
475 REF_COMMON_PATTERNS(dither2##EXT), \
476 REF_COMMON_PATTERNS(dither3##EXT), \
477 REF_COMMON_PATTERNS(dither4##EXT), \
478 REF_COMMON_PATTERNS(dither5##EXT), \
479 REF_COMMON_PATTERNS(dither6##EXT), \
480 REF_COMMON_PATTERNS(dither7##EXT), \
481 REF_COMMON_PATTERNS(dither8##EXT), \
482 &op_luma##EXT, \
483 &op_alpha##EXT, \
484 &op_lumalpha##EXT, \
485 &op_dot3##EXT, \
486 &op_row0##EXT, \
487 &op_row0a##EXT, \
488 &op_diag3##EXT, \
489 &op_diag4##EXT, \
490 &op_diagoff3##EXT, \
491 &op_matrix3##EXT, \
492 &op_affine3##EXT, \
493 &op_affine3a##EXT, \
494 &op_matrix4##EXT, \
495 &op_affine4##EXT, \
496 NULL \
497 }, \
498 };
499
500 DECL_FUNCS_8(16, _m1_sse4, SSE4)
501 DECL_FUNCS_8(32, _m1_avx2, AVX2)
502 DECL_FUNCS_8(32, _m2_sse4, SSE4)
503 DECL_FUNCS_8(64, _m2_avx2, AVX2)
504
505 DECL_FUNCS_16(16, _m1_avx2, AVX2)
506 DECL_FUNCS_16(32, _m2_avx2, AVX2)
507
508 DECL_FUNCS_32(16, _avx2, AVX2)
509
510 4667 static av_const int get_mmsize(const int cpu_flags)
511 {
512
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 4667 times.
4667 if (cpu_flags & AV_CPU_FLAG_AVX512)
513 return 64;
514
2/2
✓ Branch 0 taken 359 times.
✓ Branch 1 taken 4308 times.
4667 else if (cpu_flags & AV_CPU_FLAG_AVX2)
515 359 return 32;
516
2/2
✓ Branch 0 taken 1795 times.
✓ Branch 1 taken 2513 times.
4308 else if (cpu_flags & AV_CPU_FLAG_SSE4)
517 1795 return 16;
518 else
519 2513 return AVERROR(ENOTSUP);
520 }
521
522 /**
523 * Returns true if the operation's implementation only depends on the block
524 * size, and not the underlying pixel type
525 */
526 4613 static bool op_is_type_invariant(const SwsOp *op)
527 {
528
3/3
✓ Branch 0 taken 2664 times.
✓ Branch 1 taken 222 times.
✓ Branch 2 taken 1727 times.
4613 switch (op->op) {
529 2664 case SWS_OP_READ:
530 case SWS_OP_WRITE:
531
4/4
✓ Branch 0 taken 2520 times.
✓ Branch 1 taken 144 times.
✓ Branch 2 taken 2496 times.
✓ Branch 3 taken 24 times.
2664 return !op->rw.packed && !op->rw.frac;
532 222 case SWS_OP_SWIZZLE:
533 case SWS_OP_CLEAR:
534 222 return true;
535 }
536
537 1727 return false;
538 }
539
540 2154 static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
541 {
542 uint8_t shuffle[16];
543 int read_bytes, write_bytes;
544 int pixels;
545
546 /* Solve the shuffle mask for one 128-bit lane only */
547 2154 pixels = ff_sws_solve_shuffle(ops, shuffle, 16, 0x80, &read_bytes, &write_bytes);
548
2/2
✓ Branch 0 taken 2130 times.
✓ Branch 1 taken 24 times.
2154 if (pixels < 0)
549 2130 return pixels;
550
551 /* We can't shuffle across lanes, so restrict the vector size to XMM
552 * whenever the read/write size would be a subset of the full vector */
553
3/4
✓ Branch 0 taken 12 times.
✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 12 times.
24 if (read_bytes < 16 || write_bytes < 16)
554 12 mmsize = 16;
555
556 24 const int num_lanes = mmsize / 16;
557 24 const int in_total = num_lanes * read_bytes;
558 24 const int out_total = num_lanes * write_bytes;
559
2/2
✓ Branch 0 taken 18 times.
✓ Branch 1 taken 6 times.
42 const int read_size = in_total <= 4 ? 4 : /* movd */
560
2/2
✓ Branch 0 taken 12 times.
✓ Branch 1 taken 6 times.
18 in_total <= 8 ? 8 : /* movq */
561 mmsize; /* movu */
562
563 24 *out = (SwsCompiledOp) {
564 24 .priv = av_memdup(shuffle, sizeof(shuffle)),
565 .free = av_free,
566 24 .block_size = pixels * num_lanes,
567 24 .over_read = read_size - in_total,
568 24 .over_write = mmsize - out_total,
569
1/2
✓ Branch 0 taken 24 times.
✗ Branch 1 not taken.
48 .cpu_flags = mmsize > 32 ? AV_CPU_FLAG_AVX512 :
570
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 22 times.
24 mmsize > 16 ? AV_CPU_FLAG_AVX2 :
571 AV_CPU_FLAG_SSE4,
572 };
573
574
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 24 times.
24 if (!out->priv)
575 return AVERROR(ENOMEM);
576
577 #define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT) \
578 do { \
579 SWS_DECL_FUNC(ff_packed_shuffle##IN##_##OUT##_##EXT); \
580 if (in_total == IN && out_total == OUT) \
581 out->func = ff_packed_shuffle##IN##_##OUT##_##EXT; \
582 } while (0)
583
584
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
24 ASSIGN_SHUFFLE_FUNC( 5, 15, sse4);
585
3/4
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 18 times.
✓ Branch 2 taken 6 times.
✗ Branch 3 not taken.
24 ASSIGN_SHUFFLE_FUNC( 4, 16, sse4);
586
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
24 ASSIGN_SHUFFLE_FUNC( 2, 12, sse4);
587
3/4
✓ Branch 0 taken 10 times.
✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 10 times.
24 ASSIGN_SHUFFLE_FUNC(16, 8, sse4);
588
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
24 ASSIGN_SHUFFLE_FUNC(10, 15, sse4);
589
3/4
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 18 times.
✓ Branch 2 taken 6 times.
✗ Branch 3 not taken.
24 ASSIGN_SHUFFLE_FUNC( 8, 16, sse4);
590
3/4
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 18 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 6 times.
24 ASSIGN_SHUFFLE_FUNC( 4, 12, sse4);
591
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
24 ASSIGN_SHUFFLE_FUNC(15, 15, sse4);
592
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
24 ASSIGN_SHUFFLE_FUNC(12, 16, sse4);
593
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
24 ASSIGN_SHUFFLE_FUNC( 6, 12, sse4);
594
3/4
✓ Branch 0 taken 10 times.
✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 10 times.
24 ASSIGN_SHUFFLE_FUNC(16, 12, sse4);
595
3/4
✓ Branch 0 taken 10 times.
✓ Branch 1 taken 14 times.
✓ Branch 2 taken 10 times.
✗ Branch 3 not taken.
24 ASSIGN_SHUFFLE_FUNC(16, 16, sse4);
596
3/4
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 18 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 6 times.
24 ASSIGN_SHUFFLE_FUNC( 8, 12, sse4);
597
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
24 ASSIGN_SHUFFLE_FUNC(12, 12, sse4);
598
3/4
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 22 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
24 ASSIGN_SHUFFLE_FUNC(32, 32, avx2);
599
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
24 ASSIGN_SHUFFLE_FUNC(64, 64, avx512);
600 av_assert1(out->func);
601 24 return 0;
602 }
603
604 /* Normalize clear values into 32-bit integer constants */
605 84 static void normalize_clear(SwsOp *op)
606 {
607 static_assert(sizeof(uint32_t) == sizeof(int), "int size mismatch");
608 SwsOpPriv priv;
609 union {
610 uint32_t u32;
611 int i;
612 } c;
613
614 84 ff_sws_setup_q4(op, &priv);
615
2/2
✓ Branch 0 taken 336 times.
✓ Branch 1 taken 84 times.
420 for (int i = 0; i < 4; i++) {
616
2/2
✓ Branch 0 taken 114 times.
✓ Branch 1 taken 222 times.
336 if (!op->c.q4[i].den)
617 114 continue;
618
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 222 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
222 switch (ff_sws_pixel_type_size(op->type)) {
619 case 1: c.u32 = 0x1010101 * priv.u8[i]; break;
620 222 case 2: c.u32 = priv.u16[i] << 16 | priv.u16[i]; break;
621 case 4: c.u32 = priv.u32[i]; break;
622 }
623
624 222 op->c.q4[i].num = c.i;
625 222 op->c.q4[i].den = 1;
626 }
627 84 }
628
629 4667 static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
630 {
631 4667 const int cpu_flags = av_get_cpu_flags();
632 4667 const int mmsize = get_mmsize(cpu_flags);
633
2/2
✓ Branch 0 taken 2513 times.
✓ Branch 1 taken 2154 times.
4667 if (mmsize < 0)
634 2513 return mmsize;
635
636 av_assert1(ops->num_ops > 0);
637 2154 const SwsOp read = ops->ops[0];
638 2154 const SwsOp write = ops->ops[ops->num_ops - 1];
639 int ret;
640
641 /* Special fast path for in-place packed shuffle */
642 2154 ret = solve_shuffle(ops, mmsize, out);
643
2/2
✓ Branch 0 taken 24 times.
✓ Branch 1 taken 2130 times.
2154 if (ret != AVERROR(ENOTSUP))
644 24 return ret;
645
646 2130 SwsOpChain *chain = ff_sws_op_chain_alloc();
647
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2130 times.
2130 if (!chain)
648 return AVERROR(ENOMEM);
649
650 2130 *out = (SwsCompiledOp) {
651 .priv = chain,
652 .free = ff_sws_op_chain_free_cb,
653
654 /* Use at most two full YMM regs during the widest precision section */
655 2130 .block_size = 2 * FFMIN(mmsize, 32) / ff_sws_op_list_max_size(ops),
656 };
657
658 /* 3-component reads/writes process one extra garbage word */
659
4/4
✓ Branch 0 taken 72 times.
✓ Branch 1 taken 2058 times.
✓ Branch 2 taken 24 times.
✓ Branch 3 taken 48 times.
2130 if (read.rw.packed && read.rw.elems == 3)
660 24 out->over_read = sizeof(uint32_t);
661
4/4
✓ Branch 0 taken 72 times.
✓ Branch 1 taken 2058 times.
✓ Branch 2 taken 24 times.
✓ Branch 3 taken 48 times.
2130 if (write.rw.packed && write.rw.elems == 3)
662 24 out->over_write = sizeof(uint32_t);
663
664 static const SwsOpTable *const tables[] = {
665 &ops8_m1_sse4,
666 &ops8_m1_avx2,
667 &ops8_m2_sse4,
668 &ops8_m2_avx2,
669 &ops16_m1_avx2,
670 &ops16_m2_avx2,
671 &ops32_avx2,
672 };
673
674 do {
675 4613 int op_block_size = out->block_size;
676 4613 SwsOp *op = &ops->ops[0];
677
678
2/2
✓ Branch 1 taken 2718 times.
✓ Branch 2 taken 1895 times.
4613 if (op_is_type_invariant(op)) {
679
2/2
✓ Branch 0 taken 84 times.
✓ Branch 1 taken 2634 times.
2718 if (op->op == SWS_OP_CLEAR)
680 84 normalize_clear(op);
681 2718 op_block_size *= ff_sws_pixel_type_size(op->type);
682 2718 op->type = SWS_PIXEL_U8;
683 }
684
685 4613 ret = ff_sws_op_compile_tables(tables, FF_ARRAY_ELEMS(tables), ops,
686 op_block_size, chain);
687
2/2
✓ Branch 0 taken 2483 times.
✓ Branch 1 taken 2130 times.
4613 } while (ret == AVERROR(EAGAIN));
688
2/2
✓ Branch 0 taken 1647 times.
✓ Branch 1 taken 483 times.
2130 if (ret < 0) {
689 1647 ff_sws_op_chain_free(chain);
690 1647 return ret;
691 }
692
693 #define ASSIGN_PROCESS_FUNC(NAME) \
694 do { \
695 SWS_DECL_FUNC(NAME); \
696 void NAME##_return(void); \
697 ret = ff_sws_op_chain_append(chain, NAME##_return, \
698 NULL, &(SwsOpPriv) {0}); \
699 out->func = NAME; \
700 } while (0)
701
702
2/2
✓ Branch 0 taken 27 times.
✓ Branch 1 taken 456 times.
483 const int read_planes = read.rw.packed ? 1 : read.rw.elems;
703
2/2
✓ Branch 0 taken 27 times.
✓ Branch 1 taken 456 times.
483 const int write_planes = write.rw.packed ? 1 : write.rw.elems;
704
4/5
✓ Branch 0 taken 56 times.
✓ Branch 1 taken 18 times.
✓ Branch 2 taken 102 times.
✓ Branch 3 taken 307 times.
✗ Branch 4 not taken.
483 switch (FFMAX(read_planes, write_planes)) {
705 56 case 1: ASSIGN_PROCESS_FUNC(ff_sws_process1_x86); break;
706 18 case 2: ASSIGN_PROCESS_FUNC(ff_sws_process2_x86); break;
707 102 case 3: ASSIGN_PROCESS_FUNC(ff_sws_process3_x86); break;
708 307 case 4: ASSIGN_PROCESS_FUNC(ff_sws_process4_x86); break;
709 }
710
711
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 483 times.
483 if (ret < 0) {
712 ff_sws_op_chain_free(chain);
713 return ret;
714 }
715
716 483 out->cpu_flags = chain->cpu_flags;
717 483 return 0;
718 }
719
720 const SwsOpBackend backend_x86 = {
721 .name = "x86",
722 .compile = compile,
723 };
724