FFmpeg coverage


Directory: ../../../ffmpeg/
File: src/libswscale/x86/ops.c
Date: 2026-04-20 20:24:43
Exec Total Coverage
Lines: 268 283 94.7%
Functions: 18 18 100.0%
Branches: 197 233 84.5%

Line Branch Exec Source
1 /**
2 * Copyright (C) 2025 Niklas Haas
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include <float.h>
22
23 #include "libavutil/avassert.h"
24 #include "libavutil/mem.h"
25
26 #include "../ops_chain.h"
27
28 #define DECL_ENTRY(TYPE, MASK, NAME, ...) \
29 static const SwsOpEntry op_##NAME = { \
30 .type = SWS_PIXEL_##TYPE, \
31 .mask = MASK, \
32 __VA_ARGS__ \
33 }
34
35 #define DECL_ASM(TYPE, MASK, NAME, ...) \
36 void ff_##NAME(void); \
37 DECL_ENTRY(TYPE, MASK, NAME, \
38 .func = ff_##NAME, \
39 __VA_ARGS__)
40
41 #define DECL_PATTERN(TYPE, NAME, X, Y, Z, W, ...) \
42 DECL_ASM(TYPE, SWS_COMP_MASK(X, Y, Z, W), p##X##Y##Z##W##_##NAME, \
43 __VA_ARGS__ \
44 )
45
46 #define REF_PATTERN(NAME, X, Y, Z, W) \
47 &op_p##X##Y##Z##W##_##NAME
48
49 #define DECL_COMMON_PATTERNS(TYPE, NAME, ...) \
50 DECL_PATTERN(TYPE, NAME, 1, 0, 0, 0, __VA_ARGS__); \
51 DECL_PATTERN(TYPE, NAME, 1, 0, 0, 1, __VA_ARGS__); \
52 DECL_PATTERN(TYPE, NAME, 1, 1, 1, 0, __VA_ARGS__); \
53 DECL_PATTERN(TYPE, NAME, 1, 1, 1, 1, __VA_ARGS__) \
54
55 #define REF_COMMON_PATTERNS(NAME) \
56 REF_PATTERN(NAME, 1, 0, 0, 0), \
57 REF_PATTERN(NAME, 1, 0, 0, 1), \
58 REF_PATTERN(NAME, 1, 1, 1, 0), \
59 REF_PATTERN(NAME, 1, 1, 1, 1)
60
61 31946 static int setup_rw(const SwsImplParams *params, SwsImplResult *out)
62 {
63 31946 const SwsOp *op = params->op;
64
65 /* 3-component reads/writes process one extra garbage word */
66
4/4
✓ Branch 0 taken 11240 times.
✓ Branch 1 taken 20706 times.
✓ Branch 2 taken 1485 times.
✓ Branch 3 taken 9755 times.
31946 if (op->rw.packed && op->rw.elems == 3) {
67
2/3
✓ Branch 0 taken 812 times.
✓ Branch 1 taken 673 times.
✗ Branch 2 not taken.
1485 switch (op->op) {
68 812 case SWS_OP_READ: out->over_read = sizeof(uint32_t); break;
69 673 case SWS_OP_WRITE: out->over_write = sizeof(uint32_t); break;
70 }
71 }
72
73 31946 return 0;
74 }
75
76 #define DECL_RW(EXT, TYPE, NAME, OP, ELEMS, PACKED, FRAC) \
77 DECL_ASM(TYPE, SWS_COMP_ELEMS(ELEMS), NAME##ELEMS##EXT, \
78 .op = SWS_OP_##OP, \
79 .rw = { .elems = ELEMS, .packed = PACKED, .frac = FRAC }, \
80 .setup = setup_rw, \
81 );
82
83 #define DECL_PACKED_RW(EXT, DEPTH) \
84 DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 2, true, 0) \
85 DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 3, true, 0) \
86 DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 4, true, 0) \
87 DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 2, true, 0) \
88 DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 3, true, 0) \
89 DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 4, true, 0) \
90
91 #define DECL_PACK_UNPACK(EXT, TYPE, X, Y, Z, W) \
92 DECL_ASM(TYPE, SWS_COMP(0), pack_##X##Y##Z##W##EXT, \
93 .op = SWS_OP_PACK, \
94 .pack.pattern = {X, Y, Z, W}, \
95 ); \
96 \
97 DECL_ASM(TYPE, SWS_COMP_MASK(X, Y, Z, W), unpack_##X##Y##Z##W##EXT, \
98 .op = SWS_OP_UNPACK, \
99 .pack.pattern = {X, Y, Z, W}, \
100 ); \
101
102 10007 static int setup_swap_bytes(const SwsImplParams *params, SwsImplResult *out)
103 {
104 10007 const int mask = ff_sws_pixel_type_size(params->op->type) - 1;
105
2/2
✓ Branch 0 taken 160112 times.
✓ Branch 1 taken 10007 times.
170119 for (int i = 0; i < 16; i++)
106 160112 out->priv.u8[i] = (i & ~mask) | (mask - (i & mask));
107 10007 return 0;
108 }
109
110 #define DECL_SWAP_BYTES(EXT, TYPE, X, Y, Z, W) \
111 DECL_ENTRY(TYPE, SWS_COMP_MASK(X, Y, Z, W), \
112 p##X##Y##Z##W##_swap_bytes_##TYPE##EXT, \
113 .op = SWS_OP_SWAP_BYTES, \
114 .func = ff_p##X##Y##Z##W##_shuffle##EXT, \
115 .setup = setup_swap_bytes, \
116 );
117
118 #define DECL_CLEAR_ALPHA(EXT, IDX) \
119 DECL_ASM(U8, SWS_COMP_ALL, clear_alpha##IDX##EXT, \
120 .op = SWS_OP_CLEAR, \
121 .clear.mask = SWS_COMP(IDX), \
122 .clear.value[IDX] = { -1, 1 }, \
123 ); \
124
125 #define DECL_CLEAR_ZERO(EXT, IDX) \
126 DECL_ASM(U8, SWS_COMP_ALL, clear_zero##IDX##EXT, \
127 .op = SWS_OP_CLEAR, \
128 .clear.mask = SWS_COMP(IDX), \
129 .clear.value[IDX] = { 0, 1 }, \
130 );
131
132 1840 static int setup_clear(const SwsImplParams *params, SwsImplResult *out)
133 {
134 1840 const SwsOp *op = params->op;
135
2/2
✓ Branch 0 taken 7360 times.
✓ Branch 1 taken 1840 times.
9200 for (int i = 0; i < 4; i++)
136 7360 out->priv.u32[i] = (uint32_t) op->clear.value[i].num;
137 1840 return 0;
138 }
139
140 #define DECL_CLEAR(EXT, X, Y, Z, W) \
141 DECL_ASM(U8, SWS_COMP_ALL, p##X##Y##Z##W##_clear##EXT, \
142 .op = SWS_OP_CLEAR, \
143 .setup = setup_clear, \
144 .clear.mask = SWS_COMP_MASK(X, Y, Z, W), \
145 );
146
147 #define DECL_SWIZZLE(EXT, X, Y, Z, W) \
148 DECL_ASM(U8, SWS_COMP_ALL, swizzle_##X##Y##Z##W##EXT, \
149 .op = SWS_OP_SWIZZLE, \
150 .swizzle.in = {X, Y, Z, W}, \
151 );
152
153 #define DECL_CONVERT(EXT, FROM, TO) \
154 DECL_COMMON_PATTERNS(FROM, convert_##FROM##_##TO##EXT, \
155 .op = SWS_OP_CONVERT, \
156 .convert.to = SWS_PIXEL_##TO, \
157 );
158
159 #define DECL_EXPAND(EXT, FROM, TO) \
160 DECL_COMMON_PATTERNS(FROM, expand_##FROM##_##TO##EXT, \
161 .op = SWS_OP_CONVERT, \
162 .convert.to = SWS_PIXEL_##TO, \
163 .convert.expand = true, \
164 );
165
166 2767 static int setup_shift(const SwsImplParams *params, SwsImplResult *out)
167 {
168 2767 out->priv.u16[0] = params->op->shift.amount;
169 2767 return 0;
170 }
171
172 #define DECL_SHIFT16(EXT) \
173 DECL_COMMON_PATTERNS(U16, lshift16##EXT, \
174 .op = SWS_OP_LSHIFT, \
175 .setup = setup_shift, \
176 .flexible = true, \
177 ); \
178 \
179 DECL_COMMON_PATTERNS(U16, rshift16##EXT, \
180 .op = SWS_OP_RSHIFT, \
181 .setup = setup_shift, \
182 .flexible = true, \
183 );
184
185 #define DECL_MIN_MAX(EXT) \
186 DECL_COMMON_PATTERNS(F32, min##EXT, \
187 .op = SWS_OP_MIN, \
188 .setup = ff_sws_setup_clamp, \
189 .flexible = true, \
190 ); \
191 \
192 DECL_COMMON_PATTERNS(F32, max##EXT, \
193 .op = SWS_OP_MAX, \
194 .setup = ff_sws_setup_clamp, \
195 .flexible = true, \
196 );
197
198 #define DECL_SCALE(EXT) \
199 DECL_COMMON_PATTERNS(F32, scale##EXT, \
200 .op = SWS_OP_SCALE, \
201 .setup = ff_sws_setup_scale, \
202 .flexible = true, \
203 );
204
205 #define DECL_EXPAND_BITS(EXT, BITS) \
206 DECL_ASM(U##BITS, SWS_COMP(0), expand_bits##BITS##EXT, \
207 .op = SWS_OP_SCALE, \
208 .scale = { .num = ((1 << (BITS)) - 1), .den = 1 }, \
209 );
210
211 6922 static int setup_dither(const SwsImplParams *params, SwsImplResult *out)
212 {
213 6922 const SwsOp *op = params->op;
214 /* 1x1 matrix / single constant */
215
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 6918 times.
6922 if (!op->dither.size_log2) {
216 4 const AVRational k = op->dither.matrix[0];
217 4 out->priv.f32[0] = (float) k.num / k.den;
218 4 return 0;
219 }
220
221 6918 const int size = 1 << op->dither.size_log2;
222 6918 const int8_t *off = op->dither.y_offset;
223 6918 int max_offset = 0;
224
2/2
✓ Branch 0 taken 27672 times.
✓ Branch 1 taken 6918 times.
34590 for (int i = 0; i < 4; i++) {
225
2/2
✓ Branch 0 taken 18006 times.
✓ Branch 1 taken 9666 times.
27672 if (off[i] >= 0)
226 18006 max_offset = FFMAX(max_offset, off[i] & (size - 1));
227 }
228
229 /* Allocate extra rows to allow over-reading for row offsets. Note that
230 * max_offset is currently never larger than 5, so the extra space needed
231 * for this over-allocation is bounded by 5 * size * sizeof(float),
232 * typically 320 bytes for a 16x16 dither matrix. */
233 6918 const int stride = size * sizeof(float);
234 6918 const int num_rows = size + max_offset;
235 6918 float *matrix = out->priv.ptr = av_mallocz(num_rows * stride);
236
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6918 times.
6918 if (!matrix)
237 return AVERROR(ENOMEM);
238 6918 out->free = ff_op_priv_free;
239
240
2/2
✓ Branch 0 taken 2112336 times.
✓ Branch 1 taken 6918 times.
2119254 for (int i = 0; i < size * size; i++)
241 2112336 matrix[i] = (float) op->dither.matrix[i].num / op->dither.matrix[i].den;
242
243 6918 memcpy(&matrix[size * size], matrix, max_offset * stride);
244
245 /* Store relative pointer offset to each row inside extra space */
246 static_assert(sizeof(out->priv.ptr) <= sizeof(int16_t[4]),
247 ">8 byte pointers not supported");
248 assert(max_offset * stride <= INT16_MAX);
249 6918 int16_t *off_out = &out->priv.i16[4];
250
2/2
✓ Branch 0 taken 27672 times.
✓ Branch 1 taken 6918 times.
34590 for (int i = 0; i < 4; i++)
251
2/2
✓ Branch 0 taken 18006 times.
✓ Branch 1 taken 9666 times.
27672 off_out[i] = off[i] >= 0 ? (off[i] & (size - 1)) * stride : -1;
252
253 6918 return 0;
254 }
255
256 #define DECL_DITHER0(EXT) \
257 DECL_COMMON_PATTERNS(F32, dither0##EXT, \
258 .op = SWS_OP_DITHER, \
259 .setup = setup_dither, \
260 );
261
262 #define DECL_DITHER(EXT, SIZE) \
263 DECL_ASM(F32, SWS_COMP_ALL, dither##SIZE##EXT, \
264 .op = SWS_OP_DITHER, \
265 .setup = setup_dither, \
266 .dither_size = SIZE, \
267 );
268
269 8535 static int setup_linear(const SwsImplParams *params, SwsImplResult *out)
270 {
271 8535 const SwsOp *op = params->op;
272
273 8535 float *matrix = out->priv.ptr = av_mallocz(sizeof(float[4][5]));
274
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 8535 times.
8535 if (!matrix)
275 return AVERROR(ENOMEM);
276 8535 out->free = ff_op_priv_free;
277
278
2/2
✓ Branch 0 taken 34140 times.
✓ Branch 1 taken 8535 times.
42675 for (int y = 0; y < 4; y++) {
279
2/2
✓ Branch 0 taken 170700 times.
✓ Branch 1 taken 34140 times.
204840 for (int x = 0; x < 5; x++)
280 170700 matrix[y * 5 + x] = (float) op->lin.m[y][x].num / op->lin.m[y][x].den;
281 }
282
283 8535 return 0;
284 }
285
286 #define DECL_LINEAR(EXT, NAME, MASK) \
287 DECL_ASM(F32, SWS_COMP_ALL, NAME##EXT, \
288 .op = SWS_OP_LINEAR, \
289 .setup = setup_linear, \
290 .linear_mask = (MASK), \
291 );
292
293 120 static bool check_filter_fma(const SwsImplParams *params)
294 {
295 120 const SwsOp *op = params->op;
296 120 SwsContext *ctx = params->ctx;
297
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 120 times.
120 if (!(ctx->flags & SWS_BITEXACT))
298 return true;
299
300
2/2
✓ Branch 0 taken 40 times.
✓ Branch 1 taken 80 times.
120 if (!ff_sws_pixel_type_is_int(op->type))
301 40 return false;
302
303 /* Check if maximum/minimum partial sum fits losslessly inside float */
304 80 AVRational max_range = { 1 << 24, 1 };
305 80 AVRational min_range = { -(1 << 24), 1 };
306 const AVRational scale = Q(SWS_FILTER_SCALE);
307
308
2/2
✓ Branch 0 taken 140 times.
✓ Branch 1 taken 40 times.
180 for (int i = 0; i < op->rw.elems; i++) {
309 140 const AVRational min = av_mul_q(op->comps.min[i], scale);
310 140 const AVRational max = av_mul_q(op->comps.max[i], scale);
311
4/4
✓ Branch 1 taken 128 times.
✓ Branch 2 taken 12 times.
✓ Branch 4 taken 28 times.
✓ Branch 5 taken 100 times.
140 if (av_cmp_q(min, min_range) < 0 || av_cmp_q(max_range, max) < 0)
312 40 return false;
313 }
314
315 40 return true;
316 }
317
318 120 static int setup_filter_v(const SwsImplParams *params, SwsImplResult *out)
319 {
320 120 const SwsFilterWeights *filter = params->op->rw.kernel;
321 static_assert(sizeof(out->priv.ptr) <= sizeof(int32_t[2]),
322 ">8 byte pointers not supported");
323
324 /* Pre-convert weights to float */
325 120 float *weights = av_calloc(filter->num_weights, sizeof(float));
326
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 120 times.
120 if (!weights)
327 return AVERROR(ENOMEM);
328
329
2/2
✓ Branch 0 taken 6912 times.
✓ Branch 1 taken 120 times.
7032 for (int i = 0; i < filter->num_weights; i++)
330 6912 weights[i] = (float) filter->weights[i] / SWS_FILTER_SCALE;
331
332 120 out->priv.ptr = weights;
333 120 out->priv.uptr[1] = filter->filter_size;
334 120 out->free = ff_op_priv_free;
335 120 return 0;
336 }
337
338 168 static int hscale_sizeof_weight(const SwsOp *op)
339 {
340
3/4
✓ Branch 0 taken 56 times.
✓ Branch 1 taken 56 times.
✓ Branch 2 taken 56 times.
✗ Branch 3 not taken.
168 switch (op->type) {
341 56 case SWS_PIXEL_U8: return sizeof(int16_t);
342 56 case SWS_PIXEL_U16: return sizeof(int16_t);
343 56 case SWS_PIXEL_F32: return sizeof(float);
344 default: return 0;
345 }
346 }
347
348 140 static int setup_filter_h(const SwsImplParams *params, SwsImplResult *out)
349 {
350 140 const SwsOp *op = params->op;
351 140 const SwsFilterWeights *filter = op->rw.kernel;
352
353 /**
354 * `vpgatherdd` gathers 32 bits at a time; so if we're filtering a smaller
355 * size, we need to gather 2/4 taps simultaneously and unroll the inner
356 * loop over several packed samples.
357 */
358 140 const int pixel_size = ff_sws_pixel_type_size(op->type);
359 140 const int taps_align = sizeof(int32_t) / pixel_size;
360 140 const int filter_size = filter->filter_size;
361 140 const int block_size = params->table->block_size;
362 140 const size_t aligned_size = FFALIGN(filter_size, taps_align);
363 140 const size_t line_size = FFALIGN(filter->dst_size, block_size);
364 av_assert1(FFALIGN(line_size, taps_align) == line_size);
365
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 140 times.
140 if (aligned_size > INT_MAX)
366 return AVERROR(EINVAL);
367
368 union {
369 void *ptr;
370 int16_t *i16;
371 float *f32;
372 } weights;
373
374 140 const int sizeof_weight = hscale_sizeof_weight(op);
375 140 weights.ptr = av_calloc(line_size, sizeof_weight * aligned_size);
376
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 140 times.
140 if (!weights.ptr)
377 return AVERROR(ENOMEM);
378
379 /**
380 * Transpose filter weights to group (aligned) taps by block
381 */
382 140 const int mmsize = block_size * 2;
383 140 const int gather_size = mmsize / sizeof(int32_t); /* pixels per vpgatherdd */
384
2/2
✓ Branch 0 taken 560 times.
✓ Branch 1 taken 140 times.
700 for (size_t x = 0; x < line_size; x += block_size) {
385 560 const int elems = FFMIN(block_size, filter->dst_size - x);
386
2/2
✓ Branch 0 taken 1824 times.
✓ Branch 1 taken 560 times.
2384 for (int j = 0; j < filter_size; j++) {
387 1824 const int jb = j & ~(taps_align - 1);
388 1824 const int ji = j - jb;
389 1824 const size_t idx_base = x * aligned_size + jb * block_size + ji;
390
2/2
✓ Branch 0 taken 29184 times.
✓ Branch 1 taken 1824 times.
31008 for (int i = 0; i < elems; i++) {
391 29184 const int w = filter->weights[(x + i) * filter_size + j];
392 29184 size_t idx = idx_base;
393
2/2
✓ Branch 0 taken 5632 times.
✓ Branch 1 taken 23552 times.
29184 if (op->type == SWS_PIXEL_U8) {
394 /* Interleave the pixels within each lane, i.e.:
395 * [a0 a1 a2 a3 | b0 b1 b2 b3 ] pixels 0-1, taps 0-3 (lane 0)
396 * [e0 e1 e2 e3 | f0 f1 f2 f3 ] pixels 4-5, taps 0-3 (lane 1)
397 * [c0 c1 c2 c3 | d0 d1 d2 d3 ] pixels 2-3, taps 0-3 (lane 0)
398 * [g0 g1 g2 g3 | h0 h1 h2 h3 ] pixels 6-7, taps 0-3 (lane 1)
399 * [i0 i1 i2 i3 | j0 j1 j2 j3 ] pixels 8-9, taps 0-3 (lane 0)
400 * ...
401 * [o0 o1 o2 o3 | p0 p1 p2 p3 ] pixels 14-15, taps 0-3 (lane 1)
402 * (repeat for taps 4-7, etc.)
403 */
404 5632 const int gather_base = i & ~(gather_size - 1);
405 5632 const int gather_pos = i - gather_base;
406 5632 const int lane_idx = gather_pos >> 2;
407 5632 const int pos_in_lane = gather_pos & 3;
408 5632 idx += gather_base * 4 /* which gather (m0 or m1) */
409 5632 + (pos_in_lane >> 1) * (mmsize / 2) /* lo/hi unpack */
410 5632 + lane_idx * 8 /* 8 ints per lane */
411 5632 + (pos_in_lane & 1) * 4; /* 4 taps per pair */
412 } else {
413 23552 idx += i * taps_align;
414 }
415
416
3/4
✓ Branch 0 taken 5632 times.
✓ Branch 1 taken 3584 times.
✓ Branch 2 taken 19968 times.
✗ Branch 3 not taken.
29184 switch (op->type) {
417 5632 case SWS_PIXEL_U8: weights.i16[idx] = w; break;
418 3584 case SWS_PIXEL_U16: weights.i16[idx] = w; break;
419 19968 case SWS_PIXEL_F32: weights.f32[idx] = w; break;
420 }
421 }
422 }
423 }
424
425 140 out->priv.ptr = weights.ptr;
426 140 out->priv.uptr[1] = aligned_size;
427 140 out->free = ff_op_priv_free;
428 140 out->over_read = (aligned_size - filter_size) * pixel_size;
429 140 return 0;
430 }
431
432 168 static bool check_filter_4x4_h(const SwsImplParams *params)
433 {
434 168 SwsContext *ctx = params->ctx;
435 168 const SwsOp *op = params->op;
436
3/4
✓ Branch 0 taken 168 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 56 times.
✓ Branch 3 taken 112 times.
168 if ((ctx->flags & SWS_BITEXACT) && op->type == SWS_PIXEL_F32)
437 56 return false; /* different accumulation order due to 4x4 transpose */
438
439 112 const int cpu_flags = av_get_cpu_flags();
440
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 112 times.
112 if (cpu_flags & AV_CPU_FLAG_SLOW_GATHER)
441 return true; /* always prefer over gathers if gathers are slow */
442
443 /**
444 * Otherwise, prefer it above a certain filter size. Empirically, this
445 * kernel seems to be faster whenever the reference/gather kernel crosses
446 * a breakpoint for the number of gathers needed, but this filter doesn't.
447 *
448 * Tested on a Lunar Lake (Intel Core Ultra 7 258V) system.
449 */
450 112 const SwsFilterWeights *filter = op->rw.kernel;
451
2/2
✓ Branch 0 taken 44 times.
✓ Branch 1 taken 12 times.
56 return op->type == SWS_PIXEL_U8 && filter->filter_size > 12 ||
452
6/6
✓ Branch 0 taken 56 times.
✓ Branch 1 taken 56 times.
✓ Branch 2 taken 56 times.
✓ Branch 3 taken 44 times.
✓ Branch 4 taken 40 times.
✓ Branch 5 taken 16 times.
252 op->type == SWS_PIXEL_U16 && filter->filter_size > 4 ||
453
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 84 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
84 op->type == SWS_PIXEL_F32 && filter->filter_size > 1;
454 }
455
456 28 static int setup_filter_4x4_h(const SwsImplParams *params, SwsImplResult *out)
457 {
458 28 const SwsOp *op = params->op;
459 28 const SwsFilterWeights *filter = op->rw.kernel;
460 28 const int pixel_size = ff_sws_pixel_type_size(op->type);
461 28 const int sizeof_weights = hscale_sizeof_weight(op);
462 28 const int block_size = params->table->block_size;
463 28 const int taps_align = 16 / sizeof_weights; /* taps per iteration (XMM) */
464 28 const int pixels_align = 4; /* pixels per iteration */
465 28 const int filter_size = filter->filter_size;
466 28 const size_t aligned_size = FFALIGN(filter_size, taps_align);
467 28 const int line_size = FFALIGN(filter->dst_size, block_size);
468 av_assert1(FFALIGN(line_size, pixels_align) == line_size);
469
470 union {
471 void *ptr;
472 int16_t *i16;
473 float *f32;
474 } weights;
475
476 28 weights.ptr = av_calloc(line_size, aligned_size * sizeof_weights);
477
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 28 times.
28 if (!weights.ptr)
478 return AVERROR(ENOMEM);
479
480 /**
481 * Desired memory layout: [w][taps][pixels_align][taps_align]
482 *
483 * Example with taps_align=8, pixels_align=4:
484 * [a0, a1, ... a7] weights for pixel 0, taps 0..7
485 * [b0, b1, ... b7] weights for pixel 1, taps 0..7
486 * [c0, c1, ... c7] weights for pixel 2, taps 0..7
487 * [d0, d1, ... d7] weights for pixel 3, taps 0..7
488 * [a8, a9, ... a15] weights for pixel 0, taps 8..15
489 * ...
490 * repeat for all taps, then move on to pixels 4..7, etc.
491 */
492
2/2
✓ Branch 0 taken 1792 times.
✓ Branch 1 taken 28 times.
1820 for (int x = 0; x < filter->dst_size; x++) {
493
2/2
✓ Branch 0 taken 30720 times.
✓ Branch 1 taken 1792 times.
32512 for (int j = 0; j < filter_size; j++) {
494 30720 const int xb = x & ~(pixels_align - 1);
495 30720 const int jb = j & ~(taps_align - 1);
496 30720 const int xi = x - xb, ji = j - jb;
497 30720 const int w = filter->weights[x * filter_size + j];
498 30720 const int idx = xb * aligned_size + jb * pixels_align + xi * taps_align + ji;
499
500
2/4
✓ Branch 0 taken 14336 times.
✓ Branch 1 taken 16384 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
30720 switch (op->type) {
501 14336 case SWS_PIXEL_U8: weights.i16[idx] = w; break;
502 16384 case SWS_PIXEL_U16: weights.i16[idx] = w; break;
503 case SWS_PIXEL_F32: weights.f32[idx] = w; break;
504 }
505 }
506 }
507
508 28 out->priv.ptr = weights.ptr;
509 28 out->priv.uptr[1] = aligned_size * sizeof_weights;
510 28 out->free = ff_op_priv_free;
511 28 out->over_read = (aligned_size - filter_size) * pixel_size;
512 28 return 0;
513 }
514
515 #define DECL_FILTER(EXT, TYPE, DIR, NAME, ELEMS, ...) \
516 DECL_ASM(TYPE, SWS_COMP_ELEMS(ELEMS), NAME##ELEMS##_##TYPE##EXT, \
517 .op = SWS_OP_READ, \
518 .rw.elems = ELEMS, \
519 .rw.filter = SWS_OP_FILTER_##DIR, \
520 __VA_ARGS__ \
521 );
522
523 #define DECL_FILTERS(EXT, TYPE, DIR, NAME, ...) \
524 DECL_FILTER(EXT, TYPE, DIR, NAME, 1, __VA_ARGS__) \
525 DECL_FILTER(EXT, TYPE, DIR, NAME, 2, __VA_ARGS__) \
526 DECL_FILTER(EXT, TYPE, DIR, NAME, 3, __VA_ARGS__) \
527 DECL_FILTER(EXT, TYPE, DIR, NAME, 4, __VA_ARGS__)
528
529 #define DECL_FILTERS_GENERIC(EXT, TYPE) \
530 DECL_FILTERS(EXT, TYPE, V, filter_v, .setup = setup_filter_v) \
531 DECL_FILTERS(EXT, TYPE, V, filter_fma_v, .setup = setup_filter_v, \
532 .check = check_filter_fma) \
533 DECL_FILTERS(EXT, TYPE, H, filter_h, .setup = setup_filter_h) \
534 DECL_FILTERS(EXT, TYPE, H, filter_4x4_h, .setup = setup_filter_4x4_h, \
535 .check = check_filter_4x4_h)
536
537 #define REF_FILTERS(NAME, SUFFIX) \
538 &op_##NAME##1##SUFFIX, \
539 &op_##NAME##2##SUFFIX, \
540 &op_##NAME##3##SUFFIX, \
541 &op_##NAME##4##SUFFIX
542
543 #define DECL_FUNCS_8(SIZE, EXT, FLAG) \
544 DECL_RW(EXT, U8, read_planar, READ, 1, false, 0) \
545 DECL_RW(EXT, U8, read_planar, READ, 2, false, 0) \
546 DECL_RW(EXT, U8, read_planar, READ, 3, false, 0) \
547 DECL_RW(EXT, U8, read_planar, READ, 4, false, 0) \
548 DECL_RW(EXT, U8, write_planar, WRITE, 1, false, 0) \
549 DECL_RW(EXT, U8, write_planar, WRITE, 2, false, 0) \
550 DECL_RW(EXT, U8, write_planar, WRITE, 3, false, 0) \
551 DECL_RW(EXT, U8, write_planar, WRITE, 4, false, 0) \
552 DECL_RW(EXT, U8, read_nibbles, READ, 1, false, 1) \
553 DECL_RW(EXT, U8, read_bits, READ, 1, false, 3) \
554 DECL_RW(EXT, U8, write_bits, WRITE, 1, false, 3) \
555 DECL_EXPAND_BITS(EXT, 8) \
556 DECL_PACKED_RW(EXT, 8) \
557 DECL_PACK_UNPACK(EXT, U8, 1, 2, 1, 0) \
558 DECL_PACK_UNPACK(EXT, U8, 3, 3, 2, 0) \
559 DECL_PACK_UNPACK(EXT, U8, 2, 3, 3, 0) \
560 void ff_p1000_shuffle##EXT(void); \
561 void ff_p1001_shuffle##EXT(void); \
562 void ff_p1110_shuffle##EXT(void); \
563 void ff_p1111_shuffle##EXT(void); \
564 DECL_SWIZZLE(EXT, 3, 0, 1, 2) \
565 DECL_SWIZZLE(EXT, 3, 0, 2, 1) \
566 DECL_SWIZZLE(EXT, 2, 1, 0, 3) \
567 DECL_SWIZZLE(EXT, 3, 2, 1, 0) \
568 DECL_SWIZZLE(EXT, 3, 1, 0, 2) \
569 DECL_SWIZZLE(EXT, 3, 2, 0, 1) \
570 DECL_SWIZZLE(EXT, 1, 2, 0, 3) \
571 DECL_SWIZZLE(EXT, 1, 0, 2, 3) \
572 DECL_SWIZZLE(EXT, 2, 0, 1, 3) \
573 DECL_SWIZZLE(EXT, 2, 3, 1, 0) \
574 DECL_SWIZZLE(EXT, 2, 1, 3, 0) \
575 DECL_SWIZZLE(EXT, 1, 2, 3, 0) \
576 DECL_SWIZZLE(EXT, 1, 3, 2, 0) \
577 DECL_SWIZZLE(EXT, 0, 2, 1, 3) \
578 DECL_SWIZZLE(EXT, 0, 2, 3, 1) \
579 DECL_SWIZZLE(EXT, 0, 3, 1, 2) \
580 DECL_SWIZZLE(EXT, 3, 1, 2, 0) \
581 DECL_SWIZZLE(EXT, 0, 3, 2, 1) \
582 DECL_SWIZZLE(EXT, 0, 0, 0, 3) \
583 DECL_SWIZZLE(EXT, 3, 0, 0, 0) \
584 DECL_SWIZZLE(EXT, 0, 0, 0, 1) \
585 DECL_SWIZZLE(EXT, 1, 0, 0, 0) \
586 DECL_CLEAR_ALPHA(EXT, 0) \
587 DECL_CLEAR_ALPHA(EXT, 1) \
588 DECL_CLEAR_ALPHA(EXT, 3) \
589 DECL_CLEAR_ZERO(EXT, 0) \
590 DECL_CLEAR_ZERO(EXT, 1) \
591 DECL_CLEAR_ZERO(EXT, 3) \
592 DECL_CLEAR(EXT, 0, 0, 0, 1) \
593 DECL_CLEAR(EXT, 1, 0, 0, 0) \
594 DECL_CLEAR(EXT, 1, 1, 0, 0) \
595 DECL_CLEAR(EXT, 0, 1, 1, 0) \
596 DECL_CLEAR(EXT, 0, 0, 1, 1) \
597 DECL_CLEAR(EXT, 1, 0, 1, 0) \
598 DECL_CLEAR(EXT, 0, 1, 0, 1) \
599 DECL_CLEAR(EXT, 0, 1, 1, 1) \
600 DECL_CLEAR(EXT, 1, 0, 1, 1) \
601 DECL_CLEAR(EXT, 1, 1, 0, 1) \
602 \
603 static const SwsOpTable ops8##EXT = { \
604 .cpu_flags = AV_CPU_FLAG_##FLAG, \
605 .block_size = SIZE, \
606 .entries = { \
607 &op_read_planar1##EXT, \
608 &op_read_planar2##EXT, \
609 &op_read_planar3##EXT, \
610 &op_read_planar4##EXT, \
611 &op_write_planar1##EXT, \
612 &op_write_planar2##EXT, \
613 &op_write_planar3##EXT, \
614 &op_write_planar4##EXT, \
615 &op_read8_packed2##EXT, \
616 &op_read8_packed3##EXT, \
617 &op_read8_packed4##EXT, \
618 &op_write8_packed2##EXT, \
619 &op_write8_packed3##EXT, \
620 &op_write8_packed4##EXT, \
621 &op_read_nibbles1##EXT, \
622 &op_read_bits1##EXT, \
623 &op_write_bits1##EXT, \
624 &op_expand_bits8##EXT, \
625 &op_pack_1210##EXT, \
626 &op_pack_3320##EXT, \
627 &op_pack_2330##EXT, \
628 &op_unpack_1210##EXT, \
629 &op_unpack_3320##EXT, \
630 &op_unpack_2330##EXT, \
631 &op_swizzle_3012##EXT, \
632 &op_swizzle_3021##EXT, \
633 &op_swizzle_2103##EXT, \
634 &op_swizzle_3210##EXT, \
635 &op_swizzle_3102##EXT, \
636 &op_swizzle_3201##EXT, \
637 &op_swizzle_1203##EXT, \
638 &op_swizzle_1023##EXT, \
639 &op_swizzle_2013##EXT, \
640 &op_swizzle_2310##EXT, \
641 &op_swizzle_2130##EXT, \
642 &op_swizzle_1230##EXT, \
643 &op_swizzle_1320##EXT, \
644 &op_swizzle_0213##EXT, \
645 &op_swizzle_0231##EXT, \
646 &op_swizzle_0312##EXT, \
647 &op_swizzle_3120##EXT, \
648 &op_swizzle_0321##EXT, \
649 &op_swizzle_0003##EXT, \
650 &op_swizzle_0001##EXT, \
651 &op_swizzle_3000##EXT, \
652 &op_swizzle_1000##EXT, \
653 &op_clear_alpha0##EXT, \
654 &op_clear_alpha1##EXT, \
655 &op_clear_alpha3##EXT, \
656 &op_clear_zero0##EXT, \
657 &op_clear_zero1##EXT, \
658 &op_clear_zero3##EXT, \
659 REF_PATTERN(clear##EXT, 0, 0, 0, 1), \
660 REF_PATTERN(clear##EXT, 1, 0, 0, 0), \
661 REF_PATTERN(clear##EXT, 1, 1, 0, 0), \
662 REF_PATTERN(clear##EXT, 0, 1, 1, 0), \
663 REF_PATTERN(clear##EXT, 0, 0, 1, 1), \
664 REF_PATTERN(clear##EXT, 1, 0, 1, 0), \
665 REF_PATTERN(clear##EXT, 0, 1, 0, 1), \
666 REF_PATTERN(clear##EXT, 0, 1, 1, 1), \
667 REF_PATTERN(clear##EXT, 1, 0, 1, 1), \
668 REF_PATTERN(clear##EXT, 1, 1, 0, 1), \
669 NULL \
670 }, \
671 };
672
673 #define DECL_FUNCS_16(SIZE, EXT, FLAG) \
674 DECL_PACKED_RW(EXT, 16) \
675 DECL_EXPAND_BITS(EXT, 16) \
676 DECL_PACK_UNPACK(EXT, U16, 4, 4, 4, 0) \
677 DECL_PACK_UNPACK(EXT, U16, 5, 5, 5, 0) \
678 DECL_PACK_UNPACK(EXT, U16, 5, 6, 5, 0) \
679 DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 0) \
680 DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 1) \
681 DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 0) \
682 DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 1) \
683 DECL_SHIFT16(EXT) \
684 DECL_CONVERT(EXT, U8, U16) \
685 DECL_CONVERT(EXT, U16, U8) \
686 DECL_EXPAND(EXT, U8, U16) \
687 \
688 static const SwsOpTable ops16##EXT = { \
689 .cpu_flags = AV_CPU_FLAG_##FLAG, \
690 .block_size = SIZE, \
691 .entries = { \
692 &op_read16_packed2##EXT, \
693 &op_read16_packed3##EXT, \
694 &op_read16_packed4##EXT, \
695 &op_write16_packed2##EXT, \
696 &op_write16_packed3##EXT, \
697 &op_write16_packed4##EXT, \
698 &op_pack_4440##EXT, \
699 &op_pack_5550##EXT, \
700 &op_pack_5650##EXT, \
701 &op_unpack_4440##EXT, \
702 &op_unpack_5550##EXT, \
703 &op_unpack_5650##EXT, \
704 &op_expand_bits16##EXT, \
705 REF_COMMON_PATTERNS(swap_bytes_U16##EXT), \
706 REF_COMMON_PATTERNS(convert_U8_U16##EXT), \
707 REF_COMMON_PATTERNS(convert_U16_U8##EXT), \
708 REF_COMMON_PATTERNS(expand_U8_U16##EXT), \
709 REF_COMMON_PATTERNS(lshift16##EXT), \
710 REF_COMMON_PATTERNS(rshift16##EXT), \
711 NULL \
712 }, \
713 };
714
715 #define DECL_FUNCS_32(SIZE, EXT, FLAG) \
716 DECL_PACKED_RW(_m2##EXT, 32) \
717 DECL_PACK_UNPACK(_m2##EXT, U32, 10, 10, 10, 2) \
718 DECL_PACK_UNPACK(_m2##EXT, U32, 2, 10, 10, 10) \
719 DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 0) \
720 DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 1) \
721 DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 0) \
722 DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 1) \
723 DECL_CONVERT(EXT, U8, U32) \
724 DECL_CONVERT(EXT, U32, U8) \
725 DECL_CONVERT(EXT, U16, U32) \
726 DECL_CONVERT(EXT, U32, U16) \
727 DECL_CONVERT(EXT, U8, F32) \
728 DECL_CONVERT(EXT, F32, U8) \
729 DECL_CONVERT(EXT, U16, F32) \
730 DECL_CONVERT(EXT, F32, U16) \
731 DECL_EXPAND(EXT, U8, U32) \
732 DECL_MIN_MAX(EXT) \
733 DECL_SCALE(EXT) \
734 DECL_DITHER0(EXT) \
735 DECL_DITHER(EXT, 1) \
736 DECL_DITHER(EXT, 2) \
737 DECL_DITHER(EXT, 3) \
738 DECL_DITHER(EXT, 4) \
739 DECL_DITHER(EXT, 5) \
740 DECL_DITHER(EXT, 6) \
741 DECL_DITHER(EXT, 7) \
742 DECL_DITHER(EXT, 8) \
743 DECL_LINEAR(EXT, luma, SWS_MASK_LUMA) \
744 DECL_LINEAR(EXT, alpha, SWS_MASK_ALPHA) \
745 DECL_LINEAR(EXT, lumalpha, SWS_MASK_LUMA | SWS_MASK_ALPHA) \
746 DECL_LINEAR(EXT, yalpha, SWS_MASK(1, 1)) \
747 DECL_LINEAR(EXT, dot3, 0x7) \
748 DECL_LINEAR(EXT, dot3a, 0x7 | SWS_MASK_ALPHA) \
749 DECL_LINEAR(EXT, row0, SWS_MASK_ROW(0) ^ SWS_MASK(0, 3)) \
750 DECL_LINEAR(EXT, diag3, SWS_MASK_DIAG3) \
751 DECL_LINEAR(EXT, diag4, SWS_MASK_DIAG4) \
752 DECL_LINEAR(EXT, diagoff3, SWS_MASK_DIAG3 | SWS_MASK_OFF3) \
753 DECL_LINEAR(EXT, affine3, SWS_MASK_MAT3 | SWS_MASK_OFF3) \
754 DECL_LINEAR(EXT, affine3uv, \
755 SWS_MASK_MAT3 | SWS_MASK_OFF(1) | SWS_MASK_OFF(2)) \
756 DECL_LINEAR(EXT, affine3x, \
757 SWS_MASK_MAT3 ^ SWS_MASK(0, 1) | SWS_MASK_OFF3) \
758 DECL_LINEAR(EXT, affine3xa, \
759 SWS_MASK_MAT3 ^ SWS_MASK(0, 1) | SWS_MASK_OFF3 | SWS_MASK_ALPHA) \
760 DECL_LINEAR(EXT, affine3xy, \
761 SWS_MASK_MAT3 ^ SWS_MASK(0, 0) ^ SWS_MASK(0, 1) | SWS_MASK_OFF3) \
762 DECL_LINEAR(EXT, affine3a, \
763 SWS_MASK_MAT3 | SWS_MASK_OFF3 | SWS_MASK_ALPHA) \
764 DECL_FILTERS_GENERIC(EXT, U8) \
765 DECL_FILTERS_GENERIC(EXT, U16) \
766 DECL_FILTERS_GENERIC(EXT, F32) \
767 \
768 static const SwsOpTable ops32##EXT = { \
769 .cpu_flags = AV_CPU_FLAG_##FLAG, \
770 .block_size = SIZE, \
771 .entries = { \
772 &op_read32_packed2_m2##EXT, \
773 &op_read32_packed3_m2##EXT, \
774 &op_read32_packed4_m2##EXT, \
775 &op_write32_packed2_m2##EXT, \
776 &op_write32_packed3_m2##EXT, \
777 &op_write32_packed4_m2##EXT, \
778 &op_pack_1010102_m2##EXT, \
779 &op_pack_2101010_m2##EXT, \
780 &op_unpack_1010102_m2##EXT, \
781 &op_unpack_2101010_m2##EXT, \
782 REF_COMMON_PATTERNS(swap_bytes_U32_m2##EXT), \
783 REF_COMMON_PATTERNS(convert_U8_U32##EXT), \
784 REF_COMMON_PATTERNS(convert_U32_U8##EXT), \
785 REF_COMMON_PATTERNS(convert_U16_U32##EXT), \
786 REF_COMMON_PATTERNS(convert_U32_U16##EXT), \
787 REF_COMMON_PATTERNS(convert_U8_F32##EXT), \
788 REF_COMMON_PATTERNS(convert_F32_U8##EXT), \
789 REF_COMMON_PATTERNS(convert_U16_F32##EXT), \
790 REF_COMMON_PATTERNS(convert_F32_U16##EXT), \
791 REF_COMMON_PATTERNS(expand_U8_U32##EXT), \
792 REF_COMMON_PATTERNS(min##EXT), \
793 REF_COMMON_PATTERNS(max##EXT), \
794 REF_COMMON_PATTERNS(scale##EXT), \
795 REF_COMMON_PATTERNS(dither0##EXT), \
796 &op_dither1##EXT, \
797 &op_dither2##EXT, \
798 &op_dither3##EXT, \
799 &op_dither4##EXT, \
800 &op_dither5##EXT, \
801 &op_dither6##EXT, \
802 &op_dither7##EXT, \
803 &op_dither8##EXT, \
804 &op_luma##EXT, \
805 &op_alpha##EXT, \
806 &op_lumalpha##EXT, \
807 &op_yalpha##EXT, \
808 &op_dot3##EXT, \
809 &op_dot3a##EXT, \
810 &op_row0##EXT, \
811 &op_diag3##EXT, \
812 &op_diag4##EXT, \
813 &op_diagoff3##EXT, \
814 &op_affine3##EXT, \
815 &op_affine3uv##EXT, \
816 &op_affine3x##EXT, \
817 &op_affine3xa##EXT, \
818 &op_affine3xy##EXT, \
819 &op_affine3a##EXT, \
820 REF_FILTERS(filter_fma_v, _U8##EXT), \
821 REF_FILTERS(filter_fma_v, _U16##EXT), \
822 REF_FILTERS(filter_fma_v, _F32##EXT), \
823 REF_FILTERS(filter_4x4_h, _U8##EXT), \
824 REF_FILTERS(filter_4x4_h, _U16##EXT), \
825 REF_FILTERS(filter_4x4_h, _F32##EXT), \
826 REF_FILTERS(filter_v, _U8##EXT), \
827 REF_FILTERS(filter_v, _U16##EXT), \
828 REF_FILTERS(filter_v, _F32##EXT), \
829 REF_FILTERS(filter_h, _U8##EXT), \
830 REF_FILTERS(filter_h, _U16##EXT), \
831 REF_FILTERS(filter_h, _F32##EXT), \
832 NULL \
833 }, \
834 };
835
/* Instantiate the per-bit-depth op tables. NOTE(review): the DECL_FUNCS_*
 * macros are defined earlier in this file; the first argument appears to be
 * the block size in pixels, the second the asm name infix (the "_m1"/"_m2"
 * part presumably selects the register-multiple variant), and the third the
 * minimum CPU flag — confirm against the macro definitions above. */
DECL_FUNCS_8(16, _m1_sse4, SSE4)
DECL_FUNCS_8(32, _m1_avx2, AVX2)
DECL_FUNCS_8(32, _m2_sse4, SSE4)
DECL_FUNCS_8(64, _m2_avx2, AVX2)

DECL_FUNCS_16(16, _m1_avx2, AVX2)
DECL_FUNCS_16(32, _m2_avx2, AVX2)

DECL_FUNCS_32(16, _avx2, AVX2)
845
/* All op tables provided by this backend; passed as a set to
 * ff_sws_op_compile_tables() by compile() below. */
static const SwsOpTable *const tables[] = {
    &ops8_m1_sse4,
    &ops8_m1_avx2,
    &ops8_m2_sse4,
    &ops8_m2_avx2,
    &ops16_m1_avx2,
    &ops16_m2_avx2,
    &ops32_avx2,
};
855
856 26628 static av_const int get_mmsize(const int cpu_flags)
857 {
858
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 26628 times.
26628 if (cpu_flags & AV_CPU_FLAG_AVX512)
859 return 64;
860
2/2
✓ Branch 0 taken 15136 times.
✓ Branch 1 taken 11492 times.
26628 else if (cpu_flags & AV_CPU_FLAG_AVX2)
861 15136 return 32;
862
2/2
✓ Branch 0 taken 5304 times.
✓ Branch 1 taken 6188 times.
11492 else if (cpu_flags & AV_CPU_FLAG_SSE4)
863 5304 return 16;
864 else
865 6188 return AVERROR(ENOTSUP);
866 }
867
868 /**
869 * Returns true if the operation's implementation only depends on the block
870 * size, and not the underlying pixel type
871 */
872 121010 static bool op_is_type_invariant(const SwsOp *op)
873 {
874
3/3
✓ Branch 0 taken 34821 times.
✓ Branch 1 taken 13345 times.
✓ Branch 2 taken 72844 times.
121010 switch (op->op) {
875 34821 case SWS_OP_READ:
876 case SWS_OP_WRITE:
877
8/8
✓ Branch 0 taken 25769 times.
✓ Branch 1 taken 9052 times.
✓ Branch 2 taken 18642 times.
✓ Branch 3 taken 7127 times.
✓ Branch 4 taken 27198 times.
✓ Branch 5 taken 496 times.
✓ Branch 6 taken 24510 times.
✓ Branch 7 taken 2688 times.
34821 return !(op->rw.elems > 1 && op->rw.packed) && !op->rw.frac && !op->rw.filter;
878 13345 case SWS_OP_SWIZZLE:
879 case SWS_OP_CLEAR:
880 13345 return true;
881 }
882
883 72844 return false;
884 }
885
/**
 * Smallest load/store granularity that covers a transfer of @p bytes:
 * 4 bytes (movd), 8 bytes (movq), or the full vector width (movu).
 */
static int movsize(const int bytes, const int mmsize)
{
    if (bytes <= 4)
        return 4;      /* movd */
    if (bytes <= 8)
        return 8;      /* movq */
    return mmsize;     /* movu */
}
892
/**
 * Try to compile the whole op list into a single in-place packed byte
 * shuffle, handled by a dedicated asm kernel.
 *
 * @return 0 on success (filling *out); the negative error from
 *         ff_sws_solve_shuffle() when the list is not a pure shuffle
 *         (typically AVERROR(ENOTSUP), letting the caller fall back to the
 *         generic chain path); AVERROR(ENOMEM) on allocation failure.
 */
static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
{
    uint8_t shuffle[16];
    int read_bytes, write_bytes;
    int pixels;

    /* Solve the shuffle mask for one 128-bit lane only */
    pixels = ff_sws_solve_shuffle(ops, shuffle, 16, 0x80, &read_bytes, &write_bytes);
    if (pixels < 0)
        return pixels;

    /* We can't shuffle across lanes, so restrict the vector size to XMM
     * whenever the read/write size would be a subset of the full vector */
    if (read_bytes < 16 || write_bytes < 16)
        mmsize = 16;

    const int num_lanes = mmsize / 16;
    const int in_total = num_lanes * read_bytes;
    const int out_total = num_lanes * write_bytes;

    *out = (SwsCompiledOp) {
        .priv = av_memdup(shuffle, sizeof(shuffle)),
        .free = av_free,
        .slice_align = 1,
        .block_size = pixels * num_lanes,
        /* Extra bytes touched past the payload because loads/stores round up
         * to movd/movq/movu granularity (see movsize()) */
        .over_read = movsize(in_total, mmsize) - in_total,
        .over_write = movsize(out_total, mmsize) - out_total,
        .cpu_flags = mmsize > 32 ? AV_CPU_FLAG_AVX512 :
                     mmsize > 16 ? AV_CPU_FLAG_AVX2 :
                                   AV_CPU_FLAG_SSE4,
    };

    if (!out->priv)
        return AVERROR(ENOMEM);

/* Select the precompiled kernel matching the total per-iteration input and
 * output byte counts; a no-op when the sizes don't match */
#define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT)                                   \
    do {                                                                    \
        SWS_DECL_FUNC(ff_packed_shuffle##IN##_##OUT##_##EXT);               \
        if (in_total == IN && out_total == OUT)                             \
            out->func = ff_packed_shuffle##IN##_##OUT##_##EXT;              \
    } while (0)

    ASSIGN_SHUFFLE_FUNC( 5, 15, sse4);
    ASSIGN_SHUFFLE_FUNC( 4, 16, sse4);
    ASSIGN_SHUFFLE_FUNC( 2, 12, sse4);
    ASSIGN_SHUFFLE_FUNC(16,  8, sse4);
    ASSIGN_SHUFFLE_FUNC(10, 15, sse4);
    ASSIGN_SHUFFLE_FUNC( 8, 16, sse4);
    ASSIGN_SHUFFLE_FUNC( 4, 12, sse4);
    ASSIGN_SHUFFLE_FUNC(15,  5, sse4);
    ASSIGN_SHUFFLE_FUNC(15, 15, sse4);
    ASSIGN_SHUFFLE_FUNC(12, 16, sse4);
    ASSIGN_SHUFFLE_FUNC( 6, 12, sse4);
    ASSIGN_SHUFFLE_FUNC(16,  4, sse4);
    ASSIGN_SHUFFLE_FUNC(16, 12, sse4);
    ASSIGN_SHUFFLE_FUNC(16, 16, sse4);
    ASSIGN_SHUFFLE_FUNC( 8, 12, sse4);
    ASSIGN_SHUFFLE_FUNC(12, 12, sse4);
    ASSIGN_SHUFFLE_FUNC(32, 32, avx2);
    ASSIGN_SHUFFLE_FUNC(64, 64, avx512);
    /* NOTE(review): with assertions compiled out, an unmatched size pair
     * would leave out->func NULL; presumably ff_sws_solve_shuffle() only
     * yields combinations covered above — confirm */
    av_assert1(out->func);
    return 0;
}
956
/* Normalize clear values into 32-bit integer constants */
static void normalize_clear(SwsOp *op)
{
    static_assert(sizeof(uint32_t) == sizeof(int), "int size mismatch");
    SwsImplResult res;
    union {
        uint32_t u32;
        int i;
    } c;

    /* Compute the per-component clear constants in the op's native type */
    ff_sws_setup_clear(&(const SwsImplParams) { .op = op }, &res);

    for (int i = 0; i < 4; i++) {
        /* Only components selected by the clear mask carry a value */
        if (!SWS_COMP_TEST(op->clear.mask, i))
            continue;

        /* Broadcast the component value across a full 32-bit word so the
         * backend can splat it regardless of the original element size */
        switch (ff_sws_pixel_type_size(op->type)) {
        case 1: c.u32 = 0x1010101U * res.priv.u8[i]; break;
        case 2: c.u32 = (uint32_t) res.priv.u16[i] << 16 | res.priv.u16[i]; break;
        case 4: c.u32 = res.priv.u32[i]; break;
        }

        /* Type-pun via the union: stores the broadcast bit pattern as a
         * signed int without shifting/casting pitfalls */
        op->clear.value[i].num = c.i;
        op->clear.value[i].den = 1;
    }
}
982
/**
 * Backend entry point: compile an operation list into an executable kernel.
 *
 * First tries the single-shuffle fast path; otherwise builds an SwsOpChain
 * by compiling each op against the precompiled tables, then picks the
 * per-plane-count process function.
 *
 * @return 0 on success (filling *out), a negative AVERROR otherwise
 *         (including AVERROR(ENOTSUP) when no usable SIMD ISA is present).
 */
static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
{
    int ret;
    const int cpu_flags = av_get_cpu_flags();
    const int mmsize = get_mmsize(cpu_flags);
    if (mmsize < 0)
        return mmsize;

    /* Special fast path for in-place packed shuffle */
    ret = solve_shuffle(ops, mmsize, out);
    if (ret != AVERROR(ENOTSUP))
        return ret;

    SwsOpChain *chain = ff_sws_op_chain_alloc();
    if (!chain)
        return AVERROR(ENOMEM);

    *out = (SwsCompiledOp) {
        .priv = chain,
        .slice_align = 1,
        .free = ff_sws_op_chain_free_cb,

        /* Use at most two full YMM regs during the widest precision section */
        .block_size = 2 * FFMIN(mmsize, 32) / ff_sws_op_list_max_size(ops),
    };

    for (int i = 0; i < ops->num_ops; i++) {
        int op_block_size = out->block_size;
        SwsOp *op = &ops->ops[i];

        /* Type-invariant ops are retyped to U8 with a scaled block size so a
         * single byte-level implementation covers all pixel types */
        if (op_is_type_invariant(op)) {
            if (op->op == SWS_OP_CLEAR)
                normalize_clear(op);
            op_block_size *= ff_sws_pixel_type_size(op->type);
            op->type = SWS_PIXEL_U8;
        }

        ret = ff_sws_op_compile_tables(ctx, tables, FF_ARRAY_ELEMS(tables),
                                       ops, i, op_block_size, chain);
        if (ret < 0) {
            av_log(ctx, AV_LOG_TRACE, "Failed to compile op %d\n", i);
            ff_sws_op_chain_free(chain);
            return ret;
        }
    }

/* Declare and assign the named asm process entry point */
#define ASSIGN_PROCESS_FUNC(NAME)                                              \
    do {                                                                       \
        SWS_DECL_FUNC(NAME);                                                   \
        out->func = NAME;                                                      \
    } while (0)

    /* NOTE(review): `read` is NULL-checked below but `write` is dereferenced
     * unconditionally — presumably ff_sws_op_list_output() never returns
     * NULL for a compilable list; confirm */
    const SwsOp *read = ff_sws_op_list_input(ops);
    const SwsOp *write = ff_sws_op_list_output(ops);
    const int read_planes = read ? (read->rw.packed ? 1 : read->rw.elems) : 0;
    const int write_planes = write->rw.packed ? 1 : write->rw.elems;
    /* The process wrapper must iterate over the larger of the input/output
     * plane counts */
    switch (FFMAX(read_planes, write_planes)) {
    case 1: ASSIGN_PROCESS_FUNC(ff_sws_process1_x86); break;
    case 2: ASSIGN_PROCESS_FUNC(ff_sws_process2_x86); break;
    case 3: ASSIGN_PROCESS_FUNC(ff_sws_process3_x86); break;
    case 4: ASSIGN_PROCESS_FUNC(ff_sws_process4_x86); break;
    }

    if (ret < 0) {
        ff_sws_op_chain_free(chain);
        return ret;
    }

    /* Propagate the aggregated requirements of the compiled chain */
    out->cpu_flags = chain->cpu_flags;
    out->over_read = chain->over_read;
    out->over_write = chain->over_write;
    return 0;
}
1056
/* x86 implementation of the SwsOpBackend interface; this backend has no
 * dedicated hardware pixel format, hence AV_PIX_FMT_NONE */
const SwsOpBackend backend_x86 = {
    .name = "x86",
    .compile = compile,
    .hw_format = AV_PIX_FMT_NONE,
};
1062