| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /** | ||
| 2 | * Copyright (C) 2025 Niklas Haas | ||
| 3 | * | ||
| 4 | * This file is part of FFmpeg. | ||
| 5 | * | ||
| 6 | * FFmpeg is free software; you can redistribute it and/or | ||
| 7 | * modify it under the terms of the GNU Lesser General Public | ||
| 8 | * License as published by the Free Software Foundation; either | ||
| 9 | * version 2.1 of the License, or (at your option) any later version. | ||
| 10 | * | ||
| 11 | * FFmpeg is distributed in the hope that it will be useful, | ||
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 14 | * Lesser General Public License for more details. | ||
| 15 | * | ||
| 16 | * You should have received a copy of the GNU Lesser General Public | ||
| 17 | * License along with FFmpeg; if not, write to the Free Software | ||
| 18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 19 | */ | ||
| 20 | |||
| 21 | #include "libavutil/avassert.h" | ||
| 22 | #include "libavutil/cpu.h" | ||
| 23 | #include "libavutil/mathematics.h" | ||
| 24 | #include "libavutil/mem.h" | ||
| 25 | #include "libavutil/mem_internal.h" | ||
| 26 | #include "libavutil/refstruct.h" | ||
| 27 | |||
| 28 | #include "ops.h" | ||
| 29 | #include "ops_internal.h" | ||
| 30 | #include "ops_dispatch.h" | ||
| 31 | |||
| 32 | typedef struct SwsOpPass { | ||
| 33 | SwsCompiledOp comp; | ||
| 34 | SwsOpExec exec_base; | ||
| 35 | SwsOpExec exec_tail; | ||
| 36 | size_t num_blocks; | ||
| 37 | int tail_off_in; | ||
| 38 | int tail_off_out; | ||
| 39 | int tail_size_in; | ||
| 40 | int tail_size_out; | ||
| 41 | int planes_in; | ||
| 42 | int planes_out; | ||
| 43 | int pixel_bits_in; | ||
| 44 | int pixel_bits_out; | ||
| 45 | int idx_in[4]; | ||
| 46 | int idx_out[4]; | ||
| 47 | int *offsets_y; | ||
| 48 | int filter_size; | ||
| 49 | bool memcpy_first; | ||
| 50 | bool memcpy_last; | ||
| 51 | bool memcpy_out; | ||
| 52 | size_t tail_blocks; | ||
| 53 | uint8_t *tail_buf; /* extra memory for fixing unpadded tails */ | ||
| 54 | unsigned int tail_buf_size; | ||
| 55 | } SwsOpPass; | ||
| 56 | |||
| 57 | 66685 | int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend, | |
| 58 | const SwsOpList *ops, SwsCompiledOp *out) | ||
| 59 | { | ||
| 60 | SwsOpList *copy; | ||
| 61 | 66685 | SwsCompiledOp compiled = {0}; | |
| 62 | 66685 | int ret = 0; | |
| 63 | |||
| 64 | 66685 | copy = ff_sws_op_list_duplicate(ops); | |
| 65 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 66685 times.
|
66685 | if (!copy) |
| 66 | ✗ | return AVERROR(ENOMEM); | |
| 67 | |||
| 68 | /* Ensure these are always set during compilation */ | ||
| 69 | 66685 | ff_sws_op_list_update_comps(copy); | |
| 70 | |||
| 71 | 66685 | ret = backend->compile(ctx, copy, &compiled); | |
| 72 |
2/2✓ Branch 0 taken 36496 times.
✓ Branch 1 taken 30189 times.
|
66685 | if (ret < 0) { |
| 73 |
1/2✓ Branch 0 taken 36496 times.
✗ Branch 1 not taken.
|
36496 | int msg_lev = ret == AVERROR(ENOTSUP) ? AV_LOG_TRACE : AV_LOG_ERROR; |
| 74 | 72992 | av_log(ctx, msg_lev, "Backend '%s' failed to compile operations: %s\n", | |
| 75 | 36496 | backend->name, av_err2str(ret)); | |
| 76 | } else { | ||
| 77 | 30189 | *out = compiled; | |
| 78 | } | ||
| 79 | |||
| 80 | 66685 | ff_sws_op_list_free(©); | |
| 81 | 66685 | return ret; | |
| 82 | } | ||
| 83 | |||
| 84 | 14280 | int ff_sws_ops_compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out) | |
| 85 | { | ||
| 86 |
1/2✓ Branch 0 taken 29557 times.
✗ Branch 1 not taken.
|
29557 | for (int n = 0; ff_sws_op_backends[n]; n++) { |
| 87 | 29557 | const SwsOpBackend *backend = ff_sws_op_backends[n]; | |
| 88 |
1/2✓ Branch 0 taken 29557 times.
✗ Branch 1 not taken.
|
29557 | if (ops->src.hw_format != backend->hw_format || |
| 89 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 29557 times.
|
29557 | ops->dst.hw_format != backend->hw_format) |
| 90 | ✗ | continue; | |
| 91 |
2/2✓ Branch 1 taken 15277 times.
✓ Branch 2 taken 14280 times.
|
29557 | if (ff_sws_ops_compile_backend(ctx, backend, ops, out) < 0) |
| 92 | 15277 | continue; | |
| 93 | |||
| 94 | 14280 | av_log(ctx, AV_LOG_VERBOSE, "Compiled using backend '%s': " | |
| 95 | "block size = %d, over-read = %d, over-write = %d, cpu flags = 0x%x\n", | ||
| 96 | 14280 | backend->name, out->block_size, out->over_read, out->over_write, | |
| 97 | out->cpu_flags); | ||
| 98 | |||
| 99 | 14280 | ff_sws_op_list_print(ctx, AV_LOG_VERBOSE, AV_LOG_TRACE, ops); | |
| 100 | 14280 | return 0; | |
| 101 | } | ||
| 102 | |||
| 103 | ✗ | return AVERROR(ENOTSUP); | |
| 104 | } | ||
| 105 | |||
| 106 | 30189 | void ff_sws_compiled_op_unref(SwsCompiledOp *comp) | |
| 107 | { | ||
| 108 |
1/2✓ Branch 0 taken 30189 times.
✗ Branch 1 not taken.
|
30189 | if (comp->free) |
| 109 | 30189 | comp->free(comp->priv); | |
| 110 | |||
| 111 | 30189 | *comp = (SwsCompiledOp) {0}; | |
| 112 | 30189 | } | |
| 113 | |||
| 114 | 14280 | static void op_pass_free(void *ptr) | |
| 115 | { | ||
| 116 | 14280 | SwsOpPass *p = ptr; | |
| 117 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14280 times.
|
14280 | if (!p) |
| 118 | ✗ | return; | |
| 119 | |||
| 120 | 14280 | ff_sws_compiled_op_unref(&p->comp); | |
| 121 | 14280 | av_refstruct_unref(&p->offsets_y); | |
| 122 | 14280 | av_free(p->exec_base.in_bump_y); | |
| 123 | 14280 | av_free(p->exec_base.in_offset_x); | |
| 124 | 14280 | av_free(p->tail_buf); | |
| 125 | 14280 | av_free(p); | |
| 126 | } | ||
| 127 | |||
| 128 | 14280 | static inline void get_row_data(const SwsOpPass *p, const int y_dst, | |
| 129 | const uint8_t *in[4], uint8_t *out[4]) | ||
| 130 | { | ||
| 131 | 14280 | const SwsOpExec *base = &p->exec_base; | |
| 132 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14280 times.
|
14280 | const int y_src = p->offsets_y ? p->offsets_y[y_dst] : y_dst; |
| 133 |
2/2✓ Branch 0 taken 26774 times.
✓ Branch 1 taken 14280 times.
|
41054 | for (int i = 0; i < p->planes_in; i++) |
| 134 | 26774 | in[i] = base->in[i] + (y_src >> base->in_sub_y[i]) * base->in_stride[i]; | |
| 135 |
2/2✓ Branch 0 taken 29274 times.
✓ Branch 1 taken 14280 times.
|
43554 | for (int i = 0; i < p->planes_out; i++) |
| 136 | 29274 | out[i] = base->out[i] + (y_dst >> base->out_sub_y[i]) * base->out_stride[i]; | |
| 137 | 14280 | } | |
| 138 | |||
| 139 | 4577 | static inline size_t pixel_bytes(size_t pixels, int pixel_bits, | |
| 140 | enum AVRounding rounding) | ||
| 141 | { | ||
| 142 | 4577 | const uint64_t bits = (uint64_t) pixels * pixel_bits; | |
| 143 |
2/3✓ Branch 0 taken 1804 times.
✓ Branch 1 taken 2773 times.
✗ Branch 2 not taken.
|
4577 | switch (rounding) { |
| 144 | 1804 | case AV_ROUND_ZERO: | |
| 145 | case AV_ROUND_DOWN: | ||
| 146 | 1804 | return bits >> 3; | |
| 147 | 2773 | case AV_ROUND_INF: | |
| 148 | case AV_ROUND_UP: | ||
| 149 | 2773 | return (bits + 7) >> 3; | |
| 150 | ✗ | default: | |
| 151 | ✗ | av_unreachable("Invalid rounding mode"); | |
| 152 | return (size_t) -1; | ||
| 153 | } | ||
| 154 | } | ||
| 155 | |||
/**
 * Number of bytes per line that can be processed without the kernel's
 * over-read / over-write running past the end of the line.
 *
 * @param linesize  line size in bytes; may be negative, must be non-zero
 * @param plane_pad bytes of padding the kernel requires
 * @return |linesize| - plane_pad, clamped to zero
 */
static size_t safe_bytes_pad(int linesize, int plane_pad)
{
    av_assert1(linesize);

    /* Widen first so that taking the absolute value cannot overflow */
    const int64_t abs_linesize = FFABS((int64_t) linesize);
    const int64_t remaining    = abs_linesize - plane_pad;

    return remaining > 0 ? remaining : 0;
}
| 162 | |||
/**
 * Count how many leading blocks only use input offsets within a safe limit.
 *
 * Starting from `num_blocks`, blocks are dropped from the end for as long as
 * the offset of the last pixel of the last remaining block exceeds
 * `safe_offset`.
 *
 * @param num_blocks   total number of blocks per line
 * @param block_size   pixels per block
 * @param safe_offset  largest byte offset considered safe
 * @param offset_bytes per-pixel byte offsets, at least
 *                     num_blocks * block_size entries
 * @return number of leading blocks that are safe (possibly 0)
 */
static size_t safe_blocks_offset(size_t num_blocks, unsigned block_size,
                                 ptrdiff_t safe_offset,
                                 const int32_t *offset_bytes)
{
    size_t blocks;

    for (blocks = num_blocks; blocks > 0; blocks--) {
        const size_t last_pixel = blocks * block_size - 1;
        if (offset_bytes[last_pixel] <= safe_offset)
            break;
    }

    return blocks;
}
| 172 | |||
| 173 | 14280 | static int op_pass_setup(const SwsFrame *out, const SwsFrame *in, | |
| 174 | const SwsPass *pass) | ||
| 175 | { | ||
| 176 | 14280 | const AVPixFmtDescriptor *indesc = av_pix_fmt_desc_get(in->format); | |
| 177 | 14280 | const AVPixFmtDescriptor *outdesc = av_pix_fmt_desc_get(out->format); | |
| 178 | |||
| 179 | 14280 | SwsOpPass *p = pass->priv; | |
| 180 | 14280 | SwsOpExec *exec = &p->exec_base; | |
| 181 | 14280 | const SwsCompiledOp *comp = &p->comp; | |
| 182 | |||
| 183 | /* Set up main loop parameters */ | ||
| 184 | 14280 | const unsigned block_size = comp->block_size; | |
| 185 | 14280 | const size_t num_blocks = (pass->width + block_size - 1) / block_size; | |
| 186 | 14280 | const size_t aligned_w = num_blocks * block_size; | |
| 187 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14280 times.
|
14280 | if (aligned_w < pass->width) /* overflow */ |
| 188 | ✗ | return AVERROR(EINVAL); | |
| 189 | 14280 | p->num_blocks = num_blocks; | |
| 190 | 14280 | p->memcpy_first = false; | |
| 191 | 14280 | p->memcpy_last = false; | |
| 192 | 14280 | p->memcpy_out = false; | |
| 193 | |||
| 194 | 14280 | size_t safe_blocks = num_blocks; | |
| 195 |
2/2✓ Branch 0 taken 26774 times.
✓ Branch 1 taken 14280 times.
|
41054 | for (int i = 0; i < p->planes_in; i++) { |
| 196 | 26774 | int idx = p->idx_in[i]; | |
| 197 |
4/4✓ Branch 0 taken 20917 times.
✓ Branch 1 taken 5857 times.
✓ Branch 2 taken 5857 times.
✓ Branch 3 taken 15060 times.
|
26774 | int chroma = idx == 1 || idx == 2; |
| 198 |
2/2✓ Branch 0 taken 11714 times.
✓ Branch 1 taken 15060 times.
|
26774 | int sub_x = chroma ? indesc->log2_chroma_w : 0; |
| 199 |
2/2✓ Branch 0 taken 11714 times.
✓ Branch 1 taken 15060 times.
|
26774 | int sub_y = chroma ? indesc->log2_chroma_h : 0; |
| 200 | 26774 | size_t safe_bytes = safe_bytes_pad(in->linesize[idx], comp->over_read); | |
| 201 | size_t safe_blocks_in; | ||
| 202 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 26774 times.
|
26774 | if (exec->in_offset_x) { |
| 203 | ✗ | size_t filter_size = pixel_bytes(p->filter_size, p->pixel_bits_in, | |
| 204 | AV_ROUND_UP); | ||
| 205 | ✗ | safe_blocks_in = safe_blocks_offset(num_blocks, block_size, | |
| 206 | ✗ | safe_bytes - filter_size, | |
| 207 | ✗ | exec->in_offset_x); | |
| 208 | } else { | ||
| 209 | 26774 | safe_blocks_in = safe_bytes / exec->block_size_in; | |
| 210 | } | ||
| 211 | |||
| 212 |
2/2✓ Branch 0 taken 969 times.
✓ Branch 1 taken 25805 times.
|
26774 | if (safe_blocks_in < num_blocks) { |
| 213 | 969 | p->memcpy_first |= in->linesize[idx] < 0; | |
| 214 | 969 | p->memcpy_last |= in->linesize[idx] > 0; | |
| 215 | 969 | safe_blocks = FFMIN(safe_blocks, safe_blocks_in); | |
| 216 | } | ||
| 217 | |||
| 218 | 26774 | size_t loop_size = num_blocks * exec->block_size_in; | |
| 219 | 26774 | exec->in[i] = in->data[idx]; | |
| 220 | 26774 | exec->in_stride[i] = in->linesize[idx]; | |
| 221 | 26774 | exec->in_bump[i] = in->linesize[idx] - loop_size; | |
| 222 | 26774 | exec->in_sub_y[i] = sub_y; | |
| 223 | 26774 | exec->in_sub_x[i] = sub_x; | |
| 224 | } | ||
| 225 | |||
| 226 |
2/2✓ Branch 0 taken 29274 times.
✓ Branch 1 taken 14280 times.
|
43554 | for (int i = 0; i < p->planes_out; i++) { |
| 227 | 29274 | int idx = p->idx_out[i]; | |
| 228 |
4/4✓ Branch 0 taken 22967 times.
✓ Branch 1 taken 6307 times.
✓ Branch 2 taken 6307 times.
✓ Branch 3 taken 16660 times.
|
29274 | int chroma = idx == 1 || idx == 2; |
| 229 |
2/2✓ Branch 0 taken 12614 times.
✓ Branch 1 taken 16660 times.
|
29274 | int sub_x = chroma ? outdesc->log2_chroma_w : 0; |
| 230 |
2/2✓ Branch 0 taken 12614 times.
✓ Branch 1 taken 16660 times.
|
29274 | int sub_y = chroma ? outdesc->log2_chroma_h : 0; |
| 231 | 29274 | size_t safe_bytes = safe_bytes_pad(out->linesize[idx], comp->over_write); | |
| 232 | 29274 | size_t safe_blocks_out = safe_bytes / exec->block_size_out; | |
| 233 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 29274 times.
|
29274 | if (safe_blocks_out < num_blocks) { |
| 234 | ✗ | p->memcpy_out = true; | |
| 235 | ✗ | safe_blocks = FFMIN(safe_blocks, safe_blocks_out); | |
| 236 | } | ||
| 237 | |||
| 238 | 29274 | size_t loop_size = num_blocks * exec->block_size_out; | |
| 239 | 29274 | exec->out[i] = out->data[idx]; | |
| 240 | 29274 | exec->out_stride[i] = out->linesize[idx]; | |
| 241 | 29274 | exec->out_bump[i] = out->linesize[idx] - loop_size; | |
| 242 | 29274 | exec->out_sub_y[i] = sub_y; | |
| 243 | 29274 | exec->out_sub_x[i] = sub_x; | |
| 244 | } | ||
| 245 | |||
| 246 |
3/4✓ Branch 0 taken 14280 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 902 times.
✓ Branch 3 taken 13378 times.
|
14280 | const bool memcpy_in = p->memcpy_first || p->memcpy_last; |
| 247 |
3/4✓ Branch 0 taken 13378 times.
✓ Branch 1 taken 902 times.
✓ Branch 2 taken 13378 times.
✗ Branch 3 not taken.
|
14280 | if (!memcpy_in && !p->memcpy_out) { |
| 248 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 13378 times.
|
13378 | av_assert0(safe_blocks == num_blocks); |
| 249 | 13378 | return 0; | |
| 250 | } | ||
| 251 | |||
| 252 | /* Set-up tail section parameters and buffers */ | ||
| 253 | 902 | SwsOpExec *tail = &p->exec_tail; | |
| 254 | 902 | const int align = av_cpu_max_align(); | |
| 255 | 902 | size_t alloc_size = 0; | |
| 256 | 902 | *tail = *exec; | |
| 257 | |||
| 258 | 902 | const size_t safe_width = safe_blocks * block_size; | |
| 259 | 902 | const size_t tail_size = pass->width - safe_width; | |
| 260 | 902 | p->tail_off_out = pixel_bytes(safe_width, p->pixel_bits_out, AV_ROUND_DOWN); | |
| 261 | 902 | p->tail_size_out = pixel_bytes(tail_size, p->pixel_bits_out, AV_ROUND_UP); | |
| 262 | 902 | p->tail_blocks = num_blocks - safe_blocks; | |
| 263 | |||
| 264 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 902 times.
|
902 | if (exec->in_offset_x) { |
| 265 | ✗ | p->tail_off_in = exec->in_offset_x[safe_width]; | |
| 266 | ✗ | p->tail_size_in = exec->in_offset_x[pass->width - 1] - p->tail_off_in; | |
| 267 | ✗ | p->tail_size_in += pixel_bytes(p->filter_size, p->pixel_bits_in, AV_ROUND_UP); | |
| 268 | } else { | ||
| 269 | 902 | p->tail_off_in = pixel_bytes(safe_width, p->pixel_bits_in, AV_ROUND_DOWN); | |
| 270 | 902 | p->tail_size_in = pixel_bytes(tail_size, p->pixel_bits_in, AV_ROUND_UP); | |
| 271 | } | ||
| 272 | |||
| 273 | 902 | const size_t alloc_width = aligned_w - safe_width; | |
| 274 |
3/4✓ Branch 0 taken 1871 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 969 times.
✓ Branch 3 taken 902 times.
|
1871 | for (int i = 0; memcpy_in && i < p->planes_in; i++) { |
| 275 | size_t needed_size; | ||
| 276 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 969 times.
|
969 | if (exec->in_offset_x) { |
| 277 | /* The input offset map is already padded to multiples of the block | ||
| 278 | * size, and clamps the input offsets to the image boundaries; so | ||
| 279 | * we just need to compensate for the comp->over_read */ | ||
| 280 | ✗ | needed_size = p->tail_size_in; | |
| 281 | } else { | ||
| 282 | 969 | needed_size = pixel_bytes(alloc_width, p->pixel_bits_in, AV_ROUND_UP); | |
| 283 | } | ||
| 284 | 969 | size_t loop_size = p->tail_blocks * exec->block_size_in; | |
| 285 | 969 | tail->in_stride[i] = FFALIGN(needed_size + comp->over_read, align); | |
| 286 | 969 | tail->in_bump[i] = tail->in_stride[i] - loop_size; | |
| 287 | 969 | alloc_size += tail->in_stride[i] * in->height; | |
| 288 | } | ||
| 289 | |||
| 290 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 902 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
902 | for (int i = 0; p->memcpy_out && i < p->planes_out; i++) { |
| 291 | ✗ | size_t needed_size = pixel_bytes(alloc_width, p->pixel_bits_out, AV_ROUND_UP); | |
| 292 | ✗ | size_t loop_size = p->tail_blocks * exec->block_size_out; | |
| 293 | ✗ | tail->out_stride[i] = FFALIGN(needed_size + comp->over_write, align); | |
| 294 | ✗ | tail->out_bump[i] = tail->out_stride[i] - loop_size; | |
| 295 | ✗ | alloc_size += tail->out_stride[i] * out->height; | |
| 296 | } | ||
| 297 | |||
| 298 |
2/4✓ Branch 0 taken 902 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 902 times.
|
902 | if (memcpy_in && exec->in_offset_x) { |
| 299 | /* `in_offset_x` is indexed relative to the line start, not the start | ||
| 300 | * of the section being processed; so we need to over-allocate this | ||
| 301 | * array to the full width of the image, even though we will only | ||
| 302 | * partially fill in the offsets relevant to the tail region */ | ||
| 303 | ✗ | alloc_size += aligned_w * sizeof(*exec->in_offset_x); | |
| 304 | } | ||
| 305 | |||
| 306 | 902 | av_fast_mallocz(&p->tail_buf, &p->tail_buf_size, alloc_size); | |
| 307 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 902 times.
|
902 | if (!p->tail_buf) |
| 308 | ✗ | return AVERROR(ENOMEM); | |
| 309 | |||
| 310 | 902 | uint8_t *tail_buf = p->tail_buf; | |
| 311 |
3/4✓ Branch 0 taken 1871 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 969 times.
✓ Branch 3 taken 902 times.
|
1871 | for (int i = 0; memcpy_in && i < p->planes_in; i++) { |
| 312 | 969 | tail->in[i] = tail_buf; | |
| 313 | 969 | tail_buf += tail->in_stride[i] * in->height; | |
| 314 | } | ||
| 315 | |||
| 316 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 902 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
902 | for (int i = 0; p->memcpy_out && i < p->planes_out; i++) { |
| 317 | ✗ | tail->out[i] = tail_buf; | |
| 318 | ✗ | tail_buf += tail->out_stride[i] * out->height; | |
| 319 | } | ||
| 320 | |||
| 321 |
2/4✓ Branch 0 taken 902 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 902 times.
|
902 | if (memcpy_in && exec->in_offset_x) { |
| 322 | ✗ | tail->in_offset_x = (int32_t *) tail_buf; | |
| 323 | ✗ | for (int i = safe_width; i < aligned_w; i++) | |
| 324 | ✗ | tail->in_offset_x[i] = exec->in_offset_x[i] - p->tail_off_in; | |
| 325 | } | ||
| 326 | |||
| 327 | 902 | return 0; | |
| 328 | } | ||
| 329 | |||
/**
 * Copy a rectangle of bytes between two buffers with independent strides.
 *
 * @param dst        first destination line
 * @param dst_stride distance in bytes between destination lines
 * @param src        first source line
 * @param src_stride distance in bytes between source lines
 * @param h          number of lines to copy; nothing is copied if h <= 0
 * @param bytes      number of bytes copied per line
 */
static void copy_lines(uint8_t *dst, const size_t dst_stride,
                       const uint8_t *src, const size_t src_stride,
                       const int h, const size_t bytes)
{
    uint8_t *dst_line = dst;
    const uint8_t *src_line = src;

    for (int rows_left = h; rows_left > 0; rows_left--) {
        memcpy(dst_line, src_line, bytes);
        dst_line += dst_stride;
        src_line += src_stride;
    }
}
| 340 | |||
| 341 | 14280 | static void op_pass_run(const SwsFrame *out, const SwsFrame *in, const int y, | |
| 342 | const int h, const SwsPass *pass) | ||
| 343 | { | ||
| 344 | 14280 | const SwsOpPass *p = pass->priv; | |
| 345 | 14280 | const SwsCompiledOp *comp = &p->comp; | |
| 346 | |||
| 347 | /* Fill exec metadata for this slice */ | ||
| 348 | 14280 | DECLARE_ALIGNED_32(SwsOpExec, exec) = p->exec_base; | |
| 349 | 14280 | exec.slice_y = y; | |
| 350 | 14280 | exec.slice_h = h; | |
| 351 | |||
| 352 | /** | ||
| 353 | * To ensure safety, we need to consider the following: | ||
| 354 | * | ||
| 355 | * 1. We can overread the input, unless this is the last line of an | ||
| 356 | * unpadded buffer. All defined operations can handle arbitrary pixel | ||
| 357 | * input, so overread of arbitrary data is fine. For flipped images, | ||
| 358 | * this condition is actually *inverted* to where the first line is | ||
| 359 | * the one at the end of the buffer. | ||
| 360 | * | ||
| 361 | * 2. We can overwrite the output, as long as we don't write more than the | ||
| 362 | * amount of pixels that fit into one linesize. So we always need to | ||
| 363 | * memcpy the last column on the output side if unpadded. | ||
| 364 | */ | ||
| 365 | |||
| 366 |
3/4✓ Branch 0 taken 902 times.
✓ Branch 1 taken 13378 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 902 times.
|
27658 | const bool memcpy_in = p->memcpy_last && y + h == pass->height || |
| 367 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 13378 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
13378 | p->memcpy_first && y == 0; |
| 368 | 14280 | const bool memcpy_out = p->memcpy_out; | |
| 369 | 14280 | const size_t num_blocks = p->num_blocks; | |
| 370 | 14280 | const size_t tail_blocks = p->tail_blocks; | |
| 371 | |||
| 372 | 14280 | get_row_data(p, y, exec.in, exec.out); | |
| 373 |
3/4✓ Branch 0 taken 13378 times.
✓ Branch 1 taken 902 times.
✓ Branch 2 taken 13378 times.
✗ Branch 3 not taken.
|
14280 | if (!memcpy_in && !memcpy_out) { |
| 374 | /* Fast path (fully aligned/padded inputs and outputs) */ | ||
| 375 | 13378 | comp->func(&exec, comp->priv, 0, y, num_blocks, y + h); | |
| 376 | 13378 | return; | |
| 377 | } | ||
| 378 | |||
| 379 | /* Non-aligned case (slow path); process main blocks as normal, and | ||
| 380 | * a separate tail (via memcpy into an appropriately padded buffer) */ | ||
| 381 |
1/2✓ Branch 0 taken 902 times.
✗ Branch 1 not taken.
|
902 | if (num_blocks > tail_blocks) { |
| 382 |
2/2✓ Branch 0 taken 3608 times.
✓ Branch 1 taken 902 times.
|
4510 | for (int i = 0; i < 4; i++) { |
| 383 | /* We process fewer blocks, so the in_bump needs to be increased | ||
| 384 | * to reflect that the plane pointers are left on the last block, | ||
| 385 | * not the end of the processed line, after each loop iteration */ | ||
| 386 | 3608 | exec.in_bump[i] += exec.block_size_in * tail_blocks; | |
| 387 | 3608 | exec.out_bump[i] += exec.block_size_out * tail_blocks; | |
| 388 | } | ||
| 389 | |||
| 390 | 902 | comp->func(&exec, comp->priv, 0, y, num_blocks - tail_blocks, y + h); | |
| 391 | } | ||
| 392 | |||
| 393 | 902 | DECLARE_ALIGNED_32(SwsOpExec, tail) = p->exec_tail; | |
| 394 | 902 | tail.slice_y = y; | |
| 395 | 902 | tail.slice_h = h; | |
| 396 | |||
| 397 |
2/2✓ Branch 0 taken 969 times.
✓ Branch 1 taken 902 times.
|
1871 | for (int i = 0; i < p->planes_in; i++) { |
| 398 | /* Input offsets are relative to the base pointer */ | ||
| 399 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 969 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
969 | if (!exec.in_offset_x || memcpy_in) |
| 400 | 969 | exec.in[i] += p->tail_off_in; | |
| 401 | 969 | tail.in[i] += y * tail.in_stride[i]; | |
| 402 | } | ||
| 403 |
2/2✓ Branch 0 taken 1856 times.
✓ Branch 1 taken 902 times.
|
2758 | for (int i = 0; i < p->planes_out; i++) { |
| 404 | 1856 | exec.out[i] += p->tail_off_out; | |
| 405 | 1856 | tail.out[i] += y * tail.out_stride[i]; | |
| 406 | } | ||
| 407 | |||
| 408 |
2/2✓ Branch 0 taken 969 times.
✓ Branch 1 taken 902 times.
|
1871 | for (int i = 0; i < p->planes_in; i++) { |
| 409 |
1/2✓ Branch 0 taken 969 times.
✗ Branch 1 not taken.
|
969 | if (memcpy_in) { |
| 410 | 969 | copy_lines((uint8_t *) tail.in[i], tail.in_stride[i], | |
| 411 | 969 | exec.in[i], exec.in_stride[i], h, p->tail_size_in); | |
| 412 | } else { | ||
| 413 | /* Reuse input pointers directly */ | ||
| 414 | ✗ | const size_t loop_size = tail_blocks * exec.block_size_in; | |
| 415 | ✗ | tail.in[i] = exec.in[i]; | |
| 416 | ✗ | tail.in_stride[i] = exec.in_stride[i]; | |
| 417 | ✗ | tail.in_bump[i] = exec.in_stride[i] - loop_size; | |
| 418 | } | ||
| 419 | } | ||
| 420 | |||
| 421 |
3/4✓ Branch 0 taken 2758 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1856 times.
✓ Branch 3 taken 902 times.
|
2758 | for (int i = 0; !memcpy_out && i < p->planes_out; i++) { |
| 422 | /* Reuse output pointers directly */ | ||
| 423 | 1856 | const size_t loop_size = tail_blocks * exec.block_size_out; | |
| 424 | 1856 | tail.out[i] = exec.out[i]; | |
| 425 | 1856 | tail.out_stride[i] = exec.out_stride[i]; | |
| 426 | 1856 | tail.out_bump[i] = exec.out_stride[i] - loop_size; | |
| 427 | } | ||
| 428 | |||
| 429 | /* Dispatch kernel over tail */ | ||
| 430 | av_assert1(tail_blocks > 0); | ||
| 431 | 902 | comp->func(&tail, comp->priv, num_blocks - tail_blocks, y, num_blocks, y + h); | |
| 432 | |||
| 433 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 902 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
902 | for (int i = 0; memcpy_out && i < p->planes_out; i++) { |
| 434 | ✗ | copy_lines(exec.out[i], exec.out_stride[i], | |
| 435 | ✗ | tail.out[i], tail.out_stride[i], h, p->tail_size_out); | |
| 436 | } | ||
| 437 | } | ||
| 438 | |||
| 439 | 28560 | static int rw_planes(const SwsOp *op) | |
| 440 | { | ||
| 441 |
2/2✓ Branch 0 taken 12376 times.
✓ Branch 1 taken 16184 times.
|
28560 | return op->rw.packed ? 1 : op->rw.elems; |
| 442 | } | ||
| 443 | |||
| 444 | 28560 | static int rw_pixel_bits(const SwsOp *op) | |
| 445 | { | ||
| 446 |
2/2✓ Branch 0 taken 12376 times.
✓ Branch 1 taken 16184 times.
|
28560 | const int elems = op->rw.packed ? op->rw.elems : 1; |
| 447 | 28560 | const int size = ff_sws_pixel_type_size(op->type); | |
| 448 | 28560 | const int bits = 8 >> op->rw.frac; | |
| 449 | av_assert1(bits >= 1); | ||
| 450 | 28560 | return elems * size * bits; | |
| 451 | } | ||
| 452 | |||
| 453 | 28560 | static void align_pass(SwsPass *pass, int block_size, int over_rw, int pixel_bits) | |
| 454 | { | ||
| 455 |
2/2✓ Branch 0 taken 14280 times.
✓ Branch 1 taken 14280 times.
|
28560 | if (!pass) |
| 456 | 14280 | return; | |
| 457 | |||
| 458 | /* Add at least as many pixels as needed to cover the padding requirement */ | ||
| 459 | 14280 | const int pad = (over_rw * 8 + pixel_bits - 1) / pixel_bits; | |
| 460 | |||
| 461 | 14280 | SwsPassBuffer *buf = pass->output; | |
| 462 | 14280 | buf->width_align = FFMAX(buf->width_align, block_size); | |
| 463 | 14280 | buf->width_pad = FFMAX(buf->width_pad, pad); | |
| 464 | } | ||
| 465 | |||
| 466 | 14280 | static int compile(SwsGraph *graph, const SwsOpList *ops, SwsPass *input, | |
| 467 | SwsPass **output) | ||
| 468 | { | ||
| 469 | 14280 | SwsContext *ctx = graph->ctx; | |
| 470 | 14280 | SwsOpPass *p = av_mallocz(sizeof(*p)); | |
| 471 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14280 times.
|
14280 | if (!p) |
| 472 | ✗ | return AVERROR(ENOMEM); | |
| 473 | |||
| 474 | 14280 | int ret = ff_sws_ops_compile(ctx, ops, &p->comp); | |
| 475 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14280 times.
|
14280 | if (ret < 0) |
| 476 | ✗ | goto fail; | |
| 477 | |||
| 478 | 14280 | const SwsCompiledOp *comp = &p->comp; | |
| 479 | 14280 | const SwsFormat *dst = &ops->dst; | |
| 480 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14280 times.
|
14280 | if (p->comp.opaque) { |
| 481 | ✗ | SwsCompiledOp c = *comp; | |
| 482 | ✗ | av_free(p); | |
| 483 | ✗ | return ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height, | |
| 484 | input, c.slice_align, c.func_opaque, | ||
| 485 | NULL, c.priv, c.free, output); | ||
| 486 | } | ||
| 487 | |||
| 488 | 14280 | const SwsOp *read = ff_sws_op_list_input(ops); | |
| 489 | 14280 | const SwsOp *write = ff_sws_op_list_output(ops); | |
| 490 | 14280 | p->planes_in = rw_planes(read); | |
| 491 | 14280 | p->planes_out = rw_planes(write); | |
| 492 | 14280 | p->pixel_bits_in = rw_pixel_bits(read); | |
| 493 | 14280 | p->pixel_bits_out = rw_pixel_bits(write); | |
| 494 | 14280 | p->exec_base = (SwsOpExec) { | |
| 495 | 14280 | .width = dst->width, | |
| 496 | 14280 | .height = dst->height, | |
| 497 | }; | ||
| 498 | |||
| 499 | 14280 | const int64_t block_bits_in = (int64_t) comp->block_size * p->pixel_bits_in; | |
| 500 | 14280 | const int64_t block_bits_out = (int64_t) comp->block_size * p->pixel_bits_out; | |
| 501 |
2/4✓ Branch 0 taken 14280 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 14280 times.
|
14280 | if (block_bits_in & 0x7 || block_bits_out & 0x7) { |
| 502 | ✗ | av_log(ctx, AV_LOG_ERROR, "Block size must be a multiple of the pixel size.\n"); | |
| 503 | ✗ | ret = AVERROR(EINVAL); | |
| 504 | ✗ | goto fail; | |
| 505 | } | ||
| 506 | |||
| 507 | 14280 | p->exec_base.block_size_in = block_bits_in >> 3; | |
| 508 | 14280 | p->exec_base.block_size_out = block_bits_out >> 3; | |
| 509 | |||
| 510 |
2/2✓ Branch 0 taken 57120 times.
✓ Branch 1 taken 14280 times.
|
71400 | for (int i = 0; i < 4; i++) { |
| 511 |
2/2✓ Branch 0 taken 26774 times.
✓ Branch 1 taken 30346 times.
|
57120 | p->idx_in[i] = i < p->planes_in ? ops->plane_src[i] : -1; |
| 512 |
2/2✓ Branch 0 taken 29274 times.
✓ Branch 1 taken 27846 times.
|
57120 | p->idx_out[i] = i < p->planes_out ? ops->plane_dst[i] : -1; |
| 513 | } | ||
| 514 | |||
| 515 | 14280 | const SwsFilterWeights *filter = read->rw.kernel; | |
| 516 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14280 times.
|
14280 | if (read->rw.filter == SWS_OP_FILTER_V) { |
| 517 | ✗ | p->offsets_y = av_refstruct_ref(filter->offsets); | |
| 518 | |||
| 519 | /* Compute relative pointer bumps for each output line */ | ||
| 520 | ✗ | int32_t *bump = av_malloc_array(filter->dst_size, sizeof(*bump)); | |
| 521 | ✗ | if (!bump) { | |
| 522 | ✗ | ret = AVERROR(ENOMEM); | |
| 523 | ✗ | goto fail; | |
| 524 | } | ||
| 525 | |||
| 526 | ✗ | int line = filter->offsets[0]; | |
| 527 | ✗ | for (int y = 0; y < filter->dst_size - 1; y++) { | |
| 528 | ✗ | int next = filter->offsets[y + 1]; | |
| 529 | ✗ | bump[y] = next - line - 1; | |
| 530 | ✗ | line = next; | |
| 531 | } | ||
| 532 | ✗ | bump[filter->dst_size - 1] = 0; | |
| 533 | ✗ | p->exec_base.in_bump_y = bump; | |
| 534 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14280 times.
|
14280 | } else if (read->rw.filter == SWS_OP_FILTER_H) { |
| 535 | /* Compute pixel offset map for each output line */ | ||
| 536 | ✗ | const int pixels = FFALIGN(filter->dst_size, p->comp.block_size); | |
| 537 | ✗ | int32_t *offset = av_malloc_array(pixels, sizeof(*offset)); | |
| 538 | ✗ | if (!offset) { | |
| 539 | ✗ | ret = AVERROR(ENOMEM); | |
| 540 | ✗ | goto fail; | |
| 541 | } | ||
| 542 | |||
| 543 | ✗ | for (int x = 0; x < filter->dst_size; x++) { | |
| 544 | /* Sanity check; if the tap would land on a half-pixel, we cannot | ||
| 545 | * reasonably expect the implementation to know about this. Just | ||
| 546 | * error out in such (theoretical) cases. */ | ||
| 547 | ✗ | int64_t bits = (int64_t) filter->offsets[x] * p->pixel_bits_in; | |
| 548 | ✗ | if ((bits & 0x7) || (bits >> 3) > INT32_MAX) { | |
| 549 | ✗ | ret = AVERROR(EINVAL); | |
| 550 | ✗ | goto fail; | |
| 551 | } | ||
| 552 | ✗ | offset[x] = bits >> 3; | |
| 553 | } | ||
| 554 | ✗ | for (int x = filter->dst_size; x < pixels; x++) | |
| 555 | ✗ | offset[x] = offset[filter->dst_size - 1]; | |
| 556 | ✗ | p->exec_base.in_offset_x = offset; | |
| 557 | ✗ | p->exec_base.block_size_in = 0; /* ptr does not advance */ | |
| 558 | ✗ | p->filter_size = filter->filter_size; | |
| 559 | } | ||
| 560 | |||
| 561 | 14280 | ret = ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height, | |
| 562 | 14280 | input, comp->slice_align, op_pass_run, | |
| 563 | op_pass_setup, p, op_pass_free, output); | ||
| 564 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14280 times.
|
14280 | if (ret < 0) |
| 565 | ✗ | return ret; | |
| 566 | |||
| 567 | 14280 | align_pass(input, comp->block_size, comp->over_read, p->pixel_bits_in); | |
| 568 | 14280 | align_pass(*output, comp->block_size, comp->over_write, p->pixel_bits_out); | |
| 569 | 14280 | return 0; | |
| 570 | |||
| 571 | ✗ | fail: | |
| 572 | ✗ | op_pass_free(p); | |
| 573 | ✗ | return ret; | |
| 574 | } | ||
| 575 | |||
| 576 | 14280 | int ff_sws_compile_pass(SwsGraph *graph, SwsOpList **pops, int flags, | |
| 577 | SwsPass *input, SwsPass **output) | ||
| 578 | { | ||
| 579 | 14280 | const int passes_orig = graph->num_passes; | |
| 580 | 14280 | SwsContext *ctx = graph->ctx; | |
| 581 | 14280 | SwsOpList *ops = *pops; | |
| 582 | 14280 | int ret = 0; | |
| 583 | |||
| 584 | /* Check if the whole operation graph is an end-to-end no-op */ | ||
| 585 |
1/2✗ Branch 1 not taken.
✓ Branch 2 taken 14280 times.
|
14280 | if (ff_sws_op_list_is_noop(ops)) { |
| 586 | ✗ | *output = input; | |
| 587 | ✗ | goto out; | |
| 588 | } | ||
| 589 | |||
| 590 | 14280 | const SwsOp *read = ff_sws_op_list_input(ops); | |
| 591 | 14280 | const SwsOp *write = ff_sws_op_list_output(ops); | |
| 592 |
2/4✓ Branch 0 taken 14280 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 14280 times.
|
14280 | if (!read || !write) { |
| 593 | ✗ | av_log(ctx, AV_LOG_ERROR, "First and last operations must be a read " | |
| 594 | "and write, respectively.\n"); | ||
| 595 | ✗ | ret = AVERROR(EINVAL); | |
| 596 | ✗ | goto out; | |
| 597 | } | ||
| 598 | |||
| 599 |
1/2✓ Branch 0 taken 14280 times.
✗ Branch 1 not taken.
|
14280 | if (flags & SWS_OP_FLAG_OPTIMIZE) { |
| 600 | 14280 | ret = ff_sws_op_list_optimize(ops); | |
| 601 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14280 times.
|
14280 | if (ret < 0) |
| 602 | ✗ | goto out; | |
| 603 | 14280 | av_log(ctx, AV_LOG_DEBUG, "Operation list after optimizing:\n"); | |
| 604 | 14280 | ff_sws_op_list_print(ctx, AV_LOG_DEBUG, AV_LOG_TRACE, ops); | |
| 605 | } | ||
| 606 | |||
| 607 | 14280 | ret = compile(graph, ops, input, output); | |
| 608 |
1/2✓ Branch 0 taken 14280 times.
✗ Branch 1 not taken.
|
14280 | if (ret != AVERROR(ENOTSUP)) |
| 609 | 14280 | goto out; | |
| 610 | |||
| 611 | ✗ | av_log(ctx, AV_LOG_DEBUG, "Retrying with separated filter passes.\n"); | |
| 612 | ✗ | SwsPass *prev = input; | |
| 613 | ✗ | while (ops) { | |
| 614 | SwsOpList *rest; | ||
| 615 | ✗ | ret = ff_sws_op_list_subpass(ops, &rest); | |
| 616 | ✗ | if (ret < 0) | |
| 617 | ✗ | goto out; | |
| 618 | |||
| 619 | ✗ | if (prev == input && !rest) { | |
| 620 | /* No point in compiling an unsplit pass again */ | ||
| 621 | ✗ | ret = AVERROR(ENOTSUP); | |
| 622 | ✗ | goto out; | |
| 623 | } | ||
| 624 | |||
| 625 | ✗ | ret = compile(graph, ops, prev, &prev); | |
| 626 | ✗ | if (ret < 0) { | |
| 627 | ✗ | ff_sws_op_list_free(&rest); | |
| 628 | ✗ | goto out; | |
| 629 | } | ||
| 630 | |||
| 631 | ✗ | ff_sws_op_list_free(&ops); | |
| 632 | ✗ | ops = rest; | |
| 633 | } | ||
| 634 | |||
| 635 | /* Return last subpass successfully compiled */ | ||
| 636 | ✗ | av_log(ctx, AV_LOG_VERBOSE, "Using %d separate passes.\n", | |
| 637 | ✗ | graph->num_passes - passes_orig); | |
| 638 | ✗ | *output = prev; | |
| 639 | |||
| 640 | 14280 | out: | |
| 641 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14280 times.
|
14280 | if (ret == AVERROR(ENOTSUP)) { |
| 642 | ✗ | av_log(ctx, AV_LOG_WARNING, "No backend found for operations:\n"); | |
| 643 | ✗ | ff_sws_op_list_print(ctx, AV_LOG_WARNING, AV_LOG_TRACE, ops); | |
| 644 | } | ||
| 645 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 14280 times.
|
14280 | if (ret < 0) |
| 646 | ✗ | ff_sws_graph_rollback(graph, passes_orig); | |
| 647 | 14280 | ff_sws_op_list_free(&ops); | |
| 648 | 14280 | *pops = NULL; | |
| 649 | 14280 | return ret; | |
| 650 | } | ||
| 651 |