/*
 * Copyright (c) 2019 Guo Yejun
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * Implement a generic image processing filter using deep learning networks.
 */

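/*
 * Illustrative usage (the model file and the tensor names below are
 * placeholders; adapt them to the actual model, and see doc/filters.texi
 * for the full option documentation):
 *     ffmpeg -i input.jpg -vf format=grayf32,dnn_processing=dnn_backend=tensorflow:model=halve_gray_float.model:input=dnn_in:output=dnn_out output.jpg
 */
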
#include "libavutil/opt.h"
#include "libavutil/pixdesc.h"
#include "libavutil/avassert.h"
#include "libavutil/imgutils.h"
#include "filters.h"
#include "dnn_filter_common.h"
#include "internal.h"
#include "video.h"
#include "libswscale/swscale.h"
#include "libavutil/time.h"

typedef struct DnnProcessingContext {
    const AVClass *class;
    DnnContext dnnctx;
    struct SwsContext *sws_uv_scale;
    int sws_uv_height;
} DnnProcessingContext;

#define OFFSET(x) offsetof(DnnProcessingContext, dnnctx.x)
#define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
static const AVOption dnn_processing_options[] = {
    { "dnn_backend", "DNN backend", OFFSET(backend_type), AV_OPT_TYPE_INT, { .i64 = DNN_TF }, INT_MIN, INT_MAX, FLAGS, .unit = "backend" },
#if (CONFIG_LIBTENSORFLOW == 1)
    { "tensorflow", "tensorflow backend flag", 0, AV_OPT_TYPE_CONST, { .i64 = DNN_TF }, 0, 0, FLAGS, .unit = "backend" },
#endif
#if (CONFIG_LIBOPENVINO == 1)
    { "openvino", "openvino backend flag", 0, AV_OPT_TYPE_CONST, { .i64 = DNN_OV }, 0, 0, FLAGS, .unit = "backend" },
#endif
#if (CONFIG_LIBTORCH == 1)
    { "torch", "torch backend flag", 0, AV_OPT_TYPE_CONST, { .i64 = DNN_TH }, 0, 0, FLAGS, .unit = "backend" },
#endif
    DNN_COMMON_OPTIONS
    { NULL }
};

AVFILTER_DEFINE_CLASS(dnn_processing);

static av_cold int init(AVFilterContext *context)
{
    DnnProcessingContext *ctx = context->priv;
    return ff_dnn_init(&ctx->dnnctx, DFT_PROCESS_FRAME, context);
}

static const enum AVPixelFormat pix_fmts[] = {
    AV_PIX_FMT_RGB24, AV_PIX_FMT_BGR24,
    AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAYF32,
    AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
    AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
    AV_PIX_FMT_NV12,
    AV_PIX_FMT_NONE
};

#define LOG_FORMAT_CHANNEL_MISMATCH()                                   \
    av_log(ctx, AV_LOG_ERROR,                                           \
           "the frame's format %s does not match "                      \
           "the model input channel %d\n",                              \
           av_get_pix_fmt_name(fmt),                                    \
           model_input->dims[dnn_get_channel_idx_by_layout(model_input->layout)]);

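/**
 * Check that the incoming link is compatible with the model input:
 * fixed model dimensions (any value other than -1) must match the link
 * size, the data type must be DNN_FLOAT, and the channel count must
 * fit the pixel format: 3 for packed RGB/BGR, 1 for gray and for
 * planar YUV/NV12, where only the Y plane is fed to the network.
 */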
static int check_modelinput_inlink(const DNNData *model_input, const AVFilterLink *inlink)
{
    AVFilterContext *ctx = inlink->dst;
    enum AVPixelFormat fmt = inlink->format;
    int width_idx, height_idx;

    width_idx = dnn_get_width_idx_by_layout(model_input->layout);
    height_idx = dnn_get_height_idx_by_layout(model_input->layout);
    // the design is to add an explicit scale filter before this filter
    if (model_input->dims[height_idx] != -1 &&
        model_input->dims[height_idx] != inlink->h) {
        av_log(ctx, AV_LOG_ERROR, "the model requires frame height %d but got %d\n",
               model_input->dims[height_idx],
               inlink->h);
        return AVERROR(EIO);
    }
    if (model_input->dims[width_idx] != -1 &&
        model_input->dims[width_idx] != inlink->w) {
        av_log(ctx, AV_LOG_ERROR, "the model requires frame width %d but got %d\n",
               model_input->dims[width_idx],
               inlink->w);
        return AVERROR(EIO);
    }
    if (model_input->dt != DNN_FLOAT) {
        avpriv_report_missing_feature(ctx, "data type other than DNN_FLOAT");
        return AVERROR(EIO);
    }

    switch (fmt) {
    case AV_PIX_FMT_RGB24:
    case AV_PIX_FMT_BGR24:
        if (model_input->dims[dnn_get_channel_idx_by_layout(model_input->layout)] != 3) {
            LOG_FORMAT_CHANNEL_MISMATCH();
            return AVERROR(EIO);
        }
        return 0;
    case AV_PIX_FMT_GRAY8:
    case AV_PIX_FMT_GRAYF32:
    case AV_PIX_FMT_YUV420P:
    case AV_PIX_FMT_YUV422P:
    case AV_PIX_FMT_YUV444P:
    case AV_PIX_FMT_YUV410P:
    case AV_PIX_FMT_YUV411P:
    case AV_PIX_FMT_NV12:
        if (model_input->dims[dnn_get_channel_idx_by_layout(model_input->layout)] != 1) {
            LOG_FORMAT_CHANNEL_MISMATCH();
            return AVERROR(EIO);
        }
        return 0;
    default:
        avpriv_report_missing_feature(ctx, "%s", av_get_pix_fmt_name(fmt));
        return AVERROR(EIO);
    }

    return 0;
}

static int config_input(AVFilterLink *inlink)
{
    AVFilterContext *context = inlink->dst;
    DnnProcessingContext *ctx = context->priv;
    int result;
    DNNData model_input;
    int check;

    result = ff_dnn_get_input(&ctx->dnnctx, &model_input);
    if (result != 0) {
        av_log(ctx, AV_LOG_ERROR, "could not get input from the model\n");
        return result;
    }

    check = check_modelinput_inlink(&model_input, inlink);
    if (check != 0) {
        return check;
    }

    return 0;
}

static av_always_inline int isPlanarYUV(enum AVPixelFormat pix_fmt)
{
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
    av_assert0(desc);
    return !(desc->flags & AV_PIX_FMT_FLAG_RGB) && desc->nb_components == 3;
}

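/**
 * For planar YUV input only the Y plane goes through the network, so
 * when the model changes the frame size the chroma planes have to be
 * rescaled separately. NV12 has interleaved UV, which is scaled as a
 * single two-component YA8 plane; the other formats scale U and V as
 * independent GRAY8 planes.
 */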
static int prepare_uv_scale(AVFilterLink *outlink)
{
    AVFilterContext *context = outlink->src;
    DnnProcessingContext *ctx = context->priv;
    AVFilterLink *inlink = context->inputs[0];
    enum AVPixelFormat fmt = inlink->format;

    if (isPlanarYUV(fmt)) {
        if (inlink->w != outlink->w || inlink->h != outlink->h) {
            if (fmt == AV_PIX_FMT_NV12) {
                ctx->sws_uv_scale = sws_getContext(inlink->w >> 1, inlink->h >> 1, AV_PIX_FMT_YA8,
                                                   outlink->w >> 1, outlink->h >> 1, AV_PIX_FMT_YA8,
                                                   SWS_BICUBIC, NULL, NULL, NULL);
                ctx->sws_uv_height = inlink->h >> 1;
            } else {
                const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt);
                int sws_src_h = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);
                int sws_src_w = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w);
                int sws_dst_h = AV_CEIL_RSHIFT(outlink->h, desc->log2_chroma_h);
                int sws_dst_w = AV_CEIL_RSHIFT(outlink->w, desc->log2_chroma_w);
                ctx->sws_uv_scale = sws_getContext(sws_src_w, sws_src_h, AV_PIX_FMT_GRAY8,
                                                   sws_dst_w, sws_dst_h, AV_PIX_FMT_GRAY8,
                                                   SWS_BICUBIC, NULL, NULL, NULL);
                ctx->sws_uv_height = sws_src_h;
            }
        }
    }

    return 0;
}

static int config_output(AVFilterLink *outlink)
{
    AVFilterContext *context = outlink->src;
    DnnProcessingContext *ctx = context->priv;
    int result;
    AVFilterLink *inlink = context->inputs[0];

    // do a trial run in case the dnn model resizes the frame
    result = ff_dnn_get_output(&ctx->dnnctx, inlink->w, inlink->h, &outlink->w, &outlink->h);
    if (result != 0) {
        av_log(ctx, AV_LOG_ERROR, "could not get output from the model\n");
        return result;
    }

    prepare_uv_scale(outlink);

    return 0;
}

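/**
 * Fill the chroma planes of the output frame from the input frame:
 * copy them unchanged when the frame size did not change, otherwise
 * rescale them with the swscale context set up in prepare_uv_scale().
 */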
static int copy_uv_planes(DnnProcessingContext *ctx, AVFrame *out, const AVFrame *in)
{
    const AVPixFmtDescriptor *desc;
    int uv_height;

    if (!ctx->sws_uv_scale) {
        av_assert0(in->height == out->height && in->width == out->width);
        desc = av_pix_fmt_desc_get(in->format);
        uv_height = AV_CEIL_RSHIFT(in->height, desc->log2_chroma_h);
        for (int i = 1; i < 3; ++i) {
            int bytewidth = av_image_get_linesize(in->format, in->width, i);
            if (bytewidth < 0) {
                return AVERROR(EINVAL);
            }
            av_image_copy_plane(out->data[i], out->linesize[i],
                                in->data[i], in->linesize[i],
                                bytewidth, uv_height);
        }
    } else if (in->format == AV_PIX_FMT_NV12) {
        sws_scale(ctx->sws_uv_scale, (const uint8_t **)(in->data + 1), in->linesize + 1,
                  0, ctx->sws_uv_height, out->data + 1, out->linesize + 1);
    } else {
        sws_scale(ctx->sws_uv_scale, (const uint8_t **)(in->data + 1), in->linesize + 1,
                  0, ctx->sws_uv_height, out->data + 1, out->linesize + 1);
        sws_scale(ctx->sws_uv_scale, (const uint8_t **)(in->data + 2), in->linesize + 2,
                  0, ctx->sws_uv_height, out->data + 2, out->linesize + 2);
    }

    return 0;
}

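/**
 * Flush the DNN backend at EOF and keep polling until every in-flight
 * asynchronous inference result has been retrieved and pushed
 * downstream; the pts for the EOF status is reported via out_pts.
 */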
static int flush_frame(AVFilterLink *outlink, int64_t pts, int64_t *out_pts)
{
    DnnProcessingContext *ctx = outlink->src->priv;
    int ret;
    DNNAsyncStatusType async_state;

    ret = ff_dnn_flush(&ctx->dnnctx);
    if (ret != 0) {
        return -1;
    }

    do {
        AVFrame *in_frame = NULL;
        AVFrame *out_frame = NULL;
        async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame);
        if (out_frame) {
            if (isPlanarYUV(in_frame->format))
                copy_uv_planes(ctx, out_frame, in_frame);
            av_frame_free(&in_frame);
            // read the pts before ff_filter_frame() takes ownership of out_frame
            if (out_pts)
                *out_pts = out_frame->pts + pts;
            ret = ff_filter_frame(outlink, out_frame);
            if (ret < 0)
                return ret;
        }
        av_usleep(5000);
    } while (async_state >= DAST_NOT_READY);

    return 0;
}

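/**
 * Filter activation: submit every queued input frame to the backend,
 * forward all completed inference results to the output link, and on
 * EOF flush the backend before propagating the status.
 */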
static int activate(AVFilterContext *filter_ctx)
{
    AVFilterLink *inlink = filter_ctx->inputs[0];
    AVFilterLink *outlink = filter_ctx->outputs[0];
    DnnProcessingContext *ctx = filter_ctx->priv;
    AVFrame *in = NULL, *out = NULL;
    int64_t pts;
    int ret, status;
    int got_frame = 0;
    int async_state;

    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);

    do {
        // drain all input frames
        ret = ff_inlink_consume_frame(inlink, &in);
        if (ret < 0)
            return ret;
        if (ret > 0) {
            out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
            if (!out) {
                av_frame_free(&in);
                return AVERROR(ENOMEM);
            }
            av_frame_copy_props(out, in);
            if (ff_dnn_execute_model(&ctx->dnnctx, in, out) != 0) {
                return AVERROR(EIO);
            }
        }
    } while (ret > 0);

    // drain all processed frames
    do {
        AVFrame *in_frame = NULL;
        AVFrame *out_frame = NULL;
        async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame);
        if (out_frame) {
            if (isPlanarYUV(in_frame->format))
                copy_uv_planes(ctx, out_frame, in_frame);
            av_frame_free(&in_frame);
            ret = ff_filter_frame(outlink, out_frame);
            if (ret < 0)
                return ret;
            got_frame = 1;
        }
    } while (async_state == DAST_SUCCESS);

    // if any frame was sent downstream, we are done for this activation
    if (got_frame)
        return 0;

    if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
        if (status == AVERROR_EOF) {
            int64_t out_pts = pts;
            ret = flush_frame(outlink, pts, &out_pts);
            ff_outlink_set_status(outlink, status, out_pts);
            return ret;
        }
    }

    FF_FILTER_FORWARD_WANTED(outlink, inlink);

    return 0;
}

static av_cold void uninit(AVFilterContext *ctx)
{
    DnnProcessingContext *context = ctx->priv;

    sws_freeContext(context->sws_uv_scale);
    ff_dnn_uninit(&context->dnnctx);
}

static const AVFilterPad dnn_processing_inputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_VIDEO,
        .config_props = config_input,
    },
};

static const AVFilterPad dnn_processing_outputs[] = {
    {
        .name          = "default",
        .type          = AVMEDIA_TYPE_VIDEO,
        .config_props  = config_output,
    },
};

const AVFilter ff_vf_dnn_processing = {
    .name          = "dnn_processing",
    .description   = NULL_IF_CONFIG_SMALL("Apply DNN processing filter to the input."),
    .priv_size     = sizeof(DnnProcessingContext),
    .init          = init,
    .uninit        = uninit,
    FILTER_INPUTS(dnn_processing_inputs),
    FILTER_OUTPUTS(dnn_processing_outputs),
    FILTER_PIXFMTS_ARRAY(pix_fmts),
    .priv_class    = &dnn_processing_class,
    .activate      = activate,
};