FFmpeg coverage


Directory: ../../../ffmpeg/
File: src/libavfilter/vf_dnn_detect.c
Date: 2024-04-23 16:28:37
Exec Total Coverage
Lines: 0 478 0.0%
Functions: 0 21 0.0%
Branches: 0 241 0.0%

Line Branch Exec Source
1 /*
2 * This file is part of FFmpeg.
3 *
4 * FFmpeg is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
8 *
9 * FFmpeg is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with FFmpeg; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18
19 /**
20 * @file
21 * implementing an object detecting filter using deep learning networks.
22 */
23
24 #include "libavutil/file_open.h"
25 #include "libavutil/mem.h"
26 #include "libavutil/opt.h"
27 #include "filters.h"
28 #include "dnn_filter_common.h"
29 #include "internal.h"
30 #include "video.h"
31 #include "libavutil/time.h"
32 #include "libavutil/avstring.h"
33 #include "libavutil/detection_bbox.h"
34 #include "libavutil/fifo.h"
35
/**
 * Families of detection models whose output layout this filter can decode.
 * Selected through the "model_type" option.
 */
typedef enum {
    DDMT_SSD      = 0,  /* "ssd" option value */
    DDMT_YOLOV1V2 = 1,  /* "yolo" option value */
    DDMT_YOLOV3   = 2,  /* "yolov3" option value */
    DDMT_YOLOV4   = 3,  /* "yolov4" option value */
} DNNDetectionModelType;
42
43 typedef struct DnnDetectContext {
44 const AVClass *class;
45 DnnContext dnnctx;
46 float confidence;
47 char *labels_filename;
48 char **labels;
49 int label_count;
50 DNNDetectionModelType model_type;
51 int cell_w;
52 int cell_h;
53 int nb_classes;
54 AVFifo *bboxes_fifo;
55 int scale_width;
56 int scale_height;
57 char *anchors_str;
58 float *anchors;
59 int nb_anchor;
60 } DnnDetectContext;
61
62 #define OFFSET(x) offsetof(DnnDetectContext, dnnctx.x)
63 #define OFFSET2(x) offsetof(DnnDetectContext, x)
64 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
65 static const AVOption dnn_detect_options[] = {
66 { "dnn_backend", "DNN backend", OFFSET(backend_type), AV_OPT_TYPE_INT, { .i64 = DNN_OV }, INT_MIN, INT_MAX, FLAGS, .unit = "backend" },
67 #if (CONFIG_LIBTENSORFLOW == 1)
68 { "tensorflow", "tensorflow backend flag", 0, AV_OPT_TYPE_CONST, { .i64 = DNN_TF }, 0, 0, FLAGS, .unit = "backend" },
69 #endif
70 #if (CONFIG_LIBOPENVINO == 1)
71 { "openvino", "openvino backend flag", 0, AV_OPT_TYPE_CONST, { .i64 = DNN_OV }, 0, 0, FLAGS, .unit = "backend" },
72 #endif
73 DNN_COMMON_OPTIONS
74 { "confidence", "threshold of confidence", OFFSET2(confidence), AV_OPT_TYPE_FLOAT, { .dbl = 0.5 }, 0, 1, FLAGS},
75 { "labels", "path to labels file", OFFSET2(labels_filename), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS },
76 { "model_type", "DNN detection model type", OFFSET2(model_type), AV_OPT_TYPE_INT, { .i64 = DDMT_SSD }, INT_MIN, INT_MAX, FLAGS, .unit = "model_type" },
77 { "ssd", "output shape [1, 1, N, 7]", 0, AV_OPT_TYPE_CONST, { .i64 = DDMT_SSD }, 0, 0, FLAGS, .unit = "model_type" },
78 { "yolo", "output shape [1, N*Cx*Cy*DetectionBox]", 0, AV_OPT_TYPE_CONST, { .i64 = DDMT_YOLOV1V2 }, 0, 0, FLAGS, .unit = "model_type" },
79 { "yolov3", "outputs shape [1, N*D, Cx, Cy]", 0, AV_OPT_TYPE_CONST, { .i64 = DDMT_YOLOV3 }, 0, 0, FLAGS, .unit = "model_type" },
80 { "yolov4", "outputs shape [1, N*D, Cx, Cy]", 0, AV_OPT_TYPE_CONST, { .i64 = DDMT_YOLOV4 }, 0, 0, FLAGS, .unit = "model_type" },
81 { "cell_w", "cell width", OFFSET2(cell_w), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INTMAX_MAX, FLAGS },
82 { "cell_h", "cell height", OFFSET2(cell_h), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INTMAX_MAX, FLAGS },
83 { "nb_classes", "The number of class", OFFSET2(nb_classes), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INTMAX_MAX, FLAGS },
84 { "anchors", "anchors, splited by '&'", OFFSET2(anchors_str), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS },
85 { NULL }
86 };
87
88 AVFILTER_DEFINE_CLASS(dnn_detect);
89
90 static inline float sigmoid(float x) {
91 return 1.f / (1.f + exp(-x));
92 }
93
/**
 * Identity activation: hands the raw network value back untouched.
 * Used as the default post-processing function for YOLO outputs.
 */
static inline float linear(float x)
{
    const float unchanged = x;

    return unchanged;
}
97
/**
 * Return the index of the class with the highest score (arg-max).
 *
 * @param nb_classes number of class scores to inspect
 * @param cell_size  stride, in floats, between two consecutive class scores
 * @param label_data pointer to the score of class 0
 * @return index of the first maximal score, or 0 when nb_classes <= 0
 *
 * The running maximum starts at -infinity rather than 0 so that the result is
 * a true arg-max even when every score is zero or negative, which happens
 * when the scores are raw logits that are only squashed afterwards.
 */
static int dnn_detect_get_label_id(int nb_classes, int cell_size, float *label_data)
{
    float max_prob = -INFINITY;
    int label_id = 0;

    for (int i = 0; i < nb_classes; i++) {
        const float prob = label_data[i * cell_size];

        if (prob > max_prob) {
            max_prob = prob;
            label_id = i;
        }
    }
    return label_id;
}
110
/**
 * Split an '&'-separated list of numbers into a freshly allocated float array.
 *
 * @param anchors_str list to parse; it is tokenized in place by av_strtok()
 * @param anchors     receives the allocated array on success and is left
 *                    untouched on failure
 * @return number of parsed values, or 0 if the allocation fails or fewer
 *         tokens than expected are found
 */
static int dnn_detect_parse_anchors(char *anchors_str, float **anchors)
{
    char *tok_state = NULL;
    char *cursor = anchors_str;
    float *values;
    int count = 1; /* N separators delimit N + 1 values */

    for (const char *p = anchors_str; *p != '\0'; p++) {
        if (*p == '&')
            count++;
    }

    values = av_mallocz(count * sizeof(**anchors));
    if (!values)
        return 0;

    for (int idx = 0; idx < count; idx++) {
        char *tok = av_strtok(cursor, "&", &tok_state);

        if (!tok) {
            av_freep(&values);
            return 0;
        }
        values[idx] = strtof(tok, NULL);
        cursor = NULL; /* subsequent calls continue from tok_state */
    }

    *anchors = values;
    return count;
}
138
139 /* Calculate Intersection Over Union */
140 static float dnn_detect_IOU(AVDetectionBBox *bbox1, AVDetectionBBox *bbox2)
141 {
142 float overlapping_width = FFMIN(bbox1->x + bbox1->w, bbox2->x + bbox2->w) - FFMAX(bbox1->x, bbox2->x);
143 float overlapping_height = FFMIN(bbox1->y + bbox1->h, bbox2->y + bbox2->h) - FFMAX(bbox1->y, bbox2->y);
144 float intersection_area =
145 (overlapping_width < 0 || overlapping_height < 0) ? 0 : overlapping_height * overlapping_width;
146 float union_area = bbox1->w * bbox1->h + bbox2->w * bbox2->h - intersection_area;
147 return intersection_area / union_area;
148 }
149
150 static int dnn_detect_parse_yolo_output(AVFrame *frame, DNNData *output, int output_index,
151 AVFilterContext *filter_ctx)
152 {
153 DnnDetectContext *ctx = filter_ctx->priv;
154 float conf_threshold = ctx->confidence;
155 int detection_boxes, box_size;
156 int cell_w = 0, cell_h = 0, scale_w = 0, scale_h = 0;
157 int nb_classes = ctx->nb_classes;
158 float *output_data = output[output_index].data;
159 float *anchors = ctx->anchors;
160 AVDetectionBBox *bbox;
161 float (*post_process_raw_data)(float x) = linear;
162 int is_NHWC = 0;
163
164 if (ctx->model_type == DDMT_YOLOV1V2) {
165 cell_w = ctx->cell_w;
166 cell_h = ctx->cell_h;
167 scale_w = cell_w;
168 scale_h = cell_h;
169 } else {
170 if (output[output_index].dims[2] != output[output_index].dims[3] &&
171 output[output_index].dims[2] == output[output_index].dims[1]) {
172 is_NHWC = 1;
173 cell_w = output[output_index].dims[2];
174 cell_h = output[output_index].dims[1];
175 } else {
176 cell_w = output[output_index].dims[3];
177 cell_h = output[output_index].dims[2];
178 }
179 scale_w = ctx->scale_width;
180 scale_h = ctx->scale_height;
181 }
182 box_size = nb_classes + 5;
183
184 switch (ctx->model_type) {
185 case DDMT_YOLOV1V2:
186 case DDMT_YOLOV3:
187 post_process_raw_data = linear;
188 break;
189 case DDMT_YOLOV4:
190 post_process_raw_data = sigmoid;
191 break;
192 }
193
194 if (!cell_h || !cell_w) {
195 av_log(filter_ctx, AV_LOG_ERROR, "cell_w and cell_h are detected\n");
196 return AVERROR(EINVAL);
197 }
198
199 if (!nb_classes) {
200 av_log(filter_ctx, AV_LOG_ERROR, "nb_classes is not set\n");
201 return AVERROR(EINVAL);
202 }
203
204 if (!anchors) {
205 av_log(filter_ctx, AV_LOG_ERROR, "anchors is not set\n");
206 return AVERROR(EINVAL);
207 }
208
209 if (output[output_index].dims[1] * output[output_index].dims[2] *
210 output[output_index].dims[3] % (box_size * cell_w * cell_h)) {
211 av_log(filter_ctx, AV_LOG_ERROR, "wrong cell_w, cell_h or nb_classes\n");
212 return AVERROR(EINVAL);
213 }
214 detection_boxes = output[output_index].dims[1] *
215 output[output_index].dims[2] *
216 output[output_index].dims[3] / box_size / cell_w / cell_h;
217
218 anchors = anchors + (detection_boxes * output_index * 2);
219 /**
220 * find all candidate bbox
221 * yolo output can be reshaped to [B, N*D, Cx, Cy]
222 * Detection box 'D' has format [`x`, `y`, `h`, `w`, `box_score`, `class_no_1`, ...,]
223 **/
224 for (int box_id = 0; box_id < detection_boxes; box_id++) {
225 for (int cx = 0; cx < cell_w; cx++)
226 for (int cy = 0; cy < cell_h; cy++) {
227 float x, y, w, h, conf;
228 float *detection_boxes_data;
229 int label_id;
230
231 if (is_NHWC) {
232 detection_boxes_data = output_data +
233 ((cy * cell_w + cx) * detection_boxes + box_id) * box_size;
234 conf = post_process_raw_data(detection_boxes_data[4]);
235 } else {
236 detection_boxes_data = output_data + box_id * box_size * cell_w * cell_h;
237 conf = post_process_raw_data(
238 detection_boxes_data[cy * cell_w + cx + 4 * cell_w * cell_h]);
239 }
240
241 if (is_NHWC) {
242 x = post_process_raw_data(detection_boxes_data[0]);
243 y = post_process_raw_data(detection_boxes_data[1]);
244 w = detection_boxes_data[2];
245 h = detection_boxes_data[3];
246 label_id = dnn_detect_get_label_id(ctx->nb_classes, 1, detection_boxes_data + 5);
247 conf = conf * post_process_raw_data(detection_boxes_data[label_id + 5]);
248 } else {
249 x = post_process_raw_data(detection_boxes_data[cy * cell_w + cx]);
250 y = post_process_raw_data(detection_boxes_data[cy * cell_w + cx + cell_w * cell_h]);
251 w = detection_boxes_data[cy * cell_w + cx + 2 * cell_w * cell_h];
252 h = detection_boxes_data[cy * cell_w + cx + 3 * cell_w * cell_h];
253 label_id = dnn_detect_get_label_id(ctx->nb_classes, cell_w * cell_h,
254 detection_boxes_data + cy * cell_w + cx + 5 * cell_w * cell_h);
255 conf = conf * post_process_raw_data(
256 detection_boxes_data[cy * cell_w + cx + (label_id + 5) * cell_w * cell_h]);
257 }
258 if (conf < conf_threshold) {
259 continue;
260 }
261
262 bbox = av_mallocz(sizeof(*bbox));
263 if (!bbox)
264 return AVERROR(ENOMEM);
265
266 bbox->w = exp(w) * anchors[box_id * 2] * frame->width / scale_w;
267 bbox->h = exp(h) * anchors[box_id * 2 + 1] * frame->height / scale_h;
268 bbox->x = (cx + x) / cell_w * frame->width - bbox->w / 2;
269 bbox->y = (cy + y) / cell_h * frame->height - bbox->h / 2;
270 bbox->detect_confidence = av_make_q((int)(conf * 10000), 10000);
271 if (ctx->labels && label_id < ctx->label_count) {
272 av_strlcpy(bbox->detect_label, ctx->labels[label_id], sizeof(bbox->detect_label));
273 } else {
274 snprintf(bbox->detect_label, sizeof(bbox->detect_label), "%d", label_id);
275 }
276
277 if (av_fifo_write(ctx->bboxes_fifo, &bbox, 1) < 0) {
278 av_freep(&bbox);
279 return AVERROR(ENOMEM);
280 }
281 bbox = NULL;
282 }
283 }
284 return 0;
285 }
286
287 static int dnn_detect_fill_side_data(AVFrame *frame, AVFilterContext *filter_ctx)
288 {
289 DnnDetectContext *ctx = filter_ctx->priv;
290 float conf_threshold = ctx->confidence;
291 AVDetectionBBox *bbox;
292 int nb_bboxes = 0;
293 AVDetectionBBoxHeader *header;
294 if (av_fifo_can_read(ctx->bboxes_fifo) == 0) {
295 av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n");
296 return 0;
297 }
298
299 /* remove overlap bboxes */
300 for (int i = 0; i < av_fifo_can_read(ctx->bboxes_fifo); i++){
301 av_fifo_peek(ctx->bboxes_fifo, &bbox, 1, i);
302 for (int j = 0; j < av_fifo_can_read(ctx->bboxes_fifo); j++) {
303 AVDetectionBBox *overlap_bbox;
304 av_fifo_peek(ctx->bboxes_fifo, &overlap_bbox, 1, j);
305 if (!strcmp(bbox->detect_label, overlap_bbox->detect_label) &&
306 av_cmp_q(bbox->detect_confidence, overlap_bbox->detect_confidence) < 0 &&
307 dnn_detect_IOU(bbox, overlap_bbox) >= conf_threshold) {
308 bbox->classify_count = -1; // bad result
309 nb_bboxes++;
310 break;
311 }
312 }
313 }
314 nb_bboxes = av_fifo_can_read(ctx->bboxes_fifo) - nb_bboxes;
315 header = av_detection_bbox_create_side_data(frame, nb_bboxes);
316 if (!header) {
317 av_log(filter_ctx, AV_LOG_ERROR, "failed to create side data with %d bounding boxes\n", nb_bboxes);
318 return -1;
319 }
320 av_strlcpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source));
321
322 while(av_fifo_can_read(ctx->bboxes_fifo)) {
323 AVDetectionBBox *candidate_bbox;
324 av_fifo_read(ctx->bboxes_fifo, &candidate_bbox, 1);
325
326 if (nb_bboxes > 0 && candidate_bbox->classify_count != -1) {
327 bbox = av_get_detection_bbox(header, header->nb_bboxes - nb_bboxes);
328 memcpy(bbox, candidate_bbox, sizeof(*bbox));
329 nb_bboxes--;
330 }
331 av_freep(&candidate_bbox);
332 }
333 return 0;
334 }
335
336 static int dnn_detect_post_proc_yolo(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
337 {
338 int ret = 0;
339 ret = dnn_detect_parse_yolo_output(frame, output, 0, filter_ctx);
340 if (ret < 0)
341 return ret;
342 ret = dnn_detect_fill_side_data(frame, filter_ctx);
343 if (ret < 0)
344 return ret;
345 return 0;
346 }
347
348 static int dnn_detect_post_proc_yolov3(AVFrame *frame, DNNData *output,
349 AVFilterContext *filter_ctx, int nb_outputs)
350 {
351 int ret = 0;
352 for (int i = 0; i < nb_outputs; i++) {
353 ret = dnn_detect_parse_yolo_output(frame, output, i, filter_ctx);
354 if (ret < 0)
355 return ret;
356 }
357 ret = dnn_detect_fill_side_data(frame, filter_ctx);
358 if (ret < 0)
359 return ret;
360 return 0;
361 }
362
363 static int dnn_detect_post_proc_ssd(AVFrame *frame, DNNData *output, int nb_outputs,
364 AVFilterContext *filter_ctx)
365 {
366 DnnDetectContext *ctx = filter_ctx->priv;
367 float conf_threshold = ctx->confidence;
368 int proposal_count = 0;
369 int detect_size = 0;
370 float *detections = NULL, *labels = NULL;
371 int nb_bboxes = 0;
372 AVDetectionBBoxHeader *header;
373 AVDetectionBBox *bbox;
374 int scale_w = ctx->scale_width;
375 int scale_h = ctx->scale_height;
376
377 if (nb_outputs == 1 && output->dims[3] == 7) {
378 proposal_count = output->dims[2];
379 detect_size = output->dims[3];
380 detections = output->data;
381 } else if (nb_outputs == 2 && output[0].dims[3] == 5) {
382 proposal_count = output[0].dims[2];
383 detect_size = output[0].dims[3];
384 detections = output[0].data;
385 labels = output[1].data;
386 } else if (nb_outputs == 2 && output[1].dims[3] == 5) {
387 proposal_count = output[1].dims[2];
388 detect_size = output[1].dims[3];
389 detections = output[1].data;
390 labels = output[0].data;
391 } else {
392 av_log(filter_ctx, AV_LOG_ERROR, "Model output shape doesn't match ssd requirement.\n");
393 return AVERROR(EINVAL);
394 }
395
396 if (proposal_count == 0)
397 return 0;
398
399 for (int i = 0; i < proposal_count; ++i) {
400 float conf;
401 if (nb_outputs == 1)
402 conf = detections[i * detect_size + 2];
403 else
404 conf = detections[i * detect_size + 4];
405 if (conf < conf_threshold) {
406 continue;
407 }
408 nb_bboxes++;
409 }
410
411 if (nb_bboxes == 0) {
412 av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n");
413 return 0;
414 }
415
416 header = av_detection_bbox_create_side_data(frame, nb_bboxes);
417 if (!header) {
418 av_log(filter_ctx, AV_LOG_ERROR, "failed to create side data with %d bounding boxes\n", nb_bboxes);
419 return -1;
420 }
421
422 av_strlcpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source));
423
424 for (int i = 0; i < proposal_count; ++i) {
425 int av_unused image_id = (int)detections[i * detect_size + 0];
426 int label_id;
427 float conf, x0, y0, x1, y1;
428
429 if (nb_outputs == 1) {
430 label_id = (int)detections[i * detect_size + 1];
431 conf = detections[i * detect_size + 2];
432 x0 = detections[i * detect_size + 3];
433 y0 = detections[i * detect_size + 4];
434 x1 = detections[i * detect_size + 5];
435 y1 = detections[i * detect_size + 6];
436 } else {
437 label_id = (int)labels[i];
438 x0 = detections[i * detect_size] / scale_w;
439 y0 = detections[i * detect_size + 1] / scale_h;
440 x1 = detections[i * detect_size + 2] / scale_w;
441 y1 = detections[i * detect_size + 3] / scale_h;
442 conf = detections[i * detect_size + 4];
443 }
444
445 if (conf < conf_threshold) {
446 continue;
447 }
448
449 bbox = av_get_detection_bbox(header, header->nb_bboxes - nb_bboxes);
450 bbox->x = (int)(x0 * frame->width);
451 bbox->w = (int)(x1 * frame->width) - bbox->x;
452 bbox->y = (int)(y0 * frame->height);
453 bbox->h = (int)(y1 * frame->height) - bbox->y;
454
455 bbox->detect_confidence = av_make_q((int)(conf * 10000), 10000);
456 bbox->classify_count = 0;
457
458 if (ctx->labels && label_id < ctx->label_count) {
459 av_strlcpy(bbox->detect_label, ctx->labels[label_id], sizeof(bbox->detect_label));
460 } else {
461 snprintf(bbox->detect_label, sizeof(bbox->detect_label), "%d", label_id);
462 }
463
464 nb_bboxes--;
465 if (nb_bboxes == 0) {
466 break;
467 }
468 }
469 return 0;
470 }
471
472 static int dnn_detect_post_proc_ov(AVFrame *frame, DNNData *output, int nb_outputs,
473 AVFilterContext *filter_ctx)
474 {
475 AVFrameSideData *sd;
476 DnnDetectContext *ctx = filter_ctx->priv;
477 int ret = 0;
478
479 sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES);
480 if (sd) {
481 av_log(filter_ctx, AV_LOG_ERROR, "already have bounding boxes in side data.\n");
482 return -1;
483 }
484
485 switch (ctx->model_type) {
486 case DDMT_SSD:
487 ret = dnn_detect_post_proc_ssd(frame, output, nb_outputs, filter_ctx);
488 if (ret < 0)
489 return ret;
490 break;
491 case DDMT_YOLOV1V2:
492 ret = dnn_detect_post_proc_yolo(frame, output, filter_ctx);
493 if (ret < 0)
494 return ret;
495 break;
496 case DDMT_YOLOV3:
497 case DDMT_YOLOV4:
498 ret = dnn_detect_post_proc_yolov3(frame, output, filter_ctx, nb_outputs);
499 if (ret < 0)
500 return ret;
501 break;
502 }
503 return 0;
504 }
505
506 static int dnn_detect_post_proc_tf(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
507 {
508 DnnDetectContext *ctx = filter_ctx->priv;
509 int proposal_count;
510 float conf_threshold = ctx->confidence;
511 float *conf, *position, *label_id, x0, y0, x1, y1;
512 int nb_bboxes = 0;
513 AVFrameSideData *sd;
514 AVDetectionBBox *bbox;
515 AVDetectionBBoxHeader *header;
516
517 proposal_count = *(float *)(output[0].data);
518 conf = output[1].data;
519 position = output[3].data;
520 label_id = output[2].data;
521
522 sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES);
523 if (sd) {
524 av_log(filter_ctx, AV_LOG_ERROR, "already have dnn bounding boxes in side data.\n");
525 return -1;
526 }
527
528 for (int i = 0; i < proposal_count; ++i) {
529 if (conf[i] < conf_threshold)
530 continue;
531 nb_bboxes++;
532 }
533
534 if (nb_bboxes == 0) {
535 av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n");
536 return 0;
537 }
538
539 header = av_detection_bbox_create_side_data(frame, nb_bboxes);
540 if (!header) {
541 av_log(filter_ctx, AV_LOG_ERROR, "failed to create side data with %d bounding boxes\n", nb_bboxes);
542 return -1;
543 }
544
545 av_strlcpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source));
546
547 for (int i = 0; i < proposal_count; ++i) {
548 y0 = position[i * 4];
549 x0 = position[i * 4 + 1];
550 y1 = position[i * 4 + 2];
551 x1 = position[i * 4 + 3];
552
553 bbox = av_get_detection_bbox(header, i);
554
555 if (conf[i] < conf_threshold) {
556 continue;
557 }
558
559 bbox->x = (int)(x0 * frame->width);
560 bbox->w = (int)(x1 * frame->width) - bbox->x;
561 bbox->y = (int)(y0 * frame->height);
562 bbox->h = (int)(y1 * frame->height) - bbox->y;
563
564 bbox->detect_confidence = av_make_q((int)(conf[i] * 10000), 10000);
565 bbox->classify_count = 0;
566
567 if (ctx->labels && label_id[i] < ctx->label_count) {
568 av_strlcpy(bbox->detect_label, ctx->labels[(int)label_id[i]], sizeof(bbox->detect_label));
569 } else {
570 snprintf(bbox->detect_label, sizeof(bbox->detect_label), "%d", (int)label_id[i]);
571 }
572
573 nb_bboxes--;
574 if (nb_bboxes == 0) {
575 break;
576 }
577 }
578 return 0;
579 }
580
581 static int dnn_detect_post_proc(AVFrame *frame, DNNData *output, uint32_t nb, AVFilterContext *filter_ctx)
582 {
583 DnnDetectContext *ctx = filter_ctx->priv;
584 DnnContext *dnn_ctx = &ctx->dnnctx;
585 switch (dnn_ctx->backend_type) {
586 case DNN_OV:
587 return dnn_detect_post_proc_ov(frame, output, nb, filter_ctx);
588 case DNN_TF:
589 return dnn_detect_post_proc_tf(frame, output, filter_ctx);
590 default:
591 avpriv_report_missing_feature(filter_ctx, "Current dnn backend does not support detect filter\n");
592 return AVERROR(EINVAL);
593 }
594 }
595
596 static void free_detect_labels(DnnDetectContext *ctx)
597 {
598 for (int i = 0; i < ctx->label_count; i++) {
599 av_freep(&ctx->labels[i]);
600 }
601 ctx->label_count = 0;
602 av_freep(&ctx->labels);
603 }
604
605 static int read_detect_label_file(AVFilterContext *context)
606 {
607 int line_len;
608 FILE *file;
609 DnnDetectContext *ctx = context->priv;
610
611 file = avpriv_fopen_utf8(ctx->labels_filename, "r");
612 if (!file){
613 av_log(context, AV_LOG_ERROR, "failed to open file %s\n", ctx->labels_filename);
614 return AVERROR(EINVAL);
615 }
616
617 while (!feof(file)) {
618 char *label;
619 char buf[256];
620 if (!fgets(buf, 256, file)) {
621 break;
622 }
623
624 line_len = strlen(buf);
625 while (line_len) {
626 int i = line_len - 1;
627 if (buf[i] == '\n' || buf[i] == '\r' || buf[i] == ' ') {
628 buf[i] = '\0';
629 line_len--;
630 } else {
631 break;
632 }
633 }
634
635 if (line_len == 0) // empty line
636 continue;
637
638 if (line_len >= AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE) {
639 av_log(context, AV_LOG_ERROR, "label %s too long\n", buf);
640 fclose(file);
641 return AVERROR(EINVAL);
642 }
643
644 label = av_strdup(buf);
645 if (!label) {
646 av_log(context, AV_LOG_ERROR, "failed to allocate memory for label %s\n", buf);
647 fclose(file);
648 return AVERROR(ENOMEM);
649 }
650
651 if (av_dynarray_add_nofree(&ctx->labels, &ctx->label_count, label) < 0) {
652 av_log(context, AV_LOG_ERROR, "failed to do av_dynarray_add\n");
653 fclose(file);
654 av_freep(&label);
655 return AVERROR(ENOMEM);
656 }
657 }
658
659 fclose(file);
660 return 0;
661 }
662
663 static int check_output_nb(DnnDetectContext *ctx, DNNBackendType backend_type, int output_nb)
664 {
665 switch(backend_type) {
666 case DNN_TF:
667 if (output_nb != 4) {
668 av_log(ctx, AV_LOG_ERROR, "Only support tensorflow detect model with 4 outputs, \
669 but get %d instead\n", output_nb);
670 return AVERROR(EINVAL);
671 }
672 return 0;
673 case DNN_OV:
674 return 0;
675 default:
676 avpriv_report_missing_feature(ctx, "Dnn detect filter does not support current backend\n");
677 return AVERROR(EINVAL);
678 }
679 return 0;
680 }
681
682 static av_cold int dnn_detect_init(AVFilterContext *context)
683 {
684 DnnDetectContext *ctx = context->priv;
685 DnnContext *dnn_ctx = &ctx->dnnctx;
686 int ret;
687
688 ret = ff_dnn_init(&ctx->dnnctx, DFT_ANALYTICS_DETECT, context);
689 if (ret < 0)
690 return ret;
691 ret = check_output_nb(ctx, dnn_ctx->backend_type, dnn_ctx->nb_outputs);
692 if (ret < 0)
693 return ret;
694 ctx->bboxes_fifo = av_fifo_alloc2(1, sizeof(AVDetectionBBox *), AV_FIFO_FLAG_AUTO_GROW);
695 if (!ctx->bboxes_fifo)
696 return AVERROR(ENOMEM);
697 ff_dnn_set_detect_post_proc(&ctx->dnnctx, dnn_detect_post_proc);
698
699 if (ctx->labels_filename) {
700 return read_detect_label_file(context);
701 }
702 if (ctx->anchors_str) {
703 ret = dnn_detect_parse_anchors(ctx->anchors_str, &ctx->anchors);
704 if (!ctx->anchors) {
705 av_log(context, AV_LOG_ERROR, "failed to parse anchors_str\n");
706 return AVERROR(EINVAL);
707 }
708 ctx->nb_anchor = ret;
709 }
710 return 0;
711 }
712
713 static const enum AVPixelFormat pix_fmts[] = {
714 AV_PIX_FMT_RGB24, AV_PIX_FMT_BGR24,
715 AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAYF32,
716 AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
717 AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
718 AV_PIX_FMT_NV12,
719 AV_PIX_FMT_NONE
720 };
721
722 static int dnn_detect_flush_frame(AVFilterLink *outlink, int64_t pts, int64_t *out_pts)
723 {
724 DnnDetectContext *ctx = outlink->src->priv;
725 int ret;
726 DNNAsyncStatusType async_state;
727
728 ret = ff_dnn_flush(&ctx->dnnctx);
729 if (ret != 0) {
730 return -1;
731 }
732
733 do {
734 AVFrame *in_frame = NULL;
735 AVFrame *out_frame = NULL;
736 async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame);
737 if (async_state == DAST_SUCCESS) {
738 ret = ff_filter_frame(outlink, in_frame);
739 if (ret < 0)
740 return ret;
741 if (out_pts)
742 *out_pts = in_frame->pts + pts;
743 }
744 av_usleep(5000);
745 } while (async_state >= DAST_NOT_READY);
746
747 return 0;
748 }
749
750 static int dnn_detect_activate(AVFilterContext *filter_ctx)
751 {
752 AVFilterLink *inlink = filter_ctx->inputs[0];
753 AVFilterLink *outlink = filter_ctx->outputs[0];
754 DnnDetectContext *ctx = filter_ctx->priv;
755 AVFrame *in = NULL;
756 int64_t pts;
757 int ret, status;
758 int got_frame = 0;
759 int async_state;
760
761 FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);
762
763 do {
764 // drain all input frames
765 ret = ff_inlink_consume_frame(inlink, &in);
766 if (ret < 0)
767 return ret;
768 if (ret > 0) {
769 if (ff_dnn_execute_model(&ctx->dnnctx, in, NULL) != 0) {
770 return AVERROR(EIO);
771 }
772 }
773 } while (ret > 0);
774
775 // drain all processed frames
776 do {
777 AVFrame *in_frame = NULL;
778 AVFrame *out_frame = NULL;
779 async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame);
780 if (async_state == DAST_SUCCESS) {
781 ret = ff_filter_frame(outlink, in_frame);
782 if (ret < 0)
783 return ret;
784 got_frame = 1;
785 }
786 } while (async_state == DAST_SUCCESS);
787
788 // if frame got, schedule to next filter
789 if (got_frame)
790 return 0;
791
792 if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
793 if (status == AVERROR_EOF) {
794 int64_t out_pts = pts;
795 ret = dnn_detect_flush_frame(outlink, pts, &out_pts);
796 ff_outlink_set_status(outlink, status, out_pts);
797 return ret;
798 }
799 }
800
801 FF_FILTER_FORWARD_WANTED(outlink, inlink);
802
803 return 0;
804 }
805
806 static av_cold void dnn_detect_uninit(AVFilterContext *context)
807 {
808 DnnDetectContext *ctx = context->priv;
809 AVDetectionBBox *bbox;
810 ff_dnn_uninit(&ctx->dnnctx);
811 while(av_fifo_can_read(ctx->bboxes_fifo)) {
812 av_fifo_read(ctx->bboxes_fifo, &bbox, 1);
813 av_freep(&bbox);
814 }
815 av_fifo_freep2(&ctx->bboxes_fifo);
816 av_freep(&ctx->anchors);
817 free_detect_labels(ctx);
818 }
819
820 static int config_input(AVFilterLink *inlink)
821 {
822 AVFilterContext *context = inlink->dst;
823 DnnDetectContext *ctx = context->priv;
824 DNNData model_input;
825 int ret, width_idx, height_idx;
826
827 ret = ff_dnn_get_input(&ctx->dnnctx, &model_input);
828 if (ret != 0) {
829 av_log(ctx, AV_LOG_ERROR, "could not get input from the model\n");
830 return ret;
831 }
832 width_idx = dnn_get_width_idx_by_layout(model_input.layout);
833 height_idx = dnn_get_height_idx_by_layout(model_input.layout);
834 ctx->scale_width = model_input.dims[width_idx] == -1 ? inlink->w :
835 model_input.dims[width_idx];
836 ctx->scale_height = model_input.dims[height_idx] == -1 ? inlink->h :
837 model_input.dims[height_idx];
838
839 return 0;
840 }
841
842 static const AVFilterPad dnn_detect_inputs[] = {
843 {
844 .name = "default",
845 .type = AVMEDIA_TYPE_VIDEO,
846 .config_props = config_input,
847 },
848 };
849
850 const AVFilter ff_vf_dnn_detect = {
851 .name = "dnn_detect",
852 .description = NULL_IF_CONFIG_SMALL("Apply DNN detect filter to the input."),
853 .priv_size = sizeof(DnnDetectContext),
854 .init = dnn_detect_init,
855 .uninit = dnn_detect_uninit,
856 FILTER_INPUTS(dnn_detect_inputs),
857 FILTER_OUTPUTS(ff_video_default_filterpad),
858 FILTER_PIXFMTS_ARRAY(pix_fmts),
859 .priv_class = &dnn_detect_class,
860 .activate = dnn_detect_activate,
861 };
862