Line |
Branch |
Exec |
Source |
1 |
|
|
/* |
2 |
|
|
* This file is part of FFmpeg. |
3 |
|
|
* |
4 |
|
|
* FFmpeg is free software; you can redistribute it and/or |
5 |
|
|
* modify it under the terms of the GNU Lesser General Public |
6 |
|
|
* License as published by the Free Software Foundation; either |
7 |
|
|
* version 2.1 of the License, or (at your option) any later version. |
8 |
|
|
* |
9 |
|
|
* FFmpeg is distributed in the hope that it will be useful, |
10 |
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 |
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 |
|
|
* Lesser General Public License for more details. |
13 |
|
|
* |
14 |
|
|
* You should have received a copy of the GNU Lesser General Public |
15 |
|
|
* License along with FFmpeg; if not, write to the Free Software |
16 |
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
17 |
|
|
*/ |
18 |
|
|
|
19 |
|
|
/** |
20 |
|
|
* @file |
21 |
|
|
* implementing an object detecting filter using deep learning networks. |
22 |
|
|
*/ |
23 |
|
|
|
24 |
|
|
#include "libavutil/file_open.h" |
25 |
|
|
#include "libavutil/mem.h" |
26 |
|
|
#include "libavutil/opt.h" |
27 |
|
|
#include "filters.h" |
28 |
|
|
#include "dnn_filter_common.h" |
29 |
|
|
#include "internal.h" |
30 |
|
|
#include "video.h" |
31 |
|
|
#include "libavutil/time.h" |
32 |
|
|
#include "libavutil/avstring.h" |
33 |
|
|
#include "libavutil/detection_bbox.h" |
34 |
|
|
#include "libavutil/fifo.h" |
35 |
|
|
|
36 |
|
|
typedef enum { |
37 |
|
|
DDMT_SSD, |
38 |
|
|
DDMT_YOLOV1V2, |
39 |
|
|
DDMT_YOLOV3, |
40 |
|
|
DDMT_YOLOV4 |
41 |
|
|
} DNNDetectionModelType; |
42 |
|
|
|
43 |
|
|
typedef struct DnnDetectContext { |
44 |
|
|
const AVClass *class; |
45 |
|
|
DnnContext dnnctx; |
46 |
|
|
float confidence; |
47 |
|
|
char *labels_filename; |
48 |
|
|
char **labels; |
49 |
|
|
int label_count; |
50 |
|
|
DNNDetectionModelType model_type; |
51 |
|
|
int cell_w; |
52 |
|
|
int cell_h; |
53 |
|
|
int nb_classes; |
54 |
|
|
AVFifo *bboxes_fifo; |
55 |
|
|
int scale_width; |
56 |
|
|
int scale_height; |
57 |
|
|
char *anchors_str; |
58 |
|
|
float *anchors; |
59 |
|
|
int nb_anchor; |
60 |
|
|
} DnnDetectContext; |
61 |
|
|
|
62 |
|
|
/* OFFSET resolves into the embedded DnnContext; OFFSET2 resolves into
 * DnnDetectContext itself. */
#define OFFSET(x) offsetof(DnnDetectContext, dnnctx.x)
#define OFFSET2(x) offsetof(DnnDetectContext, x)
#define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
/* User-visible options of the dnn_detect filter. */
static const AVOption dnn_detect_options[] = {
    { "dnn_backend", "DNN backend", OFFSET(backend_type), AV_OPT_TYPE_INT, { .i64 = DNN_OV }, INT_MIN, INT_MAX, FLAGS, .unit = "backend" },
#if (CONFIG_LIBTENSORFLOW == 1)
    { "tensorflow", "tensorflow backend flag", 0, AV_OPT_TYPE_CONST, { .i64 = DNN_TF }, 0, 0, FLAGS, .unit = "backend" },
#endif
#if (CONFIG_LIBOPENVINO == 1)
    { "openvino", "openvino backend flag", 0, AV_OPT_TYPE_CONST, { .i64 = DNN_OV }, 0, 0, FLAGS, .unit = "backend" },
#endif
    DNN_COMMON_OPTIONS
    { "confidence", "threshold of confidence", OFFSET2(confidence), AV_OPT_TYPE_FLOAT, { .dbl = 0.5 }, 0, 1, FLAGS},
    { "labels", "path to labels file", OFFSET2(labels_filename), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS },
    { "model_type", "DNN detection model type", OFFSET2(model_type), AV_OPT_TYPE_INT, { .i64 = DDMT_SSD }, INT_MIN, INT_MAX, FLAGS, .unit = "model_type" },
    { "ssd", "output shape [1, 1, N, 7]", 0, AV_OPT_TYPE_CONST, { .i64 = DDMT_SSD }, 0, 0, FLAGS, .unit = "model_type" },
    { "yolo", "output shape [1, N*Cx*Cy*DetectionBox]", 0, AV_OPT_TYPE_CONST, { .i64 = DDMT_YOLOV1V2 }, 0, 0, FLAGS, .unit = "model_type" },
    { "yolov3", "outputs shape [1, N*D, Cx, Cy]", 0, AV_OPT_TYPE_CONST, { .i64 = DDMT_YOLOV3 }, 0, 0, FLAGS, .unit = "model_type" },
    { "yolov4", "outputs shape [1, N*D, Cx, Cy]", 0, AV_OPT_TYPE_CONST, { .i64 = DDMT_YOLOV4 }, 0, 0, FLAGS, .unit = "model_type" },
    /* NOTE(review): INTMAX_MAX as the max for AV_OPT_TYPE_INT options exceeds
     * INT_MAX; confirm INT_MAX was intended for the three options below. */
    { "cell_w", "cell width", OFFSET2(cell_w), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INTMAX_MAX, FLAGS },
    { "cell_h", "cell height", OFFSET2(cell_h), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INTMAX_MAX, FLAGS },
    { "nb_classes", "The number of class", OFFSET2(nb_classes), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INTMAX_MAX, FLAGS },
    { "anchors", "anchors, splited by '&'", OFFSET2(anchors_str), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS },
    { NULL }
};
87 |
|
|
|
88 |
|
|
AVFILTER_DEFINE_CLASS(dnn_detect); |
89 |
|
|
|
90 |
|
✗ |
static inline float sigmoid(float x) { |
91 |
|
✗ |
return 1.f / (1.f + exp(-x)); |
92 |
|
|
} |
93 |
|
|
|
94 |
|
✗ |
/* Identity activation: the raw network score is used as-is. */
static inline float linear(float x)
{
    return x;
}
97 |
|
|
|
98 |
|
✗ |
/* Return the index of the highest class score among nb_classes scores
 * that are stored with a stride of cell_size floats (planar layout).
 * Ties and all-non-positive scores resolve to index 0. */
static int dnn_detect_get_label_id(int nb_classes, int cell_size, float *label_data)
{
    int best = 0;
    float best_prob = 0;

    for (int cls = 0; cls < nb_classes; cls++) {
        float prob = label_data[cls * cell_size];
        if (prob > best_prob) {
            best_prob = prob;
            best = cls;
        }
    }
    return best;
}
110 |
|
|
|
111 |
|
✗ |
/**
 * Parse an '&'-separated anchor list (e.g. "10&13&16&30") into a
 * newly-allocated float array stored in *anchors.
 *
 * NOTE: av_strtok writes into anchors_str, so the input string is
 * modified in place.
 *
 * @return the number of anchors parsed, or 0 on allocation/parse failure
 *         (in which case *anchors is left untouched)
 */
static int dnn_detect_parse_anchors(char *anchors_str, float **anchors)
{
    char *saveptr = NULL, *token;
    float *anchors_buf;
    int nb_anchor = 0, i = 0;
    /* count separators first: N separators => N+1 anchor values */
    while(anchors_str[i] != '\0') {
        if(anchors_str[i] == '&')
            nb_anchor++;
        i++;
    }
    nb_anchor++;
    anchors_buf = av_mallocz(nb_anchor * sizeof(**anchors));
    if (!anchors_buf) {
        return 0;
    }
    for (int i = 0; i < nb_anchor; i++) {
        token = av_strtok(anchors_str, "&", &saveptr);
        if (!token) {
            av_freep(&anchors_buf);
            return 0;
        }
        anchors_buf[i] = strtof(token, NULL);
        /* after the first call av_strtok must continue from saveptr */
        anchors_str = NULL;
    }
    *anchors = anchors_buf;
    return nb_anchor;
}
138 |
|
|
|
139 |
|
|
/* Calculate Intersection Over Union */ |
140 |
|
✗ |
static float dnn_detect_IOU(AVDetectionBBox *bbox1, AVDetectionBBox *bbox2) |
141 |
|
|
{ |
142 |
|
✗ |
float overlapping_width = FFMIN(bbox1->x + bbox1->w, bbox2->x + bbox2->w) - FFMAX(bbox1->x, bbox2->x); |
143 |
|
✗ |
float overlapping_height = FFMIN(bbox1->y + bbox1->h, bbox2->y + bbox2->h) - FFMAX(bbox1->y, bbox2->y); |
144 |
|
✗ |
float intersection_area = |
145 |
|
✗ |
(overlapping_width < 0 || overlapping_height < 0) ? 0 : overlapping_height * overlapping_width; |
146 |
|
✗ |
float union_area = bbox1->w * bbox1->h + bbox2->w * bbox2->h - intersection_area; |
147 |
|
✗ |
return intersection_area / union_area; |
148 |
|
|
} |
149 |
|
|
|
150 |
|
✗ |
static int dnn_detect_parse_yolo_output(AVFrame *frame, DNNData *output, int output_index, |
151 |
|
|
AVFilterContext *filter_ctx) |
152 |
|
|
{ |
153 |
|
✗ |
DnnDetectContext *ctx = filter_ctx->priv; |
154 |
|
✗ |
float conf_threshold = ctx->confidence; |
155 |
|
|
int detection_boxes, box_size; |
156 |
|
✗ |
int cell_w = 0, cell_h = 0, scale_w = 0, scale_h = 0; |
157 |
|
✗ |
int nb_classes = ctx->nb_classes; |
158 |
|
✗ |
float *output_data = output[output_index].data; |
159 |
|
✗ |
float *anchors = ctx->anchors; |
160 |
|
|
AVDetectionBBox *bbox; |
161 |
|
✗ |
float (*post_process_raw_data)(float x) = linear; |
162 |
|
✗ |
int is_NHWC = 0; |
163 |
|
|
|
164 |
|
✗ |
if (ctx->model_type == DDMT_YOLOV1V2) { |
165 |
|
✗ |
cell_w = ctx->cell_w; |
166 |
|
✗ |
cell_h = ctx->cell_h; |
167 |
|
✗ |
scale_w = cell_w; |
168 |
|
✗ |
scale_h = cell_h; |
169 |
|
|
} else { |
170 |
|
✗ |
if (output[output_index].dims[2] != output[output_index].dims[3] && |
171 |
|
✗ |
output[output_index].dims[2] == output[output_index].dims[1]) { |
172 |
|
✗ |
is_NHWC = 1; |
173 |
|
✗ |
cell_w = output[output_index].dims[2]; |
174 |
|
✗ |
cell_h = output[output_index].dims[1]; |
175 |
|
|
} else { |
176 |
|
✗ |
cell_w = output[output_index].dims[3]; |
177 |
|
✗ |
cell_h = output[output_index].dims[2]; |
178 |
|
|
} |
179 |
|
✗ |
scale_w = ctx->scale_width; |
180 |
|
✗ |
scale_h = ctx->scale_height; |
181 |
|
|
} |
182 |
|
✗ |
box_size = nb_classes + 5; |
183 |
|
|
|
184 |
|
✗ |
switch (ctx->model_type) { |
185 |
|
✗ |
case DDMT_YOLOV1V2: |
186 |
|
|
case DDMT_YOLOV3: |
187 |
|
✗ |
post_process_raw_data = linear; |
188 |
|
✗ |
break; |
189 |
|
✗ |
case DDMT_YOLOV4: |
190 |
|
✗ |
post_process_raw_data = sigmoid; |
191 |
|
✗ |
break; |
192 |
|
|
} |
193 |
|
|
|
194 |
|
✗ |
if (!cell_h || !cell_w) { |
195 |
|
✗ |
av_log(filter_ctx, AV_LOG_ERROR, "cell_w and cell_h are detected\n"); |
196 |
|
✗ |
return AVERROR(EINVAL); |
197 |
|
|
} |
198 |
|
|
|
199 |
|
✗ |
if (!nb_classes) { |
200 |
|
✗ |
av_log(filter_ctx, AV_LOG_ERROR, "nb_classes is not set\n"); |
201 |
|
✗ |
return AVERROR(EINVAL); |
202 |
|
|
} |
203 |
|
|
|
204 |
|
✗ |
if (!anchors) { |
205 |
|
✗ |
av_log(filter_ctx, AV_LOG_ERROR, "anchors is not set\n"); |
206 |
|
✗ |
return AVERROR(EINVAL); |
207 |
|
|
} |
208 |
|
|
|
209 |
|
✗ |
if (output[output_index].dims[1] * output[output_index].dims[2] * |
210 |
|
✗ |
output[output_index].dims[3] % (box_size * cell_w * cell_h)) { |
211 |
|
✗ |
av_log(filter_ctx, AV_LOG_ERROR, "wrong cell_w, cell_h or nb_classes\n"); |
212 |
|
✗ |
return AVERROR(EINVAL); |
213 |
|
|
} |
214 |
|
✗ |
detection_boxes = output[output_index].dims[1] * |
215 |
|
✗ |
output[output_index].dims[2] * |
216 |
|
✗ |
output[output_index].dims[3] / box_size / cell_w / cell_h; |
217 |
|
|
|
218 |
|
✗ |
anchors = anchors + (detection_boxes * output_index * 2); |
219 |
|
|
/** |
220 |
|
|
* find all candidate bbox |
221 |
|
|
* yolo output can be reshaped to [B, N*D, Cx, Cy] |
222 |
|
|
* Detection box 'D' has format [`x`, `y`, `h`, `w`, `box_score`, `class_no_1`, ...,] |
223 |
|
|
**/ |
224 |
|
✗ |
for (int box_id = 0; box_id < detection_boxes; box_id++) { |
225 |
|
✗ |
for (int cx = 0; cx < cell_w; cx++) |
226 |
|
✗ |
for (int cy = 0; cy < cell_h; cy++) { |
227 |
|
|
float x, y, w, h, conf; |
228 |
|
|
float *detection_boxes_data; |
229 |
|
|
int label_id; |
230 |
|
|
|
231 |
|
✗ |
if (is_NHWC) { |
232 |
|
✗ |
detection_boxes_data = output_data + |
233 |
|
✗ |
((cy * cell_w + cx) * detection_boxes + box_id) * box_size; |
234 |
|
✗ |
conf = post_process_raw_data(detection_boxes_data[4]); |
235 |
|
|
} else { |
236 |
|
✗ |
detection_boxes_data = output_data + box_id * box_size * cell_w * cell_h; |
237 |
|
✗ |
conf = post_process_raw_data( |
238 |
|
✗ |
detection_boxes_data[cy * cell_w + cx + 4 * cell_w * cell_h]); |
239 |
|
|
} |
240 |
|
|
|
241 |
|
✗ |
if (is_NHWC) { |
242 |
|
✗ |
x = post_process_raw_data(detection_boxes_data[0]); |
243 |
|
✗ |
y = post_process_raw_data(detection_boxes_data[1]); |
244 |
|
✗ |
w = detection_boxes_data[2]; |
245 |
|
✗ |
h = detection_boxes_data[3]; |
246 |
|
✗ |
label_id = dnn_detect_get_label_id(ctx->nb_classes, 1, detection_boxes_data + 5); |
247 |
|
✗ |
conf = conf * post_process_raw_data(detection_boxes_data[label_id + 5]); |
248 |
|
|
} else { |
249 |
|
✗ |
x = post_process_raw_data(detection_boxes_data[cy * cell_w + cx]); |
250 |
|
✗ |
y = post_process_raw_data(detection_boxes_data[cy * cell_w + cx + cell_w * cell_h]); |
251 |
|
✗ |
w = detection_boxes_data[cy * cell_w + cx + 2 * cell_w * cell_h]; |
252 |
|
✗ |
h = detection_boxes_data[cy * cell_w + cx + 3 * cell_w * cell_h]; |
253 |
|
✗ |
label_id = dnn_detect_get_label_id(ctx->nb_classes, cell_w * cell_h, |
254 |
|
✗ |
detection_boxes_data + cy * cell_w + cx + 5 * cell_w * cell_h); |
255 |
|
✗ |
conf = conf * post_process_raw_data( |
256 |
|
✗ |
detection_boxes_data[cy * cell_w + cx + (label_id + 5) * cell_w * cell_h]); |
257 |
|
|
} |
258 |
|
✗ |
if (conf < conf_threshold) { |
259 |
|
✗ |
continue; |
260 |
|
|
} |
261 |
|
|
|
262 |
|
✗ |
bbox = av_mallocz(sizeof(*bbox)); |
263 |
|
✗ |
if (!bbox) |
264 |
|
✗ |
return AVERROR(ENOMEM); |
265 |
|
|
|
266 |
|
✗ |
bbox->w = exp(w) * anchors[box_id * 2] * frame->width / scale_w; |
267 |
|
✗ |
bbox->h = exp(h) * anchors[box_id * 2 + 1] * frame->height / scale_h; |
268 |
|
✗ |
bbox->x = (cx + x) / cell_w * frame->width - bbox->w / 2; |
269 |
|
✗ |
bbox->y = (cy + y) / cell_h * frame->height - bbox->h / 2; |
270 |
|
✗ |
bbox->detect_confidence = av_make_q((int)(conf * 10000), 10000); |
271 |
|
✗ |
if (ctx->labels && label_id < ctx->label_count) { |
272 |
|
✗ |
av_strlcpy(bbox->detect_label, ctx->labels[label_id], sizeof(bbox->detect_label)); |
273 |
|
|
} else { |
274 |
|
✗ |
snprintf(bbox->detect_label, sizeof(bbox->detect_label), "%d", label_id); |
275 |
|
|
} |
276 |
|
|
|
277 |
|
✗ |
if (av_fifo_write(ctx->bboxes_fifo, &bbox, 1) < 0) { |
278 |
|
✗ |
av_freep(&bbox); |
279 |
|
✗ |
return AVERROR(ENOMEM); |
280 |
|
|
} |
281 |
|
✗ |
bbox = NULL; |
282 |
|
|
} |
283 |
|
|
} |
284 |
|
✗ |
return 0; |
285 |
|
|
} |
286 |
|
|
|
287 |
|
✗ |
/**
 * Run a label-aware non-maximum suppression over the candidate boxes
 * queued in ctx->bboxes_fifo and attach the survivors to the frame as
 * AV_FRAME_DATA_DETECTION_BBOXES side data.  The fifo is fully drained
 * and every candidate freed before returning.
 *
 * @return 0 on success (including "nothing detected"), -1 if the side
 *         data could not be created
 */
static int dnn_detect_fill_side_data(AVFrame *frame, AVFilterContext *filter_ctx)
{
    DnnDetectContext *ctx = filter_ctx->priv;
    float conf_threshold = ctx->confidence;
    AVDetectionBBox *bbox;
    int nb_bboxes = 0;
    AVDetectionBBoxHeader *header;
    if (av_fifo_can_read(ctx->bboxes_fifo) == 0) {
        av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n");
        return 0;
    }

    /* remove overlap bboxes */
    /* a box is suppressed when a same-label box with higher confidence
     * overlaps it by at least conf_threshold IOU; suppression is marked
     * by abusing classify_count as a tombstone (-1) */
    for (int i = 0; i < av_fifo_can_read(ctx->bboxes_fifo); i++){
        av_fifo_peek(ctx->bboxes_fifo, &bbox, 1, i);
        for (int j = 0; j < av_fifo_can_read(ctx->bboxes_fifo); j++) {
            AVDetectionBBox *overlap_bbox;
            av_fifo_peek(ctx->bboxes_fifo, &overlap_bbox, 1, j);
            if (!strcmp(bbox->detect_label, overlap_bbox->detect_label) &&
                av_cmp_q(bbox->detect_confidence, overlap_bbox->detect_confidence) < 0 &&
                dnn_detect_IOU(bbox, overlap_bbox) >= conf_threshold) {
                bbox->classify_count = -1; // bad result
                nb_bboxes++;
                break;
            }
        }
    }
    /* nb_bboxes counted the suppressed boxes above; flip it to survivors */
    nb_bboxes = av_fifo_can_read(ctx->bboxes_fifo) - nb_bboxes;
    header = av_detection_bbox_create_side_data(frame, nb_bboxes);
    if (!header) {
        av_log(filter_ctx, AV_LOG_ERROR, "failed to create side data with %d bounding boxes\n", nb_bboxes);
        return -1;
    }
    av_strlcpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source));

    /* drain the fifo: copy survivors into the header slots front-to-back
     * and free every candidate regardless */
    while(av_fifo_can_read(ctx->bboxes_fifo)) {
        AVDetectionBBox *candidate_bbox;
        av_fifo_read(ctx->bboxes_fifo, &candidate_bbox, 1);

        if (nb_bboxes > 0 && candidate_bbox->classify_count != -1) {
            bbox = av_get_detection_bbox(header, header->nb_bboxes - nb_bboxes);
            memcpy(bbox, candidate_bbox, sizeof(*bbox));
            nb_bboxes--;
        }
        av_freep(&candidate_bbox);
    }
    return 0;
}
335 |
|
|
|
336 |
|
✗ |
/* Post-process a single-output yolo (v1/v2) model: parse output tensor 0,
 * then convert the queued candidates into frame side data. */
static int dnn_detect_post_proc_yolo(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
{
    int ret = dnn_detect_parse_yolo_output(frame, output, 0, filter_ctx);

    if (ret >= 0)
        ret = dnn_detect_fill_side_data(frame, filter_ctx);
    return ret < 0 ? ret : 0;
}
347 |
|
|
|
348 |
|
✗ |
static int dnn_detect_post_proc_yolov3(AVFrame *frame, DNNData *output, |
349 |
|
|
AVFilterContext *filter_ctx, int nb_outputs) |
350 |
|
|
{ |
351 |
|
✗ |
int ret = 0; |
352 |
|
✗ |
for (int i = 0; i < nb_outputs; i++) { |
353 |
|
✗ |
ret = dnn_detect_parse_yolo_output(frame, output, i, filter_ctx); |
354 |
|
✗ |
if (ret < 0) |
355 |
|
✗ |
return ret; |
356 |
|
|
} |
357 |
|
✗ |
ret = dnn_detect_fill_side_data(frame, filter_ctx); |
358 |
|
✗ |
if (ret < 0) |
359 |
|
✗ |
return ret; |
360 |
|
✗ |
return 0; |
361 |
|
|
} |
362 |
|
|
|
363 |
|
✗ |
/**
 * Post-process an SSD-style detection output and attach the boxes as
 * frame side data.  Two layouts are accepted:
 *  - one output of shape [1,1,N,7]: rows of
 *    [image_id, label, conf, x0, y0, x1, y1] with normalized coords;
 *  - two outputs, one [.,.,N,5] with rows [x0, y0, x1, y1, conf] in
 *    model-input pixels (divided by scale_w/scale_h here) plus a
 *    separate label tensor.
 *
 * @return 0 on success, -1 if side data creation fails, AVERROR(EINVAL)
 *         if the output shapes match neither layout
 */
static int dnn_detect_post_proc_ssd(AVFrame *frame, DNNData *output, int nb_outputs,
                                    AVFilterContext *filter_ctx)
{
    DnnDetectContext *ctx = filter_ctx->priv;
    float conf_threshold = ctx->confidence;
    int proposal_count = 0;
    int detect_size = 0;
    float *detections = NULL, *labels = NULL;
    int nb_bboxes = 0;
    AVDetectionBBoxHeader *header;
    AVDetectionBBox *bbox;
    int scale_w = ctx->scale_width;
    int scale_h = ctx->scale_height;

    /* identify which layout the model emitted */
    if (nb_outputs == 1 && output->dims[3] == 7) {
        proposal_count = output->dims[2];
        detect_size = output->dims[3];
        detections = output->data;
    } else if (nb_outputs == 2 && output[0].dims[3] == 5) {
        proposal_count = output[0].dims[2];
        detect_size = output[0].dims[3];
        detections = output[0].data;
        labels = output[1].data;
    } else if (nb_outputs == 2 && output[1].dims[3] == 5) {
        proposal_count = output[1].dims[2];
        detect_size = output[1].dims[3];
        detections = output[1].data;
        labels = output[0].data;
    } else {
        av_log(filter_ctx, AV_LOG_ERROR, "Model output shape doesn't match ssd requirement.\n");
        return AVERROR(EINVAL);
    }

    if (proposal_count == 0)
        return 0;

    /* first pass: count proposals above the confidence threshold so the
     * side data can be sized exactly */
    for (int i = 0; i < proposal_count; ++i) {
        float conf;
        if (nb_outputs == 1)
            conf = detections[i * detect_size + 2];
        else
            conf = detections[i * detect_size + 4];
        if (conf < conf_threshold) {
            continue;
        }
        nb_bboxes++;
    }

    if (nb_bboxes == 0) {
        av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n");
        return 0;
    }

    header = av_detection_bbox_create_side_data(frame, nb_bboxes);
    if (!header) {
        av_log(filter_ctx, AV_LOG_ERROR, "failed to create side data with %d bounding boxes\n", nb_bboxes);
        return -1;
    }

    av_strlcpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source));

    /* second pass: decode each surviving proposal into a bbox slot */
    for (int i = 0; i < proposal_count; ++i) {
        int av_unused image_id = (int)detections[i * detect_size + 0];
        int label_id;
        float conf, x0, y0, x1, y1;

        if (nb_outputs == 1) {
            label_id = (int)detections[i * detect_size + 1];
            conf = detections[i * detect_size + 2];
            x0 = detections[i * detect_size + 3];
            y0 = detections[i * detect_size + 4];
            x1 = detections[i * detect_size + 5];
            y1 = detections[i * detect_size + 6];
        } else {
            label_id = (int)labels[i];
            x0 = detections[i * detect_size] / scale_w;
            y0 = detections[i * detect_size + 1] / scale_h;
            x1 = detections[i * detect_size + 2] / scale_w;
            y1 = detections[i * detect_size + 3] / scale_h;
            conf = detections[i * detect_size + 4];
        }

        if (conf < conf_threshold) {
            continue;
        }

        /* slots are filled front-to-back as nb_bboxes counts down */
        bbox = av_get_detection_bbox(header, header->nb_bboxes - nb_bboxes);
        bbox->x = (int)(x0 * frame->width);
        bbox->w = (int)(x1 * frame->width) - bbox->x;
        bbox->y = (int)(y0 * frame->height);
        bbox->h = (int)(y1 * frame->height) - bbox->y;

        bbox->detect_confidence = av_make_q((int)(conf * 10000), 10000);
        bbox->classify_count = 0;

        if (ctx->labels && label_id < ctx->label_count) {
            av_strlcpy(bbox->detect_label, ctx->labels[label_id], sizeof(bbox->detect_label));
        } else {
            snprintf(bbox->detect_label, sizeof(bbox->detect_label), "%d", label_id);
        }

        nb_bboxes--;
        if (nb_bboxes == 0) {
            break;
        }
    }
    return 0;
}
471 |
|
|
|
472 |
|
✗ |
static int dnn_detect_post_proc_ov(AVFrame *frame, DNNData *output, int nb_outputs, |
473 |
|
|
AVFilterContext *filter_ctx) |
474 |
|
|
{ |
475 |
|
|
AVFrameSideData *sd; |
476 |
|
✗ |
DnnDetectContext *ctx = filter_ctx->priv; |
477 |
|
✗ |
int ret = 0; |
478 |
|
|
|
479 |
|
✗ |
sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES); |
480 |
|
✗ |
if (sd) { |
481 |
|
✗ |
av_log(filter_ctx, AV_LOG_ERROR, "already have bounding boxes in side data.\n"); |
482 |
|
✗ |
return -1; |
483 |
|
|
} |
484 |
|
|
|
485 |
|
✗ |
switch (ctx->model_type) { |
486 |
|
✗ |
case DDMT_SSD: |
487 |
|
✗ |
ret = dnn_detect_post_proc_ssd(frame, output, nb_outputs, filter_ctx); |
488 |
|
✗ |
if (ret < 0) |
489 |
|
✗ |
return ret; |
490 |
|
✗ |
break; |
491 |
|
✗ |
case DDMT_YOLOV1V2: |
492 |
|
✗ |
ret = dnn_detect_post_proc_yolo(frame, output, filter_ctx); |
493 |
|
✗ |
if (ret < 0) |
494 |
|
✗ |
return ret; |
495 |
|
✗ |
break; |
496 |
|
✗ |
case DDMT_YOLOV3: |
497 |
|
|
case DDMT_YOLOV4: |
498 |
|
✗ |
ret = dnn_detect_post_proc_yolov3(frame, output, filter_ctx, nb_outputs); |
499 |
|
✗ |
if (ret < 0) |
500 |
|
✗ |
return ret; |
501 |
|
✗ |
break; |
502 |
|
|
} |
503 |
|
✗ |
return 0; |
504 |
|
|
} |
505 |
|
|
|
506 |
|
✗ |
/**
 * TensorFlow-backend post-processing: decode the four output tensors of
 * a TF object-detection model and attach boxes as frame side data.
 * Tensor order as consumed here: [0] num detections, [1] confidences,
 * [2] class ids, [3] boxes as [y0, x0, y1, x1] normalized coordinates.
 *
 * @return 0 on success, -1 if side data already exists or cannot be created
 */
static int dnn_detect_post_proc_tf(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
{
    DnnDetectContext *ctx = filter_ctx->priv;
    int proposal_count;
    float conf_threshold = ctx->confidence;
    float *conf, *position, *label_id, x0, y0, x1, y1;
    int nb_bboxes = 0;
    AVFrameSideData *sd;
    AVDetectionBBox *bbox;
    AVDetectionBBoxHeader *header;

    /* the detection count itself is stored as a float scalar */
    proposal_count = *(float *)(output[0].data);
    conf = output[1].data;
    position = output[3].data;
    label_id = output[2].data;

    sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES);
    if (sd) {
        av_log(filter_ctx, AV_LOG_ERROR, "already have dnn bounding boxes in side data.\n");
        return -1;
    }

    /* first pass: size the side data */
    for (int i = 0; i < proposal_count; ++i) {
        if (conf[i] < conf_threshold)
            continue;
        nb_bboxes++;
    }

    if (nb_bboxes == 0) {
        av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n");
        return 0;
    }

    header = av_detection_bbox_create_side_data(frame, nb_bboxes);
    if (!header) {
        av_log(filter_ctx, AV_LOG_ERROR, "failed to create side data with %d bounding boxes\n", nb_bboxes);
        return -1;
    }

    av_strlcpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source));

    /* second pass: decode surviving proposals into pixel-space boxes */
    for (int i = 0; i < proposal_count; ++i) {
        y0 = position[i * 4];
        x0 = position[i * 4 + 1];
        y1 = position[i * 4 + 2];
        x1 = position[i * 4 + 3];

        bbox = av_get_detection_bbox(header, i);

        if (conf[i] < conf_threshold) {
            continue;
        }

        bbox->x = (int)(x0 * frame->width);
        bbox->w = (int)(x1 * frame->width) - bbox->x;
        bbox->y = (int)(y0 * frame->height);
        bbox->h = (int)(y1 * frame->height) - bbox->y;

        bbox->detect_confidence = av_make_q((int)(conf[i] * 10000), 10000);
        bbox->classify_count = 0;

        if (ctx->labels && label_id[i] < ctx->label_count) {
            av_strlcpy(bbox->detect_label, ctx->labels[(int)label_id[i]], sizeof(bbox->detect_label));
        } else {
            snprintf(bbox->detect_label, sizeof(bbox->detect_label), "%d", (int)label_id[i]);
        }

        nb_bboxes--;
        if (nb_bboxes == 0) {
            break;
        }
    }
    return 0;
}
580 |
|
|
|
581 |
|
✗ |
/* Backend dispatch for post-processing; registered with the DNN layer
 * via ff_dnn_set_detect_post_proc(). */
static int dnn_detect_post_proc(AVFrame *frame, DNNData *output, uint32_t nb, AVFilterContext *filter_ctx)
{
    DnnDetectContext *ctx = filter_ctx->priv;
    DnnContext *dnn_ctx = &ctx->dnnctx;

    if (dnn_ctx->backend_type == DNN_OV)
        return dnn_detect_post_proc_ov(frame, output, nb, filter_ctx);
    if (dnn_ctx->backend_type == DNN_TF)
        return dnn_detect_post_proc_tf(frame, output, filter_ctx);

    avpriv_report_missing_feature(filter_ctx, "Current dnn backend does not support detect filter\n");
    return AVERROR(EINVAL);
}
595 |
|
|
|
596 |
|
✗ |
/* Release the label-name array loaded by read_detect_label_file(). */
static void free_detect_labels(DnnDetectContext *ctx)
{
    int i;

    for (i = 0; i < ctx->label_count; i++)
        av_freep(&ctx->labels[i]);
    ctx->label_count = 0;
    av_freep(&ctx->labels);
}
604 |
|
|
|
605 |
|
✗ |
/**
 * Load label names from ctx->labels_filename, one label per line, into
 * ctx->labels / ctx->label_count.  Trailing newline/CR/space characters
 * are stripped; empty lines are skipped.
 *
 * @return 0 on success, AVERROR(EINVAL) for unreadable file or over-long
 *         label, AVERROR(ENOMEM) on allocation failure
 */
static int read_detect_label_file(AVFilterContext *context)
{
    int line_len;
    FILE *file;
    DnnDetectContext *ctx = context->priv;

    file = avpriv_fopen_utf8(ctx->labels_filename, "r");
    if (!file){
        av_log(context, AV_LOG_ERROR, "failed to open file %s\n", ctx->labels_filename);
        return AVERROR(EINVAL);
    }

    while (!feof(file)) {
        char *label;
        char buf[256];
        if (!fgets(buf, 256, file)) {
            break;
        }

        /* strip trailing newline / carriage return / spaces */
        line_len = strlen(buf);
        while (line_len) {
            int i = line_len - 1;
            if (buf[i] == '\n' || buf[i] == '\r' || buf[i] == ' ') {
                buf[i] = '\0';
                line_len--;
            } else {
                break;
            }
        }

        if (line_len == 0) // empty line
            continue;

        /* labels must fit into AVDetectionBBox.detect_label */
        if (line_len >= AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE) {
            av_log(context, AV_LOG_ERROR, "label %s too long\n", buf);
            fclose(file);
            return AVERROR(EINVAL);
        }

        label = av_strdup(buf);
        if (!label) {
            av_log(context, AV_LOG_ERROR, "failed to allocate memory for label %s\n", buf);
            fclose(file);
            return AVERROR(ENOMEM);
        }

        if (av_dynarray_add_nofree(&ctx->labels, &ctx->label_count, label) < 0) {
            av_log(context, AV_LOG_ERROR, "failed to do av_dynarray_add\n");
            fclose(file);
            av_freep(&label);
            return AVERROR(ENOMEM);
        }
    }

    fclose(file);
    return 0;
}
662 |
|
|
|
663 |
|
✗ |
/**
 * Validate the number of model outputs against what the selected backend
 * supports (TF models must expose exactly 4 outputs; OV accepts any).
 *
 * @return 0 if acceptable, AVERROR(EINVAL) otherwise
 */
static int check_output_nb(DnnDetectContext *ctx, DNNBackendType backend_type, int output_nb)
{
    switch(backend_type) {
    case DNN_TF:
        if (output_nb != 4) {
            /* adjacent string literals instead of a '\' continuation, which
             * embedded the next line's indentation into the message */
            av_log(ctx, AV_LOG_ERROR, "Only support tensorflow detect model with 4 outputs, "
                                      "but get %d instead\n", output_nb);
            return AVERROR(EINVAL);
        }
        return 0;
    case DNN_OV:
        return 0;
    default:
        avpriv_report_missing_feature(ctx, "Dnn detect filter does not support current backend\n");
        return AVERROR(EINVAL);
    }
}
681 |
|
|
|
682 |
|
✗ |
/**
 * Filter init: bring up the DNN backend, validate its output count,
 * allocate the candidate-bbox fifo, register the post-process callback,
 * and parse the optional labels file and anchors string.
 *
 * Fix: the original returned immediately after loading the labels file,
 * so the "anchors" option was silently ignored whenever "labels" was
 * also set; both are now processed.
 *
 * @return 0 on success, negative AVERROR otherwise (uninit is still
 *         called on failure)
 */
static av_cold int dnn_detect_init(AVFilterContext *context)
{
    DnnDetectContext *ctx = context->priv;
    DnnContext *dnn_ctx = &ctx->dnnctx;
    int ret;

    ret = ff_dnn_init(&ctx->dnnctx, DFT_ANALYTICS_DETECT, context);
    if (ret < 0)
        return ret;
    ret = check_output_nb(ctx, dnn_ctx->backend_type, dnn_ctx->nb_outputs);
    if (ret < 0)
        return ret;
    ctx->bboxes_fifo = av_fifo_alloc2(1, sizeof(AVDetectionBBox *), AV_FIFO_FLAG_AUTO_GROW);
    if (!ctx->bboxes_fifo)
        return AVERROR(ENOMEM);
    ff_dnn_set_detect_post_proc(&ctx->dnnctx, dnn_detect_post_proc);

    if (ctx->labels_filename) {
        ret = read_detect_label_file(context);
        if (ret < 0)
            return ret;
    }
    if (ctx->anchors_str) {
        ret = dnn_detect_parse_anchors(ctx->anchors_str, &ctx->anchors);
        if (!ctx->anchors) {
            av_log(context, AV_LOG_ERROR, "failed to parse anchors_str\n");
            return AVERROR(EINVAL);
        }
        ctx->nb_anchor = ret;
    }
    return 0;
}
712 |
|
|
|
713 |
|
|
/* Pixel formats accepted on the filter input. */
static const enum AVPixelFormat pix_fmts[] = {
    AV_PIX_FMT_RGB24, AV_PIX_FMT_BGR24,
    AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAYF32,
    AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
    AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
    AV_PIX_FMT_NV12,
    AV_PIX_FMT_NONE
};
721 |
|
|
|
722 |
|
✗ |
/**
 * Flush the async DNN pipeline at EOF: ask the backend to flush, then
 * poll (sleeping 5 ms per iteration) until every in-flight frame has
 * been retrieved and forwarded downstream.
 *
 * @param out_pts optional; receives last forwarded frame pts plus
 *                the stream's EOF pts offset
 * @return 0 on success, negative on error
 */
static int dnn_detect_flush_frame(AVFilterLink *outlink, int64_t pts, int64_t *out_pts)
{
    DnnDetectContext *ctx = outlink->src->priv;
    int ret;
    DNNAsyncStatusType async_state;

    ret = ff_dnn_flush(&ctx->dnnctx);
    if (ret != 0) {
        return -1;
    }

    do {
        AVFrame *in_frame = NULL;
        AVFrame *out_frame = NULL;
        async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame);
        if (async_state == DAST_SUCCESS) {
            /* detection attaches side data to the input frame in place */
            ret = ff_filter_frame(outlink, in_frame);
            if (ret < 0)
                return ret;
            if (out_pts)
                *out_pts = in_frame->pts + pts;
        }
        av_usleep(5000);
    } while (async_state >= DAST_NOT_READY);

    return 0;
}
749 |
|
|
|
750 |
|
✗ |
/**
 * Filter activate callback: feed all available input frames to the
 * async DNN backend, forward all completed results, and handle EOF by
 * flushing the pipeline.
 *
 * @return 0 or a negative AVERROR
 */
static int dnn_detect_activate(AVFilterContext *filter_ctx)
{
    AVFilterLink *inlink = filter_ctx->inputs[0];
    AVFilterLink *outlink = filter_ctx->outputs[0];
    DnnDetectContext *ctx = filter_ctx->priv;
    AVFrame *in = NULL;
    int64_t pts;
    int ret, status;
    int got_frame = 0;
    int async_state;

    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);

    do {
        // drain all input frames
        ret = ff_inlink_consume_frame(inlink, &in);
        if (ret < 0)
            return ret;
        if (ret > 0) {
            /* ownership of 'in' passes to the DNN layer */
            if (ff_dnn_execute_model(&ctx->dnnctx, in, NULL) != 0) {
                return AVERROR(EIO);
            }
        }
    } while (ret > 0);

    // drain all processed frames
    do {
        AVFrame *in_frame = NULL;
        AVFrame *out_frame = NULL;
        async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame);
        if (async_state == DAST_SUCCESS) {
            ret = ff_filter_frame(outlink, in_frame);
            if (ret < 0)
                return ret;
            got_frame = 1;
        }
    } while (async_state == DAST_SUCCESS);

    // if frame got, schedule to next filter
    if (got_frame)
        return 0;

    if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
        if (status == AVERROR_EOF) {
            int64_t out_pts = pts;
            ret = dnn_detect_flush_frame(outlink, pts, &out_pts);
            ff_outlink_set_status(outlink, status, out_pts);
            return ret;
        }
    }

    FF_FILTER_FORWARD_WANTED(outlink, inlink);

    return 0;
}
805 |
|
|
|
806 |
|
✗ |
/**
 * Filter uninit: tear down the DNN backend and release all filter-owned
 * resources (queued candidate boxes, anchor array, label names).
 *
 * Fix: guard the fifo access — init() can fail before av_fifo_alloc2()
 * runs, and uninit is still invoked, so ctx->bboxes_fifo may be NULL.
 */
static av_cold void dnn_detect_uninit(AVFilterContext *context)
{
    DnnDetectContext *ctx = context->priv;
    AVDetectionBBox *bbox;

    ff_dnn_uninit(&ctx->dnnctx);
    if (ctx->bboxes_fifo) {
        while (av_fifo_can_read(ctx->bboxes_fifo)) {
            av_fifo_read(ctx->bboxes_fifo, &bbox, 1);
            av_freep(&bbox);
        }
        av_fifo_freep2(&ctx->bboxes_fifo);
    }
    av_freep(&ctx->anchors);
    free_detect_labels(ctx);
}
819 |
|
|
|
820 |
|
✗ |
/* Input-link config: query the model's expected input geometry and cache
 * the scaling reference size (falling back to the link dimensions when
 * the model side is dynamic, i.e. -1). */
static int config_input(AVFilterLink *inlink)
{
    AVFilterContext *context = inlink->dst;
    DnnDetectContext *ctx = context->priv;
    DNNData model_input;
    int ret, w_idx, h_idx;

    ret = ff_dnn_get_input(&ctx->dnnctx, &model_input);
    if (ret != 0) {
        av_log(ctx, AV_LOG_ERROR, "could not get input from the model\n");
        return ret;
    }

    w_idx = dnn_get_width_idx_by_layout(model_input.layout);
    h_idx = dnn_get_height_idx_by_layout(model_input.layout);
    if (model_input.dims[w_idx] == -1)
        ctx->scale_width = inlink->w;
    else
        ctx->scale_width = model_input.dims[w_idx];
    if (model_input.dims[h_idx] == -1)
        ctx->scale_height = inlink->h;
    else
        ctx->scale_height = model_input.dims[h_idx];

    return 0;
}
841 |
|
|
|
842 |
|
|
/* Single video input pad; config_input caches the model's input size. */
static const AVFilterPad dnn_detect_inputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_VIDEO,
        .config_props = config_input,
    },
};
849 |
|
|
|
850 |
|
|
/* Filter registration for vf_dnn_detect. */
const AVFilter ff_vf_dnn_detect = {
    .name          = "dnn_detect",
    .description   = NULL_IF_CONFIG_SMALL("Apply DNN detect filter to the input."),
    .priv_size     = sizeof(DnnDetectContext),
    .init          = dnn_detect_init,
    .uninit        = dnn_detect_uninit,
    FILTER_INPUTS(dnn_detect_inputs),
    FILTER_OUTPUTS(ff_video_default_filterpad),
    FILTER_PIXFMTS_ARRAY(pix_fmts),
    .priv_class    = &dnn_detect_class,
    .activate      = dnn_detect_activate,
};
862 |
|
|
|