Skip to content

Commit 7a879cc

Browse files
Semmer2 authored and guoyejun committed
libavfilter: vf_drawtext filter support draw text with detection bounding boxes in side_data
This feature can be used with dnn detection by setting vf_drawtext's option text_source=side_data_detection_bboxes, for example: ./ffmpeg -i face.jpeg -vf dnn_detect=dnn_backend=openvino:model=face-detection-adas-0001.xml:\ input=data:output=detection_out:labels=face-detection-adas-0001.label,drawbox=box_source= side_data_detection_bboxes,drawtext=text_source=side_data_detection_bboxes:fontcolor=green:\ fontsize=40, -y face_detect.jpeg Please note, the default fontsize of vf_drawtext is 12, which may be too small to be seen clearly. Signed-off-by: Ting Fu <ting.fu@intel.com>
1 parent f444be6 commit 7a879cc

File tree

2 files changed

+79
-6
lines changed

2 files changed

+79
-6
lines changed

doc/filters.texi

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10788,6 +10788,14 @@ parameter @var{text}.
1078810788

1078910789
If both @var{text} and @var{textfile} are specified, an error is thrown.
1079010790

10791+
@item text_source
10792+
Set the text source to side_data_detection_bboxes if you want to draw the text data stored in
10793+
the detection bboxes of the frame side data.
10794+
10795+
If text source is set, @var{text} and @var{textfile} will be ignored and the text data
10796+
in the detection bboxes of side data will be used instead. So please do not use this parameter
10797+
if you are not sure about the text source.
10798+
1079110799
@item reload
1079210800
If set to 1, the @var{textfile} will be reloaded before each frame.
1079310801
Be sure to update it atomically, or it may be read partially, or even fail.

libavfilter/vf_drawtext.c

Lines changed: 71 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
#include "libavutil/time_internal.h"
5656
#include "libavutil/tree.h"
5757
#include "libavutil/lfg.h"
58+
#include "libavutil/detection_bbox.h"
5859
#include "avfilter.h"
5960
#include "drawutils.h"
6061
#include "formats.h"
@@ -199,6 +200,8 @@ typedef struct DrawTextContext {
199200
int tc24hmax; ///< 1 if timecode is wrapped to 24 hours, 0 otherwise
200201
int reload; ///< reload text file for each frame
201202
int start_number; ///< starting frame number for n/frame_num var
203+
char *text_source_string; ///< the string to specify text data source
204+
enum AVFrameSideDataType text_source;
202205
#if CONFIG_LIBFRIBIDI
203206
int text_shaping; ///< 1 to shape the text before drawing it
204207
#endif
@@ -246,6 +249,7 @@ static const AVOption drawtext_options[]= {
246249
{ "alpha", "apply alpha while rendering", OFFSET(a_expr), AV_OPT_TYPE_STRING, { .str = "1" }, .flags = FLAGS },
247250
{"fix_bounds", "check and fix text coords to avoid clipping", OFFSET(fix_bounds), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS},
248251
{"start_number", "start frame number for n/frame_num variable", OFFSET(start_number), AV_OPT_TYPE_INT, {.i64=0}, 0, INT_MAX, FLAGS},
252+
{"text_source", "the source of text", OFFSET(text_source_string), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 1, FLAGS },
249253

250254
#if CONFIG_LIBFRIBIDI
251255
{"text_shaping", "attempt to shape text before drawing", OFFSET(text_shaping), AV_OPT_TYPE_BOOL, {.i64=1}, 0, 1, FLAGS},
@@ -690,6 +694,16 @@ static int shape_text(AVFilterContext *ctx)
690694
}
691695
#endif
692696

697+
static enum AVFrameSideDataType text_source_string_parse(const char *text_source_string)
698+
{
699+
av_assert0(text_source_string);
700+
if (!strcmp(text_source_string, "side_data_detection_bboxes")) {
701+
return AV_FRAME_DATA_DETECTION_BBOXES;
702+
} else {
703+
return AVERROR(EINVAL);
704+
}
705+
}
706+
693707
static av_cold int init(AVFilterContext *ctx)
694708
{
695709
int err;
@@ -731,9 +745,28 @@ static av_cold int init(AVFilterContext *ctx)
731745
s->text = av_strdup("");
732746
}
733747

748+
if (s->text_source_string) {
749+
s->text_source = text_source_string_parse(s->text_source_string);
750+
if ((int)s->text_source < 0) {
751+
av_log(ctx, AV_LOG_ERROR, "Error text source: %s\n", s->text_source_string);
752+
return AVERROR(EINVAL);
753+
}
754+
}
755+
756+
if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) {
757+
if (s->text) {
758+
av_log(ctx, AV_LOG_WARNING, "Multiple texts provided, will use text_source only\n");
759+
av_free(s->text);
760+
}
761+
s->text = av_mallocz(AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE *
762+
(AV_NUM_DETECTION_BBOX_CLASSIFY + 1));
763+
if (!s->text)
764+
return AVERROR(ENOMEM);
765+
}
766+
734767
if (!s->text) {
735768
av_log(ctx, AV_LOG_ERROR,
736-
"Either text, a valid file or a timecode must be provided\n");
769+
"Either text, a valid file, a timecode or text source must be provided\n");
737770
return AVERROR(EINVAL);
738771
}
739772

@@ -1440,10 +1473,15 @@ static int draw_text(AVFilterContext *ctx, AVFrame *frame,
14401473

14411474
s->var_values[VAR_LINE_H] = s->var_values[VAR_LH] = s->max_glyph_h;
14421475

1443-
s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values, &s->prng);
1444-
s->y = s->var_values[VAR_Y] = av_expr_eval(s->y_pexpr, s->var_values, &s->prng);
1445-
/* It is necessary if x is expressed from y */
1446-
s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values, &s->prng);
1476+
if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) {
1477+
s->var_values[VAR_X] = s->x;
1478+
s->var_values[VAR_Y] = s->y;
1479+
} else {
1480+
s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values, &s->prng);
1481+
s->y = s->var_values[VAR_Y] = av_expr_eval(s->y_pexpr, s->var_values, &s->prng);
1482+
/* It is necessary if x is expressed from y */
1483+
s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values, &s->prng);
1484+
}
14471485

14481486
update_alpha(s);
14491487
update_color_with_alpha(s, &fontcolor , s->fontcolor );
@@ -1511,6 +1549,21 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
15111549
AVFilterLink *outlink = ctx->outputs[0];
15121550
DrawTextContext *s = ctx->priv;
15131551
int ret;
1552+
const AVDetectionBBoxHeader *header = NULL;
1553+
const AVDetectionBBox *bbox;
1554+
AVFrameSideData *sd;
1555+
int loop = 1;
1556+
1557+
/*
 * When the text comes from detection bounding boxes, fetch them from the
 * frame side data. The condition must depend on the configured text source
 * only: sd has no value until av_frame_get_side_data() assigns it below, so
 * testing it beforehand would read an uninitialized pointer.
 */
if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) {
    sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES);
    if (sd) {
        header = (AVDetectionBBoxHeader *)sd->data;
        /* one draw_text() pass per bounding box in the header */
        loop = header->nb_bboxes;
    } else {
        /* no boxes attached to this frame: forward it without drawing */
        av_log(s, AV_LOG_WARNING, "No detection bboxes.\n");
        return ff_filter_frame(outlink, frame);
    }
}
15141567

15151568
if (s->reload) {
15161569
if ((ret = load_textfile(ctx)) < 0) {
@@ -1536,7 +1589,19 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
15361589
s->var_values[VAR_PKT_SIZE] = frame->pkt_size;
15371590
s->metadata = frame->metadata;
15381591

1539-
draw_text(ctx, frame, frame->width, frame->height);
1592+
for (int i = 0; i < loop; i++) {
1593+
if (header) {
1594+
bbox = av_get_detection_bbox(header, i);
1595+
strcpy(s->text, bbox->detect_label);
1596+
for (int j = 0; j < bbox->classify_count; j++) {
1597+
strcat(s->text, ", ");
1598+
strcat(s->text, bbox->classify_labels[j]);
1599+
}
1600+
s->x = bbox->x;
1601+
s->y = bbox->y - s->fontsize;
1602+
}
1603+
draw_text(ctx, frame, frame->width, frame->height);
1604+
}
15401605

15411606
av_log(ctx, AV_LOG_DEBUG, "n:%d t:%f text_w:%d text_h:%d x:%d y:%d\n",
15421607
(int)s->var_values[VAR_N], s->var_values[VAR_T],

0 commit comments

Comments
 (0)