5555#include "libavutil/time_internal.h"
5656#include "libavutil/tree.h"
5757#include "libavutil/lfg.h"
58+ #include "libavutil/detection_bbox.h"
5859#include "avfilter.h"
5960#include "drawutils.h"
6061#include "formats.h"
@@ -199,6 +200,8 @@ typedef struct DrawTextContext {
199200 int tc24hmax ; ///< 1 if timecode is wrapped to 24 hours, 0 otherwise
200201 int reload ; ///< reload text file for each frame
201202 int start_number ; ///< starting frame number for n/frame_num var
203+ char * text_source_string ; ///< the string to specify text data source
204+ enum AVFrameSideDataType text_source ;
202205#if CONFIG_LIBFRIBIDI
203206 int text_shaping ; ///< 1 to shape the text before drawing it
204207#endif
@@ -246,6 +249,7 @@ static const AVOption drawtext_options[]= {
246249 { "alpha" , "apply alpha while rendering" , OFFSET (a_expr ), AV_OPT_TYPE_STRING , { .str = "1" }, .flags = FLAGS },
247250 {"fix_bounds" , "check and fix text coords to avoid clipping" , OFFSET (fix_bounds ), AV_OPT_TYPE_BOOL , {.i64 = 0 }, 0 , 1 , FLAGS },
248251 {"start_number" , "start frame number for n/frame_num variable" , OFFSET (start_number ), AV_OPT_TYPE_INT , {.i64 = 0 }, 0 , INT_MAX , FLAGS },
252+ {"text_source" , "the source of text" , OFFSET (text_source_string ), AV_OPT_TYPE_STRING , {.str = NULL }, 0 , 1 , FLAGS },
249253
250254#if CONFIG_LIBFRIBIDI
251255 {"text_shaping" , "attempt to shape text before drawing" , OFFSET (text_shaping ), AV_OPT_TYPE_BOOL , {.i64 = 1 }, 0 , 1 , FLAGS },
@@ -690,6 +694,16 @@ static int shape_text(AVFilterContext *ctx)
690694}
691695#endif
692696
697+ static enum AVFrameSideDataType text_source_string_parse (const char * text_source_string )
698+ {
699+ av_assert0 (text_source_string );
700+ if (!strcmp (text_source_string , "side_data_detection_bboxes" )) {
701+ return AV_FRAME_DATA_DETECTION_BBOXES ;
702+ } else {
703+ return AVERROR (EINVAL );
704+ }
705+ }
706+
693707static av_cold int init (AVFilterContext * ctx )
694708{
695709 int err ;
@@ -731,9 +745,28 @@ static av_cold int init(AVFilterContext *ctx)
731745 s -> text = av_strdup ("" );
732746 }
733747
748+ if (s -> text_source_string ) {
749+ s -> text_source = text_source_string_parse (s -> text_source_string );
750+ if ((int )s -> text_source < 0 ) {
751+ av_log (ctx , AV_LOG_ERROR , "Error text source: %s\n" , s -> text_source_string );
752+ return AVERROR (EINVAL );
753+ }
754+ }
755+
756+ if (s -> text_source == AV_FRAME_DATA_DETECTION_BBOXES ) {
757+ if (s -> text ) {
758+ av_log (ctx , AV_LOG_WARNING , "Multiple texts provided, will use text_source only\n" );
759+ av_free (s -> text );
760+ }
761+ s -> text = av_mallocz (AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE *
762+ (AV_NUM_DETECTION_BBOX_CLASSIFY + 1 ));
763+ if (!s -> text )
764+ return AVERROR (ENOMEM );
765+ }
766+
734767 if (!s -> text ) {
735768 av_log (ctx , AV_LOG_ERROR ,
736- "Either text, a valid file or a timecode must be provided\n" );
769+ "Either text, a valid file, a timecode or text source must be provided\n" );
737770 return AVERROR (EINVAL );
738771 }
739772
@@ -1440,10 +1473,15 @@ static int draw_text(AVFilterContext *ctx, AVFrame *frame,
14401473
14411474 s -> var_values [VAR_LINE_H ] = s -> var_values [VAR_LH ] = s -> max_glyph_h ;
14421475
1443- s -> x = s -> var_values [VAR_X ] = av_expr_eval (s -> x_pexpr , s -> var_values , & s -> prng );
1444- s -> y = s -> var_values [VAR_Y ] = av_expr_eval (s -> y_pexpr , s -> var_values , & s -> prng );
1445- /* It is necessary if x is expressed from y */
1446- s -> x = s -> var_values [VAR_X ] = av_expr_eval (s -> x_pexpr , s -> var_values , & s -> prng );
1476+ if (s -> text_source == AV_FRAME_DATA_DETECTION_BBOXES ) {
1477+ s -> var_values [VAR_X ] = s -> x ;
1478+ s -> var_values [VAR_Y ] = s -> y ;
1479+ } else {
1480+ s -> x = s -> var_values [VAR_X ] = av_expr_eval (s -> x_pexpr , s -> var_values , & s -> prng );
1481+ s -> y = s -> var_values [VAR_Y ] = av_expr_eval (s -> y_pexpr , s -> var_values , & s -> prng );
1482+ /* It is necessary if x is expressed from y */
1483+ s -> x = s -> var_values [VAR_X ] = av_expr_eval (s -> x_pexpr , s -> var_values , & s -> prng );
1484+ }
14471485
14481486 update_alpha (s );
14491487 update_color_with_alpha (s , & fontcolor , s -> fontcolor );
@@ -1511,6 +1549,21 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
15111549 AVFilterLink * outlink = ctx -> outputs [0 ];
15121550 DrawTextContext * s = ctx -> priv ;
15131551 int ret ;
1552+ const AVDetectionBBoxHeader * header = NULL ;
1553+ const AVDetectionBBox * bbox ;
1554+ AVFrameSideData * sd ;
1555+ int loop = 1 ;
1556+
1557+ if (s -> text_source == AV_FRAME_DATA_DETECTION_BBOXES && sd ) {
1558+ sd = av_frame_get_side_data (frame , AV_FRAME_DATA_DETECTION_BBOXES );
1559+ if (sd ) {
1560+ header = (AVDetectionBBoxHeader * )sd -> data ;
1561+ loop = header -> nb_bboxes ;
1562+ } else {
1563+ av_log (s , AV_LOG_WARNING , "No detection bboxes.\n" );
1564+ return ff_filter_frame (outlink , frame );
1565+ }
1566+ }
15141567
15151568 if (s -> reload ) {
15161569 if ((ret = load_textfile (ctx )) < 0 ) {
@@ -1536,7 +1589,19 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
15361589 s -> var_values [VAR_PKT_SIZE ] = frame -> pkt_size ;
15371590 s -> metadata = frame -> metadata ;
15381591
1539- draw_text (ctx , frame , frame -> width , frame -> height );
1592+ for (int i = 0 ; i < loop ; i ++ ) {
1593+ if (header ) {
1594+ bbox = av_get_detection_bbox (header , i );
1595+ strcpy (s -> text , bbox -> detect_label );
1596+ for (int j = 0 ; j < bbox -> classify_count ; j ++ ) {
1597+ strcat (s -> text , ", " );
1598+ strcat (s -> text , bbox -> classify_labels [j ]);
1599+ }
1600+ s -> x = bbox -> x ;
1601+ s -> y = bbox -> y - s -> fontsize ;
1602+ }
1603+ draw_text (ctx , frame , frame -> width , frame -> height );
1604+ }
15401605
15411606 av_log (ctx , AV_LOG_DEBUG , "n:%d t:%f text_w:%d text_h:%d x:%d y:%d\n" ,
15421607 (int )s -> var_values [VAR_N ], s -> var_values [VAR_T ],
0 commit comments