video: merge the tiles of grid heif and avif images

guidocella · guidocella · commit b2ecddee82c6 · 2026-04-17T20:48:52.000+02:00
Fixes #13585. Fixes #16486. Switching between different grid images added with --external-files is also supported. The priority of independent tracks is reverted for images, because we now want to select a dependent track to trigger the merging, rather than small preview tracks.
diff --git a/demux/demux_lavf.c b/demux/demux_lavf.c
@@ -900,6 +900,89 @@ static void add_new_streams(demuxer_t *demuxer)
         handle_new_stream(demuxer, priv->num_streams);
 }
 
+// Build an mp_tile_grid for every tile grid stream group and attach it to the
+// group's video streams. Groups with an invalid tile are ignored entirely.
+static void handle_tile_grid_groups(demuxer_t *demuxer)
+{
+#if LIBAVFORMAT_VERSION_INT >= AV_VERSION_INT(61, 1, 100)
+    lavf_priv_t *priv = demuxer->priv;
+    AVFormatContext *avfc = priv->avfc;
+
+    for (int g = 0; g < avfc->nb_stream_groups; g++) {
+        AVStreamGroup *stream_group = avfc->stream_groups[g];
+        if (stream_group->type != AV_STREAM_GROUP_PARAMS_TILE_GRID)
+            continue;
+
+        const AVStreamGroupTileGrid *av_grid = stream_group->params.tile_grid;
+        if (!av_grid || av_grid->nb_tiles == 0)
+            continue;
+
+        bool valid = true;
+        for (int i = 0; i < av_grid->nb_tiles && valid; i++) {
+            unsigned int group_idx = av_grid->offsets[i].idx;
+            int x = av_grid->offsets[i].horizontal;
+            int y = av_grid->offsets[i].vertical;
+            if (x < 0 || x >= av_grid->coded_width ||
+                y < 0 || y >= av_grid->coded_height)
+            {
+                MP_WARN(demuxer, "Tile grid offsets exceed coded canvas (%dx%d) - "
+                        "ignoring tile grid.\n",
+                        av_grid->coded_width, av_grid->coded_height);
+                valid = false;
+            } else if (group_idx >= stream_group->nb_streams) {
+                MP_WARN(demuxer, "Tile %d references out-of-range group stream "
+                        "index %u (group has %u streams) - ignoring tile grid.\n",
+                        i, group_idx, stream_group->nb_streams);
+                valid = false;
+            }
+        }
+        if (!valid)
+            continue;
+
+        struct mp_tile_grid *mp_grid = talloc_zero(demuxer, struct mp_tile_grid);
+        mp_grid->nb_tiles          = av_grid->nb_tiles;
+        mp_grid->width             = av_grid->width;
+        mp_grid->height            = av_grid->height;
+        mp_grid->coded_width       = av_grid->coded_width;
+        mp_grid->coded_height      = av_grid->coded_height;
+        mp_grid->horizontal_offset = av_grid->horizontal_offset;
+        mp_grid->vertical_offset   = av_grid->vertical_offset;
+        memcpy(mp_grid->background, av_grid->background, 4);
+
+        mp_grid->tiles = talloc_array(mp_grid, struct mp_tile_grid_entry,
+                                      av_grid->nb_tiles);
+
+        for (int i = 0; i < av_grid->nb_tiles; i++) {
+            int ff_idx = stream_group->streams[av_grid->offsets[i].idx]->index;
+            mp_grid->tiles[i].ff_index   = ff_idx;
+            mp_grid->tiles[i].horizontal = av_grid->offsets[i].horizontal;
+            mp_grid->tiles[i].vertical   = av_grid->offsets[i].vertical;
+
+            if (ff_idx >= 0 && ff_idx < priv->num_streams &&
+                priv->streams[ff_idx])
+            {
+                struct sh_stream *sh = priv->streams[ff_idx]->sh;
+                if (sh && sh->type == STREAM_VIDEO) {
+                    sh->tile_grid = mp_grid;
+                } else {
+                    MP_WARN(demuxer, "Tile %d stream %d is not a video "
+                            "stream – ignoring tile grid for it.\n",
+                            i, ff_idx);
+                }
+            }
+        }
+
+        MP_VERBOSE(demuxer,
+                   "Stream group %d: tile grid %d tile(s), "
+                   "display %dx%d, coded %dx%d, offset (%d,%d).\n",
+                   g, mp_grid->nb_tiles,
+                   mp_grid->width, mp_grid->height,
+                   mp_grid->coded_width, mp_grid->coded_height,
+                   mp_grid->horizontal_offset, mp_grid->vertical_offset);
+    }
+#endif
+}
+
 static void update_metadata(demuxer_t *demuxer)
 {
     lavf_priv_t *priv = demuxer->priv;
@@ -1140,6 +1223,8 @@ static int demux_open_lavf(demuxer_t *demuxer, enum demux_check check)
 
     add_new_streams(demuxer);
 
+    handle_tile_grid_groups(demuxer);
+
     mp_tags_move_from_av_dictionary(demuxer->metadata, &avfc->metadata);
 
     demuxer->ts_resets_possible =
diff --git a/demux/stheader.h b/demux/stheader.h
@@ -65,6 +65,11 @@ struct sh_stream {
     // stream is a picture (such as album art)
     struct demux_packet *attached_picture;
 
+    // Metadata for tiled grid images.
+    // All streams belonging to the same group share the same mp_tile_grid
+    // object.
+    struct mp_tile_grid *tile_grid;
+
     // Internal to demux.c
     struct demux_stream *ds;
 };
@@ -142,4 +147,37 @@ struct mp_codec_params {
     double duration;
 };
 
+struct mp_tile_grid {
+    int nb_tiles; // number of entries in tiles[]
+
+    // Size of the displayed image, i.e. the coded canvas after cropping.
+    int width, height;
+
+    // Dimensions before cropping (union of all tile areas plus
+    // any alignment padding on the right/bottom edges).
+    int coded_width, coded_height;
+
+    // Top-left offset of the display rectangle within the coded canvas.
+    //   crop_right  = coded_width  - width  - horizontal_offset
+    //   crop_bottom = coded_height - height - vertical_offset
+    int horizontal_offset;
+    int vertical_offset;
+
+    // Per-tile placement info, array of length nb_tiles.
+    struct mp_tile_grid_entry *tiles;
+
+    // Background fill colour used outside tile boundaries (R,G,B,A bytes).
+    uint8_t background[4];
+};
+
+// Describes one tile's source stream and position within a tiled grid image.
+struct mp_tile_grid_entry {
+    // Global AVFormatContext stream index (AVStream.index).
+    // Used to find the matching track (compared against track->ff_index).
+    int ff_index;
+    // Top-left pixel position of this tile on the coded canvas (pre-crop).
+    int horizontal;
+    int vertical;
+};
+
 #endif /* MPLAYER_STHEADER_H */
diff --git a/player/loadfile.c b/player/loadfile.c
@@ -504,7 +504,7 @@ static bool compare_track(struct track *t1, struct track *t2, char **langs, bool
     if (t1->image != t2->image)
         return !t1->image;
     if (t1->dependent_track != t2->dependent_track)
-        return !t1->dependent_track;
+        return t1->image ? t1->dependent_track : !t1->dependent_track;
     if (t1->stream && t2->stream && opts->hls_bitrate >= 0 &&
         t1->stream->hls_bitrate != t2->stream->hls_bitrate)
     {
@@ -706,7 +706,7 @@ void mp_switch_track_n(struct MPContext *mpctx, int order, enum stream_type type
     if (track == current)
         return;
 
-    if (current && current->sink) {
+    if (current && current->sink && !current->stream->tile_grid) {
         MP_ERR(mpctx, "Can't disable input to complex filter.\n");
         goto error;
     }
diff --git a/player/video.c b/player/video.c
@@ -39,6 +39,7 @@
 #include "sub/osd.h"
 #include "video/hwdec.h"
 #include "filters/f_decoder_wrapper.h"
+#include "filters/f_lavfi.h"
 #include "video/out/vo.h"
 
 #include "core.h"
@@ -155,10 +156,33 @@ static void vo_chain_uninit(struct vo_chain *vo_c)
     // this does not free the VO
 }
 
+// Disconnect and deselect every track that shares the primary's tile grid.
+static void uninit_grid(struct MPContext *mpctx)
+{
+    struct track *main_track = mpctx->vo_chain->track;
+    if (!main_track || !main_track->stream)
+        return;
+    struct mp_tile_grid *tg = main_track->stream->tile_grid;
+    for (int i = 0; tg && i < mpctx->num_tracks; i++) {
+        struct track *t = mpctx->tracks[i];
+        if (t->stream && t->stream->tile_grid == tg) {
+            if (t->sink)
+                mp_pin_disconnect(t->sink);
+            t->sink = NULL;
+            // The primary keeps its decoder pointer (presumably for the caller).
+            if (t != main_track)
+                t->dec = NULL;
+            t->selected = false;
+            reselect_demux_stream(mpctx, t, false);
+        }
+    }
+}
+
 void uninit_video_chain(struct MPContext *mpctx)
 {
     if (mpctx->vo_chain) {
         reset_video_state(mpctx);
+        uninit_grid(mpctx);
         vo_chain_uninit(mpctx->vo_chain);
         mpctx->vo_chain = NULL;
 
@@ -201,14 +225,151 @@ int init_video_decoder(struct MPContext *mpctx, struct track *track)
     return 0;
 }
 
+// Compose the lavfi graph: xstack the [inN] pads at the tile offsets, crop
+// to the display rectangle if it differs, output to [vo]. Allocated under ctx.
+static char *tile_grid_graph(void *ctx, const struct mp_tile_grid *grid)
+{
+    bstr graph = {0};
+    const uint8_t *bg = grid->background;
+
+    for (int n = 0; n < grid->nb_tiles; n++)
+        bstr_xappend_asprintf(ctx, &graph, "[in%d]", n);
+    bstr_xappend_asprintf(ctx, &graph, "xstack=inputs=%d:layout=", grid->nb_tiles);
+
+    for (int n = 0; n < grid->nb_tiles; n++) {
+        const struct mp_tile_grid_entry *tile = &grid->tiles[n];
+        if (n != 0)
+            bstr_xappend(ctx, &graph, bstr0("|"));
+        bstr_xappend_asprintf(ctx, &graph, "%d_%d", tile->horizontal, tile->vertical);
+    }
+    bstr_xappend_asprintf(ctx, &graph, ":fill=0x%02X%02X%02X@0x%02X",
+                          bg[0], bg[1], bg[2], bg[3]);
+
+    if (grid->width != grid->coded_width || grid->height != grid->coded_height) {
+        bstr_xappend_asprintf(ctx, &graph, ",crop=w=%d:h=%d:x=%d:y=%d", grid->width,
+                              grid->height, grid->horizontal_offset, grid->vertical_offset);
+    }
+    bstr_xappend(ctx, &graph, bstr0("[vo]"));
+    return graph.start;
+}
+
+// Find the track that provides tile tile_idx of tg, or NULL if none matches.
+static struct track *find_tile_track(struct MPContext *mpctx,
+                                     const struct mp_tile_grid *tg, int tile_idx)
+{
+    for (int i = 0; i < mpctx->num_tracks; i++) {
+        struct track *cand = mpctx->tracks[i];
+        bool in_grid = cand->stream && cand->stream->tile_grid == tg;
+        if (in_grid && cand->ff_index == tg->tiles[tile_idx].ff_index)
+            return cand;
+    }
+    return NULL;
+}
+
+// Video chain for grid images: decode every tile and merge them with xstack.
+static void reinit_video_chain_tiled(struct MPContext *mpctx, struct track *track)
+{
+    struct mp_tile_grid *grid = track->stream->tile_grid;
+    mp_assert(grid);
+    for (int i = 0; i < grid->nb_tiles; i++) {
+        struct track *t = find_tile_track(mpctx, grid, i);
+        if (t) {
+            t->selected = true;
+            reselect_demux_stream(mpctx, t, false);
+        }
+    }
+
+    reinit_video_chain_src(mpctx, NULL);
+    if (!mpctx->vo_chain)
+        goto err_out; // don't leave the tile tracks selected without a chain
+    struct vo_chain *vo_c = mpctx->vo_chain;
+    char *graph_str = tile_grid_graph(NULL, grid);
+    MP_VERBOSE(mpctx, "Tile grid xstack graph: %s\n", graph_str);
+    struct mp_lavfi *lavfi =
+        mp_lavfi_create_graph(vo_c->filter->f, 0, false, NULL, NULL, graph_str);
+    talloc_free(graph_str);
+    if (!lavfi) {
+        MP_ERR(mpctx, "Failed to create tile grid filtergraph.\n");
+        goto err_out;
+    }
+    struct mp_pin *out_pad = mp_filter_get_named_pin(lavfi->f, "vo");
+    if (!out_pad || mp_pin_get_dir(out_pad) != MP_PIN_OUT) {
+        MP_ERR(mpctx, "Tile grid filtergraph missing output pin 'vo'.\n");
+        goto err_out;
+    }
+    vo_c->filter_src = out_pad;
+    mp_pin_connect(vo_c->filter->f->pins[0], vo_c->filter_src);
+
+    for (int i = 0; i < grid->nb_tiles; i++) {
+        struct track *tile_track = find_tile_track(mpctx, grid, i);
+        if (!tile_track) {
+            MP_ERR(mpctx, "No track found for tile %d (ff_index %d).\n",
+                   i, grid->tiles[i].ff_index);
+            goto err_out;
+        }
+        tile_track->vo_c = vo_c;
+        bool result = init_video_decoder(mpctx, tile_track);
+        // vo_chain_uninit() only unsets vo_c on the primary track.
+        tile_track->vo_c = NULL;
+        if (!result)
+            goto err_out;
+        char label[16];
+        snprintf(label, sizeof(label), "in%d", i);
+        struct mp_pin *in_pad = mp_filter_get_named_pin(lavfi->f, label);
+        if (!in_pad || mp_pin_get_dir(in_pad) != MP_PIN_IN) {
+            MP_ERR(mpctx, "Tile grid filtergraph missing input pin '%s'.\n", label);
+            goto err_out;
+        }
+        tile_track->sink = in_pad;
+        mp_pin_connect(tile_track->sink, tile_track->dec->f->pins[0]);
+    }
+
+    struct track *primary = find_tile_track(mpctx, grid, 0);
+    vo_c->track = primary;
+    primary->vo_c = vo_c;
+    vo_c->filter->container_fps = mp_decoder_wrapper_get_container_fps(primary->dec);
+    vo_c->is_coverart = !!primary->attached_picture;
+    vo_c->is_sparse = primary->stream->still_image || vo_c->is_coverart;
+    if (vo_c->is_coverart)
+        mp_decoder_wrapper_set_coverart_flag(primary->dec, true);
+    MP_VERBOSE(mpctx, "Tile grid: assembling %d tile(s) into %dx%d image.\n",
+               grid->nb_tiles, grid->width, grid->height);
+    return;
+
+err_out:
+    // vo_c->track was not assigned yet, so uninit_video_chain() would skip the
+    // tile tracks; drop their references into the chain before it goes away.
+    for (int n = 0; n < mpctx->num_tracks; n++) {
+        struct track *t = mpctx->tracks[n];
+        if (!t->stream || t->stream->tile_grid != grid)
+            continue;
+        if (t->sink && mpctx->vo_chain)
+            mp_pin_disconnect(t->sink);
+        t->sink = NULL;
+        t->dec = NULL;
+        if (t != track) {
+            t->selected = false;
+            reselect_demux_stream(mpctx, t, false);
+        }
+    }
+    uninit_video_chain(mpctx);
+    error_on_track(mpctx, track);
+    handle_force_window(mpctx, true);
+}
+
 void reinit_video_chain(struct MPContext *mpctx)
 {
     struct track *track = mpctx->current_track[0][STREAM_VIDEO];
     if (!track || !track->stream) {
         error_on_track(mpctx, track);
         return;
     }
-    reinit_video_chain_src(mpctx, track);
+    // Streams carrying tile grid metadata take the tiled setup path.
+    if (track->stream->tile_grid) {
+        reinit_video_chain_tiled(mpctx, track);
+    } else {
+        reinit_video_chain_src(mpctx, track);
+    }
 }
 
 static void filter_update_subtitles(void *ctx, double pts)