Skip to content

Commit 2930c61

Browse files
cfsmp3claude
andcommitted
feat(mp4): Add VOBSUB subtitle extraction with OCR for MP4 files
Add support for extracting VOBSUB (bitmap) subtitles from MP4 files and converting them to text formats via OCR. This complements the existing MKV VOBSUB support added in commit 1fccb78. Changes: - Add shared vobsub_decoder module for SPU parsing and OCR - Add process_vobsub_track() function in mp4.c for subp:MPEG tracks - Detect and count VOBSUB tracks in MP4 container - Extract palette from decoder config when available - Process SPU samples through OCR pipeline The VOBSUB decoder module provides: - SPU control sequence parsing (timing, colors, coordinates) - RLE-encoded bitmap decoding (interlaced format) - Palette parsing from idx header format - Integration with Tesseract OCR via ocr_rect() Tested with sample from issue #1349 - successfully extracted 61 subtitles from 128 SPU samples with accurate OCR text output. Fixes #1349 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 9d14766 commit 2930c61

File tree

4 files changed

+868
-7
lines changed

4 files changed

+868
-7
lines changed

src/lib_ccx/matroska.c

Lines changed: 127 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include <limits.h>
77
#include <assert.h>
88
#include "dvb_subtitle_decoder.h"
9+
#include "vobsub_decoder.h"
910

1011
void skip_bytes(FILE *file, ULLONG n)
1112
{
@@ -1426,6 +1427,112 @@ static void generate_vobsub_timestamp(char *buf, size_t bufsize, ULLONG millisec
14261427
hours, minutes, seconds, ms);
14271428
}
14281429

1430+
/* Check if output format is text-based (requires OCR for bitmap subtitles) */
1431+
static int is_text_output_format(enum ccx_output_format format)
1432+
{
1433+
return (format == CCX_OF_SRT || format == CCX_OF_SSA ||
1434+
format == CCX_OF_WEBVTT || format == CCX_OF_TRANSCRIPT ||
1435+
format == CCX_OF_SAMI || format == CCX_OF_SMPTETT);
1436+
}
1437+
1438+
/* VOBSUB support: Process VOBSUB track with OCR and output text format */
1439+
static void process_vobsub_track_ocr(struct matroska_ctx *mkv_ctx, struct matroska_sub_track *track)
1440+
{
1441+
if (track->sentence_count == 0)
1442+
{
1443+
mprint("\nNo VOBSUB subtitles to process");
1444+
return;
1445+
}
1446+
1447+
/* Check if OCR is available */
1448+
if (!vobsub_ocr_available())
1449+
{
1450+
fatal(EXIT_NOT_CLASSIFIED,
1451+
"VOBSUB to text conversion requires OCR support.\n"
1452+
"Please rebuild CCExtractor with -DWITH_OCR=ON or use raw output (--out=idx)");
1453+
}
1454+
1455+
/* Initialize VOBSUB decoder */
1456+
struct vobsub_ctx *vob_ctx = init_vobsub_decoder();
1457+
if (!vob_ctx)
1458+
{
1459+
fatal(EXIT_NOT_CLASSIFIED,
1460+
"VOBSUB to text conversion requires OCR, but initialization failed.\n"
1461+
"Please ensure Tesseract is installed with language data.");
1462+
}
1463+
1464+
/* Parse palette from track header (CodecPrivate) */
1465+
if (track->header)
1466+
{
1467+
vobsub_parse_palette(vob_ctx, track->header);
1468+
}
1469+
1470+
mprint("\nProcessing VOBSUB track with OCR (%d subtitles)", track->sentence_count);
1471+
1472+
/* Get encoder context for output */
1473+
struct encoder_ctx *enc_ctx = update_encoder_list(mkv_ctx->ctx);
1474+
1475+
/* Process each subtitle */
1476+
for (int i = 0; i < track->sentence_count; i++)
1477+
{
1478+
struct matroska_sub_sentence *sentence = track->sentences[i];
1479+
mkv_ctx->sentence_count++;
1480+
1481+
/* Calculate end time (use next subtitle start if not specified) */
1482+
ULLONG end_time = sentence->time_end;
1483+
if (end_time == 0 && i + 1 < track->sentence_count)
1484+
{
1485+
end_time = track->sentences[i + 1]->time_start - 1;
1486+
}
1487+
else if (end_time == 0)
1488+
{
1489+
end_time = sentence->time_start + 5000; /* Default 5 second duration */
1490+
}
1491+
1492+
/* Decode SPU and run OCR */
1493+
struct cc_subtitle sub;
1494+
memset(&sub, 0, sizeof(sub));
1495+
1496+
int ret = vobsub_decode_spu(vob_ctx,
1497+
(unsigned char *)sentence->text,
1498+
sentence->text_size,
1499+
sentence->time_start,
1500+
end_time,
1501+
&sub);
1502+
1503+
if (ret == 0 && sub.got_output)
1504+
{
1505+
/* Encode the subtitle to output format */
1506+
encode_sub(enc_ctx, &sub);
1507+
1508+
/* Free subtitle data */
1509+
if (sub.data)
1510+
{
1511+
struct cc_bitmap *rect = (struct cc_bitmap *)sub.data;
1512+
for (int j = 0; j < sub.nb_data; j++)
1513+
{
1514+
if (rect[j].data0)
1515+
free(rect[j].data0);
1516+
if (rect[j].data1)
1517+
free(rect[j].data1);
1518+
if (rect[j].ocr_text)
1519+
free(rect[j].ocr_text);
1520+
}
1521+
free(sub.data);
1522+
}
1523+
}
1524+
1525+
/* Progress indicator */
1526+
if ((i + 1) % 50 == 0 || i + 1 == track->sentence_count)
1527+
{
1528+
mprint("\rProcessing VOBSUB: %d/%d subtitles", i + 1, track->sentence_count);
1529+
}
1530+
}
1531+
1532+
delete_vobsub_decoder(&vob_ctx);
1533+
mprint("\nVOBSUB OCR processing complete");
1534+
}
1535+
14291536
/* VOBSUB support: Save VOBSUB track to .idx and .sub files */
14301537
#define VOBSUB_BLOCK_SIZE 2048
14311538
static void save_vobsub_track(struct matroska_ctx *mkv_ctx, struct matroska_sub_track *track)
@@ -1564,10 +1671,21 @@ void save_sub_track(struct matroska_ctx *mkv_ctx, struct matroska_sub_track *tra
15641671
char *filename;
15651672
int desc;
15661673

1567-
// VOBSUB tracks need special handling - separate .idx and .sub files
1674+
// VOBSUB tracks need special handling
15681675
if (track->codec_id == MATROSKA_TRACK_SUBTITLE_CODEC_ID_VOBSUB)
15691676
{
1570-
save_vobsub_track(mkv_ctx, track);
1677+
// Check if user wants text output (SRT, SSA, WebVTT, etc.)
1678+
if (ccx_options.write_format_rewritten &&
1679+
is_text_output_format(ccx_options.enc_cfg.write_format))
1680+
{
1681+
// Use OCR to convert VOBSUB to text
1682+
process_vobsub_track_ocr(mkv_ctx, track);
1683+
}
1684+
else
1685+
{
1686+
// Output raw idx/sub files
1687+
save_vobsub_track(mkv_ctx, track);
1688+
}
15711689
return;
15721690
}
15731691

@@ -1846,8 +1964,13 @@ int matroska_loop(struct lib_ccx_ctx *ctx)
18461964
{
18471965
if (ccx_options.write_format_rewritten)
18481966
{
1849-
mprint(MATROSKA_WARNING "You are using --out=<format>, but Matroska parser extract subtitles in a recorded format\n");
1850-
mprint("--out=<format> will be ignored\n");
1967+
/* Note: For VOBSUB tracks, text output formats (SRT, SSA, etc.) are
1968+
* supported via OCR. For other subtitle types, the native format is used. */
1969+
if (!is_text_output_format(ccx_options.enc_cfg.write_format))
1970+
{
1971+
mprint(MATROSKA_WARNING "You are using --out=<format>, but Matroska parser extracts subtitles in their recorded format\n");
1972+
mprint("--out=<format> will be ignored for non-VOBSUB tracks\n");
1973+
}
18511974
}
18521975

18531976
// Don't need generated input file

src/lib_ccx/mp4.c

Lines changed: 171 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include "ccx_mp4.h"
1313
#include "activity.h"
1414
#include "ccx_dtvcc.h"
15+
#include "vobsub_decoder.h"
1516

1617
#define MEDIA_TYPE(type, subtype) (((u64)(type) << 32) + (subtype))
1718

@@ -25,6 +26,11 @@
2526
#define GF_ISOM_SUBTYPE_HVC1 GF_4CC('h', 'v', 'c', '1')
2627
#endif
2728

29+
// VOBSUB subtype (mp4s or MPEG)
30+
#ifndef GF_ISOM_SUBTYPE_MPEG4
31+
#define GF_ISOM_SUBTYPE_MPEG4 GF_4CC('M', 'P', 'E', 'G')
32+
#endif
33+
2834
static short bswap16(short v)
2935
{
3036
return ((v >> 8) & 0x00FF) | ((v << 8) & 0xFF00);
@@ -410,6 +416,142 @@ static int process_hevc_track(struct lib_ccx_ctx *ctx, const char *basename, GF_
410416
return status;
411417
}
412418

419+
static int process_vobsub_track(struct lib_ccx_ctx *ctx, GF_ISOFile *f, u32 track, struct cc_subtitle *sub)
420+
{
421+
u32 timescale, i, sample_count;
422+
int status = 0;
423+
struct lib_cc_decode *dec_ctx = NULL;
424+
struct encoder_ctx *enc_ctx = NULL;
425+
struct vobsub_ctx *vob_ctx = NULL;
426+
427+
dec_ctx = update_decoder_list(ctx);
428+
enc_ctx = update_encoder_list(ctx);
429+
430+
if ((sample_count = gf_isom_get_sample_count(f, track)) < 1)
431+
{
432+
return 0;
433+
}
434+
435+
timescale = gf_isom_get_media_timescale(f, track);
436+
437+
/* Check if OCR is available */
438+
if (!vobsub_ocr_available())
439+
{
440+
fatal(EXIT_NOT_CLASSIFIED,
441+
"VOBSUB to text conversion requires OCR support.\n"
442+
"Please rebuild CCExtractor with -DWITH_OCR=ON");
443+
}
444+
445+
/* Initialize VOBSUB decoder */
446+
vob_ctx = init_vobsub_decoder();
447+
if (!vob_ctx)
448+
{
449+
fatal(EXIT_NOT_CLASSIFIED,
450+
"VOBSUB decoder initialization failed.\n"
451+
"Please ensure Tesseract is installed with language data.");
452+
}
453+
454+
/* Try to get decoder config for palette info */
455+
GF_GenericSampleDescription *gdesc = gf_isom_get_generic_sample_description(f, track, 1);
456+
if (gdesc && gdesc->extension_buf && gdesc->extension_buf_size > 0)
457+
{
458+
/* The extension buffer may contain an idx-like header with palette */
459+
char *header = malloc(gdesc->extension_buf_size + 1);
460+
if (header)
461+
{
462+
memcpy(header, gdesc->extension_buf, gdesc->extension_buf_size);
463+
header[gdesc->extension_buf_size] = '\0';
464+
vobsub_parse_palette(vob_ctx, header);
465+
free(header);
466+
}
467+
}
468+
if (gdesc)
469+
free(gdesc);
470+
471+
mprint("Processing VOBSUB track (%u samples)\n", sample_count);
472+
473+
for (i = 0; i < sample_count; i++)
474+
{
475+
u32 sdi;
476+
GF_ISOSample *s = gf_isom_get_sample(f, track, i + 1, &sdi);
477+
478+
if (s != NULL)
479+
{
480+
s32 signed_cts = (s32)s->CTS_Offset;
481+
LLONG start_time_ms = (LLONG)((s->DTS + signed_cts) * 1000) / timescale;
482+
483+
/* Calculate end time from next sample if available */
484+
LLONG end_time_ms = 0;
485+
if (i + 1 < sample_count)
486+
{
487+
u32 next_sdi;
488+
GF_ISOSample *next_s = gf_isom_get_sample(f, track, i + 2, &next_sdi);
489+
if (next_s)
490+
{
491+
s32 next_signed_cts = (s32)next_s->CTS_Offset;
492+
end_time_ms = (LLONG)((next_s->DTS + next_signed_cts) * 1000) / timescale;
493+
gf_isom_sample_del(&next_s);
494+
}
495+
}
496+
if (end_time_ms == 0)
497+
end_time_ms = start_time_ms + 5000; /* Default 5 second duration */
498+
499+
set_current_pts(dec_ctx->timing, (s->DTS + signed_cts) * MPEG_CLOCK_FREQ / timescale);
500+
set_fts(dec_ctx->timing);
501+
502+
/* Decode SPU and run OCR */
503+
struct cc_subtitle vob_sub;
504+
memset(&vob_sub, 0, sizeof(vob_sub));
505+
506+
int ret = vobsub_decode_spu(vob_ctx,
507+
(unsigned char *)s->data, s->dataLength,
508+
start_time_ms, end_time_ms,
509+
&vob_sub);
510+
511+
if (ret == 0 && vob_sub.got_output)
512+
{
513+
/* Encode the subtitle to output format */
514+
encode_sub(enc_ctx, &vob_sub);
515+
sub->got_output = 1;
516+
517+
/* Free subtitle data */
518+
if (vob_sub.data)
519+
{
520+
struct cc_bitmap *rect = (struct cc_bitmap *)vob_sub.data;
521+
for (int j = 0; j < vob_sub.nb_data; j++)
522+
{
523+
if (rect[j].data0)
524+
free(rect[j].data0);
525+
if (rect[j].data1)
526+
free(rect[j].data1);
527+
if (rect[j].ocr_text)
528+
free(rect[j].ocr_text);
529+
}
530+
free(vob_sub.data);
531+
}
532+
}
533+
534+
gf_isom_sample_del(&s);
535+
}
536+
537+
int progress = (int)((i * 100) / sample_count);
538+
if (ctx->last_reported_progress != progress)
539+
{
540+
int cur_sec = (int)(get_fts(dec_ctx->timing, dec_ctx->current_field) / 1000);
541+
activity_progress(progress, cur_sec / 60, cur_sec % 60);
542+
ctx->last_reported_progress = progress;
543+
}
544+
}
545+
546+
int cur_sec = (int)(get_fts(dec_ctx->timing, dec_ctx->current_field) / 1000);
547+
activity_progress(100, cur_sec / 60, cur_sec % 60);
548+
549+
delete_vobsub_decoder(&vob_ctx);
550+
mprint("VOBSUB processing complete\n");
551+
552+
return status;
553+
}
554+
413555
static char *format_duration(u64 dur, u32 timescale, char *szDur, size_t szDur_size)
414556
{
415557
u32 h, m, s, ms;
@@ -764,6 +906,7 @@ int processmp4(struct lib_ccx_ctx *ctx, struct ccx_s_mp4Cfg *cfg, char *file)
764906
avc_track_count = 0;
765907
hevc_track_count = 0;
766908
cc_track_count = 0;
909+
u32 vobsub_track_count = 0;
767910

768911
for (i = 0; i < track_count; i++)
769912
{
@@ -779,9 +922,11 @@ int processmp4(struct lib_ccx_ctx *ctx, struct ccx_s_mp4Cfg *cfg, char *file)
779922
avc_track_count++;
780923
if (type == GF_ISOM_MEDIA_VISUAL && (subtype == GF_ISOM_SUBTYPE_HEV1 || subtype == GF_ISOM_SUBTYPE_HVC1))
781924
hevc_track_count++;
925+
if (type == GF_ISOM_MEDIA_SUBPIC && subtype == GF_ISOM_SUBTYPE_MPEG4)
926+
vobsub_track_count++;
782927
}
783928

784-
mprint("MP4: found %u tracks: %u avc, %u hevc and %u cc\n", track_count, avc_track_count, hevc_track_count, cc_track_count);
929+
mprint("MP4: found %u tracks: %u avc, %u hevc, %u cc, %u vobsub\n", track_count, avc_track_count, hevc_track_count, cc_track_count, vobsub_track_count);
785930

786931
for (i = 0; i < track_count; i++)
787932
{
@@ -899,6 +1044,24 @@ int processmp4(struct lib_ccx_ctx *ctx, struct ccx_s_mp4Cfg *cfg, char *file)
8991044
}
9001045
break;
9011046

1047+
case MEDIA_TYPE(GF_ISOM_MEDIA_SUBPIC, GF_ISOM_SUBTYPE_MPEG4): // subp:MPEG (VOBSUB)
1048+
// If there are multiple VOBSUB tracks, change fd for different tracks
1049+
if (vobsub_track_count > 1)
1050+
{
1051+
switch_output_file(ctx, enc_ctx, i);
1052+
}
1053+
if (process_vobsub_track(ctx, f, i + 1, &dec_sub) != 0)
1054+
{
1055+
mprint("Error on process_vobsub_track()\n");
1056+
free(dec_ctx->xds_ctx);
1057+
return -3;
1058+
}
1059+
if (dec_sub.got_output)
1060+
{
1061+
mp4_ret = 1;
1062+
}
1063+
break;
1064+
9021065
default:
9031066
if (type != GF_ISOM_MEDIA_CLOSED_CAPTION && type != GF_ISOM_MEDIA_SUBT && type != GF_ISOM_MEDIA_TEXT)
9041067
break; // ignore non cc track
@@ -1038,9 +1201,14 @@ int processmp4(struct lib_ccx_ctx *ctx, struct ccx_s_mp4Cfg *cfg, char *file)
10381201
mprint("Found no HEVC track(s). ");
10391202

10401203
if (cc_track_count)
1041-
mprint("Found %d CC track(s).\n", cc_track_count);
1204+
mprint("Found %d CC track(s). ", cc_track_count);
1205+
else
1206+
mprint("Found no dedicated CC track(s). ");
1207+
1208+
if (vobsub_track_count)
1209+
mprint("Found %d VOBSUB track(s).\n", vobsub_track_count);
10421210
else
1043-
mprint("Found no dedicated CC track(s).\n");
1211+
mprint("\n");
10441212

10451213
ctx->freport.mp4_cc_track_cnt = cc_track_count;
10461214

0 commit comments

Comments
 (0)