Skip to content

Commit 1fccb78

Browse files
cfsmp3 and claude committed
feat(matroska): Add VOBSUB subtitle extraction support for MKV files
Previously, CCExtractor only printed "Error: VOBSUB not supported" when it encountered VOBSUB (S_VOBSUB) subtitle tracks in Matroska files, which left users without any usable output.

This commit adds full VOBSUB extraction support:
- Generate proper .idx index files with timestamps and file positions
- Generate proper .sub files with PS-wrapped SPU data
- Correct PS Pack header with SCR derived from timestamps
- Correct PES header with PTS for each subtitle
- 2048-byte block alignment (standard VOBSUB format)

The output is compatible with VLC, FFmpeg, and other players that support the VobSub subtitle format.

Tested with the sample from issue #1371: the output validates correctly with FFprobe and produces subtitle data identical to mkvextract's.

Fixes #1371

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <[email protected]>
1 parent ec30a79 commit 1fccb78

File tree

2 files changed

+239
-9
lines changed

2 files changed

+239
-9
lines changed

src/lib_ccx/matroska.c

Lines changed: 234 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1334,11 +1334,245 @@ char *ass_ssa_sentence_erase_read_order(char *text)
13341334
return buf;
13351335
}
13361336

1337+
/* VOBSUB support: Generate PS Pack header
1338+
* The PS Pack header is 14 bytes:
1339+
* - 4 bytes: start code (00 00 01 ba)
1340+
* - 6 bytes: SCR (System Clock Reference) in MPEG-2 format
1341+
* - 3 bytes: mux rate
1342+
* - 1 byte: stuffing length (0)
1343+
*/
1344+
static void generate_ps_pack_header(unsigned char *buf, ULLONG pts_90khz)
1345+
{
1346+
// PS Pack start code
1347+
buf[0] = 0x00;
1348+
buf[1] = 0x00;
1349+
buf[2] = 0x01;
1350+
buf[3] = 0xBA;
1351+
1352+
// SCR (System Clock Reference) - use PTS as SCR base, SCR extension = 0
1353+
// MPEG-2 format: 01 SCR[32:30] 1 SCR[29:15] 1 SCR[14:0] 1 SCR_ext[8:0] 1
1354+
ULLONG scr = pts_90khz;
1355+
ULLONG scr_base = scr;
1356+
int scr_ext = 0;
1357+
1358+
buf[4] = 0x44 | ((scr_base >> 27) & 0x38) | ((scr_base >> 28) & 0x03);
1359+
buf[5] = (scr_base >> 20) & 0xFF;
1360+
buf[6] = 0x04 | ((scr_base >> 12) & 0xF8) | ((scr_base >> 13) & 0x03);
1361+
buf[7] = (scr_base >> 5) & 0xFF;
1362+
buf[8] = 0x04 | ((scr_base << 3) & 0xF8) | ((scr_ext >> 7) & 0x03);
1363+
buf[9] = ((scr_ext << 1) & 0xFE) | 0x01;
1364+
1365+
// Mux rate (10080 = standard DVD rate)
1366+
int mux_rate = 10080;
1367+
buf[10] = (mux_rate >> 14) & 0xFF;
1368+
buf[11] = (mux_rate >> 6) & 0xFF;
1369+
buf[12] = ((mux_rate << 2) & 0xFC) | 0x03;
1370+
1371+
// Stuffing length = 0, with marker bits
1372+
buf[13] = 0xF8;
1373+
}
1374+
1375+
/* VOBSUB support: Generate PES header for private stream 1
1376+
* Returns the total header size (variable based on PTS)
1377+
*/
1378+
static int generate_pes_header(unsigned char *buf, ULLONG pts_90khz, int payload_size, int stream_id)
1379+
{
1380+
// PES start code for private stream 1
1381+
buf[0] = 0x00;
1382+
buf[1] = 0x00;
1383+
buf[2] = 0x01;
1384+
buf[3] = 0xBD; // Private stream 1
1385+
1386+
// PES packet length = header data (3 + 5 for PTS) + 1 (substream ID) + payload
1387+
int pes_header_data_len = 5; // PTS only
1388+
int pes_packet_len = 3 + pes_header_data_len + 1 + payload_size;
1389+
buf[4] = (pes_packet_len >> 8) & 0xFF;
1390+
buf[5] = pes_packet_len & 0xFF;
1391+
1392+
// PES flags: MPEG-2, original
1393+
buf[6] = 0x81;
1394+
// PTS_DTS_flags = 10 (PTS only)
1395+
buf[7] = 0x80;
1396+
// PES header data length
1397+
buf[8] = pes_header_data_len;
1398+
1399+
// PTS (5 bytes): '0010' | PTS[32:30] | '1' | PTS[29:15] | '1' | PTS[14:0] | '1'
1400+
buf[9] = 0x21 | ((pts_90khz >> 29) & 0x0E);
1401+
buf[10] = (pts_90khz >> 22) & 0xFF;
1402+
buf[11] = 0x01 | ((pts_90khz >> 14) & 0xFE);
1403+
buf[12] = (pts_90khz >> 7) & 0xFF;
1404+
buf[13] = 0x01 | ((pts_90khz << 1) & 0xFE);
1405+
1406+
// Substream ID (0x20 = first VOBSUB stream)
1407+
buf[14] = 0x20 + stream_id;
1408+
1409+
return 15; // Total PES header size
1410+
}
1411+
1412+
/* VOBSUB support: Generate timestamp string for .idx file
1413+
* Format: HH:MM:SS:mmm (where mmm is milliseconds)
1414+
*/
1415+
static void generate_vobsub_timestamp(char *buf, size_t bufsize, ULLONG milliseconds)
1416+
{
1417+
ULLONG ms = milliseconds % 1000;
1418+
milliseconds /= 1000;
1419+
ULLONG seconds = milliseconds % 60;
1420+
milliseconds /= 60;
1421+
ULLONG minutes = milliseconds % 60;
1422+
milliseconds /= 60;
1423+
ULLONG hours = milliseconds;
1424+
1425+
snprintf(buf, bufsize, "%02" LLU_M ":%02" LLU_M ":%02" LLU_M ":%03" LLU_M,
1426+
hours, minutes, seconds, ms);
1427+
}
1428+
1429+
/* VOBSUB support: Save VOBSUB track to .idx and .sub files */
1430+
static void save_vobsub_track(struct matroska_ctx *mkv_ctx, struct matroska_sub_track *track)
1431+
{
1432+
if (track->sentence_count == 0)
1433+
{
1434+
mprint("\nNo VOBSUB subtitles to write");
1435+
return;
1436+
}
1437+
1438+
// Generate base filename (without extension)
1439+
const char *lang_to_use = track->lang_ietf ? track->lang_ietf : track->lang;
1440+
const char *basename = get_basename(mkv_ctx->filename);
1441+
size_t needed = strlen(basename) + strlen(lang_to_use) + 32;
1442+
char *base_filename = malloc(needed);
1443+
if (base_filename == NULL)
1444+
fatal(EXIT_NOT_ENOUGH_MEMORY, "In save_vobsub_track: Out of memory.");
1445+
1446+
if (track->lang_index == 0)
1447+
snprintf(base_filename, needed, "%s_%s", basename, lang_to_use);
1448+
else
1449+
snprintf(base_filename, needed, "%s_%s_" LLD, basename, lang_to_use, track->lang_index);
1450+
1451+
// Create .sub filename
1452+
char *sub_filename = malloc(needed + 5);
1453+
if (sub_filename == NULL)
1454+
fatal(EXIT_NOT_ENOUGH_MEMORY, "In save_vobsub_track: Out of memory.");
1455+
snprintf(sub_filename, needed + 5, "%s.sub", base_filename);
1456+
1457+
// Create .idx filename
1458+
char *idx_filename = malloc(needed + 5);
1459+
if (idx_filename == NULL)
1460+
fatal(EXIT_NOT_ENOUGH_MEMORY, "In save_vobsub_track: Out of memory.");
1461+
snprintf(idx_filename, needed + 5, "%s.idx", base_filename);
1462+
1463+
mprint("\nOutput files: %s, %s", idx_filename, sub_filename);
1464+
1465+
// Open .sub file
1466+
int sub_desc;
1467+
#ifdef WIN32
1468+
sub_desc = open(sub_filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IREAD | S_IWRITE);
1469+
#else
1470+
sub_desc = open(sub_filename, O_WRONLY | O_CREAT | O_TRUNC, S_IWUSR | S_IRUSR);
1471+
#endif
1472+
if (sub_desc < 0)
1473+
{
1474+
mprint("\nError: Cannot create .sub file");
1475+
free(base_filename);
1476+
free(sub_filename);
1477+
free(idx_filename);
1478+
return;
1479+
}
1480+
1481+
// Open .idx file
1482+
int idx_desc;
1483+
#ifdef WIN32
1484+
idx_desc = open(idx_filename, O_WRONLY | O_CREAT | O_TRUNC, S_IREAD | S_IWRITE);
1485+
#else
1486+
idx_desc = open(idx_filename, O_WRONLY | O_CREAT | O_TRUNC, S_IWUSR | S_IRUSR);
1487+
#endif
1488+
if (idx_desc < 0)
1489+
{
1490+
mprint("\nError: Cannot create .idx file");
1491+
close(sub_desc);
1492+
free(base_filename);
1493+
free(sub_filename);
1494+
free(idx_filename);
1495+
return;
1496+
}
1497+
1498+
// Write .idx header (from CodecPrivate)
1499+
if (track->header != NULL)
1500+
write_wrapped(idx_desc, track->header, strlen(track->header));
1501+
1502+
// Add language identifier line
1503+
char lang_line[128];
1504+
snprintf(lang_line, sizeof(lang_line), "\nid: %s, index: 0\n", lang_to_use);
1505+
write_wrapped(idx_desc, lang_line, strlen(lang_line));
1506+
1507+
// Block size for alignment (2048 bytes = 0x800)
1508+
const int VOBSUB_BLOCK_SIZE = 2048;
1509+
1510+
// Buffer for PS/PES headers and padding
1511+
unsigned char header_buf[32];
1512+
unsigned char zero_buf[VOBSUB_BLOCK_SIZE];
1513+
memset(zero_buf, 0, VOBSUB_BLOCK_SIZE);
1514+
1515+
ULLONG file_pos = 0;
1516+
1517+
// Write each subtitle
1518+
for (int i = 0; i < track->sentence_count; i++)
1519+
{
1520+
struct matroska_sub_sentence *sentence = track->sentences[i];
1521+
mkv_ctx->sentence_count++;
1522+
1523+
// Convert timestamp to 90kHz PTS
1524+
ULLONG pts_90khz = sentence->time_start * 90;
1525+
1526+
// Write timestamp entry to .idx
1527+
char timestamp[32];
1528+
generate_vobsub_timestamp(timestamp, sizeof(timestamp), sentence->time_start);
1529+
char idx_entry[128];
1530+
snprintf(idx_entry, sizeof(idx_entry), "timestamp: %s, filepos: %09" LLX_M "\n",
1531+
timestamp, file_pos);
1532+
write_wrapped(idx_desc, idx_entry, strlen(idx_entry));
1533+
1534+
// Generate PS Pack header (14 bytes)
1535+
generate_ps_pack_header(header_buf, pts_90khz);
1536+
write_wrapped(sub_desc, (char *)header_buf, 14);
1537+
1538+
// Generate PES header (15 bytes)
1539+
int pes_header_len = generate_pes_header(header_buf, pts_90khz, sentence->text_size, 0);
1540+
write_wrapped(sub_desc, (char *)header_buf, pes_header_len);
1541+
1542+
// Write SPU data
1543+
write_wrapped(sub_desc, sentence->text, sentence->text_size);
1544+
1545+
// Calculate bytes written and pad to block boundary
1546+
ULLONG bytes_written = 14 + pes_header_len + sentence->text_size;
1547+
ULLONG padding_needed = VOBSUB_BLOCK_SIZE - (bytes_written % VOBSUB_BLOCK_SIZE);
1548+
if (padding_needed < VOBSUB_BLOCK_SIZE)
1549+
{
1550+
write_wrapped(sub_desc, (char *)zero_buf, padding_needed);
1551+
bytes_written += padding_needed;
1552+
}
1553+
1554+
file_pos += bytes_written;
1555+
}
1556+
1557+
close(sub_desc);
1558+
close(idx_desc);
1559+
free(base_filename);
1560+
free(sub_filename);
1561+
free(idx_filename);
1562+
}
1563+
13371564
void save_sub_track(struct matroska_ctx *mkv_ctx, struct matroska_sub_track *track)
13381565
{
13391566
char *filename;
13401567
int desc;
13411568

1569+
// VOBSUB tracks need special handling - separate .idx and .sub files
1570+
if (track->codec_id == MATROSKA_TRACK_SUBTITLE_CODEC_ID_VOBSUB)
1571+
{
1572+
save_vobsub_track(mkv_ctx, track);
1573+
return;
1574+
}
1575+
13421576
if (mkv_ctx->ctx->cc_to_stdout == CCX_TRUE)
13431577
{
13441578
desc = 1; // file descriptor of stdout
@@ -1358,11 +1592,6 @@ void save_sub_track(struct matroska_ctx *mkv_ctx, struct matroska_sub_track *tra
13581592
if (track->header != NULL)
13591593
write_wrapped(desc, track->header, strlen(track->header));
13601594

1361-
if (track->codec_id == MATROSKA_TRACK_SUBTITLE_CODEC_ID_VOBSUB)
1362-
{
1363-
mprint("\nError: VOBSUB not supported");
1364-
}
1365-
13661595
for (int i = 0; i < track->sentence_count; i++)
13671596
{
13681597
struct matroska_sub_sentence *sentence = track->sentences[i];
@@ -1497,10 +1726,6 @@ void save_sub_track(struct matroska_ctx *mkv_ctx, struct matroska_sub_track *tra
14971726
free(timestamp_start);
14981727
free(timestamp_end);
14991728
}
1500-
else if (track->codec_id == MATROSKA_TRACK_SUBTITLE_CODEC_ID_VOBSUB)
1501-
{
1502-
// TODO: Add support for VOBSUB
1503-
}
15041729
}
15051730
}
15061731

src/lib_ccx/matroska.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,26 +5,31 @@
55
/* printf-style conversion suffixes for wide integers, selected per platform.
 * The *_M macros (LLD_M, LLU_M, LLX_M) omit the leading '%' so they can be
 * appended to a caller-supplied "%<flags><width>" prefix; LLD and LLU are
 * complete specifiers that include the '%'. LLX_M is the hexadecimal variant
 * and has no '%'-prefixed counterpart in this block.
 * NOTE(review): the integer typedefs these specifiers are meant to match
 * (presumably ULLONG and its signed counterpart) are not visible here -
 * confirm that each branch agrees with their definitions. */
#if (defined(WIN32) || defined(_WIN32_WCE)) && (defined(__MINGW32__) || !defined(__GNUC__))
66
#define LLD_M "I64d"
77
#define LLU_M "I64u"
8+
#define LLX_M "I64x"
89
#define LLD "%I64d"
910
#define LLU "%I64u"
1011
#elif defined(__SYMBIAN32__)
1112
#define LLD_M "d"
1213
#define LLU_M "u"
14+
#define LLX_M "x"
1315
#define LLD "%d"
1416
#define LLU "%u"
1517
#elif defined(__DARWIN__) || defined(__APPLE__)
1618
#define LLD_M "lld"
1719
#define LLU_M "llu"
20+
#define LLX_M "llx"
1821
#define LLD "%lld"
1922
#define LLU "%llu"
2023
#elif defined(_LP64) /* Unix 64 bits */
2124
#define LLD_M "ld"
2225
#define LLU_M "lu"
26+
#define LLX_M "lx"
2327
#define LLD "%ld"
2428
#define LLU "%lu"
2529
#else /* Unix 32 bits */
2630
#define LLD_M "lld"
2731
#define LLU_M "llu"
32+
#define LLX_M "llx"
2833
#define LLD "%lld"
2934
#define LLU "%llu"
3035
#endif

0 commit comments

Comments
 (0)