Skip to content

Commit b57a623

Browse files
authored
Merge branch 'CCExtractor:master' into feature/split-dvb-subs
2 parents 6e5872b + fc4a14e commit b57a623

File tree

14 files changed

+431
-123
lines changed

14 files changed

+431
-123
lines changed

.github/workflows/build_appimage.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ jobs:
4242
4343
- name: Checkout repository
4444
if: steps.should_build.outputs.should_build == 'true'
45-
uses: actions/checkout@v4
45+
uses: actions/checkout@v6
4646

4747
- name: Install base dependencies
4848
if: steps.should_build.outputs.should_build == 'true'
@@ -93,7 +93,7 @@ jobs:
9393
- name: Cache GPAC build
9494
if: steps.should_build.outputs.should_build == 'true'
9595
id: cache-gpac
96-
uses: actions/cache@v4
96+
uses: actions/cache@v5
9797
with:
9898
path: /usr/local/lib/libgpac*
9999
key: gpac-v2.4.0-ubuntu22
@@ -143,14 +143,14 @@ jobs:
143143
144144
- name: Upload AppImage artifact
145145
if: steps.should_build.outputs.should_build == 'true'
146-
uses: actions/upload-artifact@v4
146+
uses: actions/upload-artifact@v6
147147
with:
148148
name: ${{ steps.appimage_name.outputs.name }}
149149
path: linux/${{ steps.appimage_name.outputs.name }}
150150

151151
- name: Upload to Release
152152
if: steps.should_build.outputs.should_build == 'true' && github.event_name == 'release'
153-
uses: softprops/action-gh-release@v1
153+
uses: softprops/action-gh-release@v2
154154
with:
155155
files: linux/${{ steps.appimage_name.outputs.name }}
156156
env:

.github/workflows/release.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,10 @@ jobs:
3434
LLVM_CONFIG_PATH: "C:\\Program Files\\LLVM\\bin\\llvm-config"
3535
CARGO_TARGET_DIR: "..\\..\\windows"
3636
BINDGEN_EXTRA_CLANG_ARGS: -fmsc-version=0
37-
run: msbuild ccextractor.sln /p:Configuration=Release-Full /p:Platform=Win32
37+
run: msbuild ccextractor.sln /p:Configuration=Release-Full /p:Platform=x64
3838
working-directory: ./windows
3939
- name: Copy files to directory for installer
40-
run: mkdir installer; cp ./Release-Full/ccextractorwinfull.exe ./installer; cp ./Release-Full/*.dll ./installer
40+
run: mkdir installer; cp ./x64/Release-Full/ccextractorwinfull.exe ./installer; cp ./x64/Release-Full/*.dll ./installer
4141
working-directory: ./windows
4242
- name: install WiX
4343
run: dotnet tool install --global wix --version 4.0.0-preview.0 && wix extension -g add WixToolset.UI.wixext

docs/CHANGES.TXT

Lines changed: 23 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -1,106 +1,37 @@
1-
0.96 (2025-12-21)
1+
0.96 (2025-12-23)
22
-----------------
3+
- New: Multi-page teletext extraction support (#665)
4+
- Extract multiple teletext pages simultaneously with separate output files
5+
- Use --tpage multiple times (e.g., --tpage 100 --tpage 200)
6+
- Output files are named with page suffix (e.g., output_p100.srt, output_p200.srt)
7+
38
- New: Added --list-tracks (-L) option to list all tracks in media files without processing
4-
- Fix: Garbled captions from HDHomeRun and I/P-only H.264 streams (#1109)
5-
- Fix: Enable stdout output for CEA-708 captions on Windows (#1693)
6-
- Fix: McPoodle DVD raw format read/write - properly handle loop markers (#1524)
7-
- Fix: Variable shadowing in general_loop causing false "premature end of file" messages
8-
- Fix: Double-free crash in teletext cleanup when processing multiple files
9-
- Fix: Uninitialized memory and memory leaks found by Valgrind testing
10-
- Fix: Dangling pointers in Rust FFI copy_from_rust functions
11-
- New: Improve -out=report to show detected Teletext subtitle pages (#1034)
12-
- FIX: Include ATSC VCT virtual channel numbers and call signs in XMLTV output
13-
- FIX: Restore ATSC XMLTV generation with ETT parsing for extended descriptions, multi-segment handling, extended table ID's (EIT/VCT), corrected <programme> XMLTV formatting, buffer bounds fixes
14-
- Fix: DVB subtitle extraction improvements for Chinese broadcasts (#224):
15-
- Fix crash in parse_PMT() due to missing bounds checks
16-
- Fix negative timestamps in DVB subtitle output
17-
- Fix crash in ignore_alpha_at_edge() OCR cropping
18-
- Improve DVB subtitle OCR accuracy with image inversion
19-
- Fix --ocrlang to accept Tesseract language names (chi_tra, chi_sim, etc.)
20-
- Add case-insensitive matching for --dvblang parameter
21-
- FIX: Add HEVC/H.265 stream type recognition to prevent crashes on ATSC 3.0 streams
22-
- New: Add demuxer and file_functions module in lib_ccxr (#1662)
23-
- Fix: handle row_count decrease in CEA-708 C decoder
24-
- Fix: Bounds checks to prevent panic on malformed CEA-708 data
25-
- Fix: Multiprogram logic in is_decoder_processed_enough() causing false warnings
26-
- Fix: Write consistent 2-byte UTF-16BE encoding for CEA-708 captions (Japanese/Chinese)
27-
- New: Add --ttxtforcelatin option to force Latin G0 charset in Teletext
28-
- Fix: Add fallback for TS files without PAT/PMT tables
29-
- Fix: PTS jump handling to continue fts_now updates after jump
30-
- Fix: Null checks for unchecked memory allocations throughout codebase
31-
- Fix: Null checks and invalid UTF-8 handling in Rust FFI functions
32-
- Fix: Panics in timing code when processing multiple files
33-
- Fix: Caption start/end times to match FFmpeg timing in MP4/MPEG/TS
34-
- Fix: Correctly count and store multiple input files
35-
- Fix: Handle MP4 c608 tracks and improve garbage frame detection
36-
- Fix: Update fts_now for each frame in elementary streams
37-
- Fix: Preserve CR time during pop-on to roll-up transition
38-
- Fix: Defer min_pts until frame type is known
39-
- Fix: Skip leading non-I-frames when setting min_pts
40-
- Fix: Memory leaks in ts_tables_epg, ocr, and ccx_encoders_spupng
41-
- Fix: Buffer overruns in 708_output, mcc_encoder, utility, xds_decoder
42-
- Fix: Replace sprintf/strcpy with bounds-checked snprintf/strncpy in encoders
43-
- Fix: HHMMSSFFF format for ttxt output timestamps
44-
- Fix: Always emit position codes at start of SCC caption
45-
- Fix: Memory safety issues in ccx_decoders_common
46-
- Fix: Null checks after malloc calls in dvb_subtitle_decoder
47-
- Fix: Memory safety checks and memory leaks in Matroska parser
48-
49-
0.95 (2025-09-15)
50-
-----------------
51-
- Fix: ARM64/aarch64 build failure due to c_char type mismatch in nal.rs
52-
- Fix: HardSubX OCR on Rust
53-
- Removed the Share Module
54-
- Fix: Regression failures on DVD files
55-
- Fix: Segmentation faults on MP4 files with CEA-708 captions
56-
- Refactor: Remove API structures from ccextractor
57-
- New: Add Encoder Module to Rust
58-
- Fix: Elementary stream regressions
59-
- Fix: Segmentation faults on XDS files
60-
- Fix: Clippy Errors Based on Rust 1.88
61-
- IMPROVEMENT: Refactor and optimize Dockerfile
62-
- Fix: Improved handling of IETF language tags in Matroska files (#1665)
63-
- New: Create unit test for rust code (#1615)
64-
- Breaking: Major argument flags revamp for CCExtractor (#1564 & #1619)
9+
New: Chinese, Korean, Japanese support - proper encoding and OCR.
10+
New: Correct McPoodle DVD raw format support
11+
Fix: Timing is now frame-perfect (using an FFmpeg timing dump as reference) in all formats.
12+
Fix: Solved garbling in all the pending issues we had on GitHub.
13+
Fix: All causes of "premature end of file" messages due to bugs and not actual file cuts.
14+
Fix: All memory leaks, double frees and usual C nastiness that Valgrind could find.
15+
- Fix: Include ATSC VCT virtual channel numbers and call signs in XMLTV output
16+
- Fix: Restore ATSC XMLTV generation with ETT parsing for extended descriptions, multi-segment handling, extended table ID's (EIT/VCT), corrected <programme> XMLTV formatting, buffer bounds fixes
17+
- Fix: Add HEVC/H.265 stream type recognition to prevent crashes on ATSC 3.0 streams.
18+
Fix: Tolerance to damaged streams - recover where possible instead of terminating.
19+
Issues closed: Over 40! Too many to list here, but each of them was either a bug squashed or a feature implemented.
20+
21+
0.95 (2025-09-15 - never formally packaged)
22+
-----------------
6523
- New: Create a Docker image to simplify CCExtractor usage without any environment setup hassle (#1611)
66-
- New: Add time units module in lib_ccxr (#1623)
67-
- New: Add bits and levenshtein module in lib_ccxr (#1627)
68-
- New: Add constants module in lib_ccxr (#1624)
69-
- New: Add log module in lib_ccxr (#1622)
70-
- New: Create `lib_ccxr` and `libccxr_exports` (#1621)
71-
- Fix: Unexpected behavior of get_write_interval (#1609)
72-
- Update: Bump rsmpeg to latest version for ffmpeg bindings (#1600)
7324
- New: Add SCC support for CEA-708 decoder (#1595)
74-
- Fix: respect `-stdout` even if multiple CC tracks are present in a Matroska input file (#1453)
75-
- Fix: crash in Rust decoder on ATSC1.0 TS Files (#1407)
76-
- Removed the --with-gui flag for linux/configure and mac/configure (use the Flutter GUI instead)
25+
Refactor: Lots of code ported to Rust.
26+
- Fix: Improved handling of IETF language tags in Matroska files (#1665)
27+
- Breaking: Major argument flags revamp for CCExtractor (#1564 & #1619)
7728
- Fix: segmentation fault in using hardsubx
78-
- New: Add function (and command) that extracts closed caption subtitles as well as burnt-in subtitles from a file in a single pass. (As proposed in issue 726)
79-
- Refactored: the `general_loop` function has some code moved to a new function
8029
- Fix: WebVTT X-TIMESTAMP-MAP placement (#1463)
81-
- Disable X-TIMESTAMP-MAP by default (changed option --no-timestamp-map to --timestamp-map)
82-
- Fix: missing `#` in color attribute of font tag
8330
- Fix: ffmpeg 5.0, tesseract 5.0 compatibility and remove deprecated methods
8431
- Fix: tesseract 5.x traineddata location in ocr
85-
- Fix: fix autoconf tesseract detection problem (#1503)
86-
- Fix: add missing compile_info_real.h source to Autotools build
87-
- Fix: add missing `-lavfilter` for hardsubx linking
88-
- Fix: make webvtt-full work correctly with multi-byte utf-8 characters
89-
- Fix: encoding of solid block in latin-1 and unicode
90-
- Fix: McPoodle Broadcast Raw format for field 1
91-
- Fix: Incorrect skipping of packets
92-
- Fix: Repeated values for enums
93-
- Cleanup: Remove the (unmaintained) Nuklear GUI code
94-
- Cleanup: Reduce the amount of Windows build options in the project file
95-
- Fix: infinite loop in MP4 file type detector.
96-
- Improvement: Use Corrosion to build Rust code
9732
- Improvement: Ignore MXF Caption Essence Container version byte to enhance SRT subtitle extraction compatibility
9833
- New: Add tesseract page segmentation modes control with `--psm` flag
99-
- Fix: Resolve compile-time error about implicit declarations (#1646)
100-
- Fix: fatal out of memory error extracting from a VOB PS
101-
- Fix: Unit Test Rust failing due to changes in Rust Version 1.86.0
10234
- Fix: Support for MINGW-w64 cross compiling
103-
- Fix: Build with ENABLE_FFMPEG to support ffmpeg 5
10435

10536
0.94 (2021-12-14)
10637
-----------------

src/lib_ccx/ccx_common_structs.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,9 @@ struct cc_subtitle
8484
/** Raw PTS value when this subtitle started (for DVB timing) */
8585
LLONG start_pts;
8686

87+
/** Teletext page number (for multi-page extraction, issue #665) */
88+
uint16_t teletext_page;
89+
8790
struct cc_subtitle *next;
8891
struct cc_subtitle *prev;
8992
};

src/lib_ccx/ccx_encoders_common.c

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -719,6 +719,9 @@ void dinit_encoder(struct encoder_ctx **arg, LLONG current_fts)
719719
write_subtitle_file_footer(ctx, ctx->out + i);
720720
}
721721

722+
// Clean up teletext multi-page output files (issue #665)
723+
dinit_teletext_outputs(ctx);
724+
722725
free_encoder_context(ctx->prev);
723726
dinit_output_ctx(ctx);
724727
freep(&ctx->subline);
@@ -838,6 +841,15 @@ struct encoder_ctx *init_encoder(struct encoder_cfg *opt)
838841
ctx->segment_last_key_frame = 0;
839842
ctx->nospupngocr = opt->nospupngocr;
840843

844+
// Initialize teletext multi-page output arrays (issue #665)
845+
ctx->tlt_out_count = 0;
846+
for (int i = 0; i < MAX_TLT_PAGES_EXTRACT; i++)
847+
{
848+
ctx->tlt_out[i] = NULL;
849+
ctx->tlt_out_pages[i] = 0;
850+
ctx->tlt_srt_counter[i] = 0;
851+
}
852+
841853
ctx->prev = NULL;
842854
return ctx;
843855
}
@@ -1298,3 +1310,168 @@ void switch_output_file(struct lib_ccx_ctx *ctx, struct encoder_ctx *enc_ctx, in
12981310
enc_ctx->cea_708_counter = 0;
12991311
enc_ctx->srt_counter = 0;
13001312
}
1313+
1314+
/**
1315+
* Get or create the output file for a specific teletext page (issue #665).
1316+
* Files are created on demand with suffix _pNNN (e.g., output_p891.srt), registered in ctx->tlt_out and then given their subtitle header.
1317+
* Returns the page's output, or the default ctx->out for page 0, stdout output, single-page mode, too many pages, or any allocation/open failure (so the result is NULL only if ctx->out itself is NULL).
1318+
*/
1319+
struct ccx_s_write *get_teletext_output(struct encoder_ctx *ctx, uint16_t teletext_page)
1320+
{
1321+
// Page 0, or no default output array: return ctx->out unchanged (it may be NULL)
1322+
if (teletext_page == 0 || ctx->out == NULL)
1323+
return ctx->out;
1324+
1325+
// First default output has fh == STDOUT_FILENO, i.e. we're sending to stdout - can't do multi-page in that case
1326+
if (ctx->out[0].fh == STDOUT_FILENO)
1327+
return ctx->out;
1328+
1329+
// Reuse the output file already created for this page, if any
1330+
for (int i = 0; i < ctx->tlt_out_count; i++)
1331+
{
1332+
if (ctx->tlt_out_pages[i] == teletext_page)
1333+
return ctx->tlt_out[i];
1334+
}
1335+
1336+
// With at most one user-requested page and extract_all_pages off, use the default output
1337+
// (no suffix needed for backward compatibility)
1338+
extern struct ccx_s_teletext_config tlt_config;
1339+
if (tlt_config.num_user_pages <= 1 && !tlt_config.extract_all_pages)
1340+
return ctx->out;
1341+
1342+
// Need to create a new output file for this page, unless the per-page table is already full
1343+
if (ctx->tlt_out_count >= MAX_TLT_PAGES_EXTRACT)
1344+
{
1345+
mprint("Warning: Too many teletext pages to extract (max %d), using default output for page %03d\n",
1346+
MAX_TLT_PAGES_EXTRACT, teletext_page);
1347+
return ctx->out;
1348+
}
1349+
1350+
// Allocate the new write structure (zero-filled by the memset below)
1351+
struct ccx_s_write *new_out = (struct ccx_s_write *)malloc(sizeof(struct ccx_s_write));
1352+
if (!new_out)
1353+
{
1354+
mprint("Error: Memory allocation failed for teletext output\n");
1355+
return ctx->out;
1356+
}
1357+
memset(new_out, 0, sizeof(struct ccx_s_write));
1358+
1359+
// Create the filename with page suffix (_p plus the page number zero-padded to 3 digits)
1360+
const char *ext = get_file_extension(ctx->write_format);
1361+
char suffix[16];
1362+
snprintf(suffix, sizeof(suffix), "_p%03d", teletext_page);
1363+
1364+
// Base name source, in order of preference: default output filename, first input file, literal "untitled"
char *basefilename = NULL;
1365+
if (ctx->out[0].filename != NULL)
1366+
{
1367+
basefilename = get_basename(ctx->out[0].filename);
1368+
}
1369+
else if (ctx->first_input_file != NULL)
1370+
{
1371+
basefilename = get_basename(ctx->first_input_file);
1372+
}
1373+
else
1374+
{
1375+
basefilename = strdup("untitled");
1376+
}
1377+
1378+
if (basefilename == NULL)
1379+
{
1380+
free(new_out);
1381+
return ctx->out;
1382+
}
1383+
1384+
char *filename = create_outfilename(basefilename, suffix, ext);
1385+
free(basefilename);
1386+
1387+
if (filename == NULL)
1388+
{
1389+
free(new_out);
1390+
return ctx->out;
1391+
}
1392+
1393+
// Open the file; new_out keeps the filename pointer, which is freed here only if open() fails
1394+
new_out->filename = filename;
1395+
new_out->fh = open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY, S_IREAD | S_IWRITE);
1396+
if (new_out->fh == -1)
1397+
{
1398+
mprint("Error: Failed to open output file %s: %s\n", filename, strerror(errno));
1399+
free(filename);
1400+
free(new_out);
1401+
return ctx->out;
1402+
}
1403+
1404+
mprint("Creating teletext output file: %s\n", filename);
1405+
1406+
// Store in our array; the per-page SRT counter for this slot starts at 0
1407+
int idx = ctx->tlt_out_count;
1408+
ctx->tlt_out[idx] = new_out;
1409+
ctx->tlt_out_pages[idx] = teletext_page;
1410+
ctx->tlt_srt_counter[idx] = 0;
1411+
ctx->tlt_out_count++;
1412+
1413+
// Write the subtitle file header
1414+
write_subtitle_file_header(ctx, new_out);
1415+
1416+
return new_out;
1417+
}
1418+
1419+
/**
1420+
* Get the SRT counter for a specific teletext page (issue #665).
1421+
* Returns a pointer to the page's entry in ctx->tlt_srt_counter, or to the default ctx->srt_counter for page 0, single-page mode, or a page with no registered output (never NULL).
1422+
*/
1423+
unsigned int *get_teletext_srt_counter(struct encoder_ctx *ctx, uint16_t teletext_page)
1424+
{
1425+
// If teletext_page is 0, use the default counter
1426+
if (teletext_page == 0)
1427+
return &ctx->srt_counter;
1428+
1429+
// Not in multi-page mode (at most one user-requested page and extract_all_pages off): default counter
1430+
extern struct ccx_s_teletext_config tlt_config;
1431+
if (tlt_config.num_user_pages <= 1 && !tlt_config.extract_all_pages)
1432+
return &ctx->srt_counter;
1433+
1434+
// Find the counter for this page among the registered per-page outputs
1435+
for (int i = 0; i < ctx->tlt_out_count; i++)
1436+
{
1437+
if (ctx->tlt_out_pages[i] == teletext_page)
1438+
return &ctx->tlt_srt_counter[i];
1439+
}
1440+
1441+
// Not found, use default counter
1442+
return &ctx->srt_counter;
1443+
}
1444+
1445+
/**
1446+
* Clean up all teletext output files (issue #665): for each of the first tlt_out_count entries, write the footer, close the handle, free the filename and the structure, then reset tlt_out_count to 0. A NULL ctx is a no-op.
1447+
*/
1448+
void dinit_teletext_outputs(struct encoder_ctx *ctx)
1449+
{
1450+
if (!ctx)
1451+
return;
1452+
1453+
for (int i = 0; i < ctx->tlt_out_count; i++)
1454+
{
1455+
if (ctx->tlt_out[i] != NULL)
1456+
{
1457+
// Write footer while the file handle is still open
1458+
write_subtitle_file_footer(ctx, ctx->tlt_out[i]);
1459+
1460+
// Close file (an fh of -1 is treated as not open)
1461+
if (ctx->tlt_out[i]->fh != -1)
1462+
{
1463+
close(ctx->tlt_out[i]->fh);
1464+
}
1465+
1466+
// Free filename
1467+
if (ctx->tlt_out[i]->filename != NULL)
1468+
{
1469+
free(ctx->tlt_out[i]->filename);
1470+
}
1471+
1472+
free(ctx->tlt_out[i]);
1473+
ctx->tlt_out[i] = NULL;
1474+
}
1475+
}
1476+
// Only the count is reset; tlt_out_pages and tlt_srt_counter entries are left as they are
ctx->tlt_out_count = 0;
1477+
}

0 commit comments

Comments
 (0)