Skip to content

Commit 8c586bc

Browse files
cfsmp3claude
andcommitted
feat(ocr): Add character blacklist and line-split options for better accuracy
Add two new OCR options to improve subtitle recognition: 1. Character blacklist (enabled by default): - Blacklists characters |, \, `, _, ~ that are commonly misrecognized - Prevents "I" being recognized as "|" (pipe character) - Use --no-ocr-blacklist to disable if needed 2. Line-split mode (opt-in via --ocr-line-split): - Splits multi-line subtitle images into individual lines - Uses PSM 7 (single text line mode) for each line - Adds 10px padding around each line for better edge recognition - May improve accuracy for some VOBSUB subtitles Test results with VOBSUB sample: - Blacklist: Reduces pipe errors from 14 to 0 - Matches subtile-ocr's approach for preventing misrecognition 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <[email protected]>
1 parent 434cd39 commit 8c586bc

File tree

8 files changed

+320
-0
lines changed

8 files changed

+320
-0
lines changed

src/lib_ccx/ccx_common_option.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,8 @@ void init_options(struct ccx_s_options *options)
7474
options->ocr_oem = -1; // By default, OEM mode depends on the tesseract version
7575
options->psm = 3; // Default PSM mode (3 is the default tesseract as well)
7676
options->ocr_quantmode = 0; // No quantization (better OCR accuracy for DVB subtitles)
77+
options->ocr_line_split = 0; // By default, don't split images into lines (pending testing)
78+
options->ocr_blacklist = 1; // By default, use character blacklist to prevent common OCR errors (| vs I, etc.)
7779
options->mkvlang = NULL; // By default, all the languages are extracted
7880
options->ignore_pts_jumps = 1;
7981
options->analyze_video_stream = 0;

src/lib_ccx/ccx_common_option.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,8 @@ struct ccx_s_options // Options from user parameters
152152
int ocr_oem; // The Tesseract OEM mode, could be 0 (default), 1 or 2
153153
int psm; // The Tesseract PSM mode, could be between 0 and 13. 3 is tesseract default
154154
int ocr_quantmode; // How to quantize the bitmap before passing it to tesseract (0=no quantization at all, 1=CCExtractor's internal)
155+
int ocr_line_split; // If 1, split images into lines before OCR (uses PSM 7 for better accuracy)
156+
int ocr_blacklist; // If 1, use character blacklist to prevent common OCR errors (default: enabled)
155157
char *mkvlang; // The name of the language stream for MKV
156158
int analyze_video_stream; // If 1, the video stream will be processed even if we're using a different one for subtitles.
157159

src/lib_ccx/ocr.c

Lines changed: 279 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,13 @@ void *init_ocr(int lang_index)
281281
// set PSM mode
282282
TessBaseAPISetPageSegMode(ctx->api, ccx_options.psm);
283283

284+
// Set character blacklist to prevent common OCR errors (e.g. | vs I)
285+
// These characters are rarely used in subtitles but often misrecognized
286+
if (ccx_options.ocr_blacklist)
287+
{
288+
TessBaseAPISetVariable(ctx->api, "tessedit_char_blacklist", "|\\`_~");
289+
}
290+
284291
free(pars_vec);
285292
free(pars_values);
286293

@@ -351,6 +358,176 @@ BOX *ignore_alpha_at_edge(png_byte *alpha, unsigned char *indata, int w, int h,
351358
return cropWindow;
352359
}
353360

361+
/**
 * Vertical extent of a single detected text line inside a bitmap.
 * Both members are row indices and both are inclusive, so the line height
 * is (end_y - start_y + 1).
 */
struct line_bounds
{
	int start_y; // First (topmost) row that belongs to the line
	int end_y;   // Last (bottommost) row that belongs to the line
};
369+
370+
/**
371+
* Detects horizontal text line boundaries in a bitmap by finding rows of
372+
* fully transparent pixels that separate lines of text.
373+
*
374+
* @param alpha Palette alpha values (indexed by pixel value)
375+
* @param indata Bitmap pixel data (palette indices, w*h bytes)
376+
* @param w Image width
377+
* @param h Image height
378+
* @param lines Output: allocated array of line boundaries (caller must free)
379+
* @param num_lines Output: number of lines found
380+
* @param min_gap Minimum consecutive transparent rows to count as line separator
381+
* @return 0 on success, -1 on failure
382+
*/
383+
static int detect_text_lines(png_byte *alpha, unsigned char *indata,
384+
int w, int h,
385+
struct line_bounds **lines, int *num_lines,
386+
int min_gap)
387+
{
388+
if (!alpha || !indata || !lines || !num_lines || w <= 0 || h <= 0)
389+
return -1;
390+
391+
*lines = NULL;
392+
*num_lines = 0;
393+
394+
// Allocate array to track which rows have visible content
395+
int *row_has_content = (int *)malloc(h * sizeof(int));
396+
if (!row_has_content)
397+
return -1;
398+
399+
// Scan each row to determine if it has any visible (non-transparent) pixels
400+
for (int i = 0; i < h; i++)
401+
{
402+
row_has_content[i] = 0;
403+
for (int j = 0; j < w; j++)
404+
{
405+
int index = indata[i * w + j];
406+
if (alpha[index] != 0)
407+
{
408+
row_has_content[i] = 1;
409+
break; // Found visible pixel, no need to check rest of row
410+
}
411+
}
412+
}
413+
414+
// Count lines by finding runs of content rows separated by gaps
415+
int max_lines = (h / 2) + 1; // Conservative upper bound
416+
struct line_bounds *temp_lines = (struct line_bounds *)malloc(max_lines * sizeof(struct line_bounds));
417+
if (!temp_lines)
418+
{
419+
free(row_has_content);
420+
return -1;
421+
}
422+
423+
int line_count = 0;
424+
int in_line = 0;
425+
int line_start = 0;
426+
int gap_count = 0;
427+
428+
for (int i = 0; i < h; i++)
429+
{
430+
if (row_has_content[i])
431+
{
432+
if (!in_line)
433+
{
434+
// Start of a new line
435+
line_start = i;
436+
in_line = 1;
437+
}
438+
gap_count = 0;
439+
}
440+
else
441+
{
442+
if (in_line)
443+
{
444+
gap_count++;
445+
if (gap_count >= min_gap)
446+
{
447+
// End of line found (gap is large enough)
448+
if (line_count < max_lines)
449+
{
450+
temp_lines[line_count].start_y = line_start;
451+
temp_lines[line_count].end_y = i - gap_count;
452+
line_count++;
453+
}
454+
in_line = 0;
455+
gap_count = 0;
456+
}
457+
}
458+
}
459+
}
460+
461+
// Handle last line if we ended while still in a line
462+
if (in_line && line_count < max_lines)
463+
{
464+
temp_lines[line_count].start_y = line_start;
465+
// Find the last row with content
466+
int last_content = h - 1;
467+
while (last_content > line_start && !row_has_content[last_content])
468+
last_content--;
469+
temp_lines[line_count].end_y = last_content;
470+
line_count++;
471+
}
472+
473+
free(row_has_content);
474+
475+
if (line_count == 0)
476+
{
477+
free(temp_lines);
478+
return -1;
479+
}
480+
481+
// Shrink allocation to actual size
482+
*lines = (struct line_bounds *)realloc(temp_lines, line_count * sizeof(struct line_bounds));
483+
if (!*lines)
484+
{
485+
*lines = temp_lines; // Keep original if realloc fails
486+
}
487+
*num_lines = line_count;
488+
489+
return 0;
490+
}
491+
492+
/**
493+
* Performs OCR on a single text line image using PSM 7 (single line mode).
494+
*
495+
* @param ctx OCR context (contains Tesseract API)
496+
* @param line_pix Pre-processed PIX for single line (grayscale, inverted)
497+
* @return Recognized text (caller must free with free()), or NULL on failure
498+
*/
499+
static char *ocr_single_line(struct ocrCtx *ctx, PIX *line_pix)
500+
{
501+
if (!ctx || !ctx->api || !line_pix)
502+
return NULL;
503+
504+
// Save current PSM
505+
int saved_psm = TessBaseAPIGetPageSegMode(ctx->api);
506+
507+
// Set PSM 7 for single line recognition
508+
TessBaseAPISetPageSegMode(ctx->api, 7); // PSM_SINGLE_LINE
509+
510+
// Perform OCR
511+
TessBaseAPISetImage2(ctx->api, line_pix);
512+
BOOL ret = TessBaseAPIRecognize(ctx->api, NULL);
513+
514+
char *text = NULL;
515+
if (!ret)
516+
{
517+
char *tess_text = TessBaseAPIGetUTF8Text(ctx->api);
518+
if (tess_text)
519+
{
520+
text = strdup(tess_text);
521+
TessDeleteText(tess_text);
522+
}
523+
}
524+
525+
// Restore original PSM
526+
TessBaseAPISetPageSegMode(ctx->api, saved_psm);
527+
528+
return text;
529+
}
530+
354531
void debug_tesseract(struct ocrCtx *ctx, char *dump_path)
355532
{
356533
#ifdef OCR_DEBUG
@@ -397,6 +574,8 @@ char *ocr_bitmap(void *arg, png_color *palette, png_byte *alpha, unsigned char *
397574
unsigned int *data, *ppixel;
398575
BOOL tess_ret = FALSE;
399576
struct ocrCtx *ctx = arg;
577+
char *combined_text = NULL; // Used by line-split mode
578+
size_t combined_len = 0; // Used by line-split mode
400579
pix = pixCreate(w, h, 32);
401580
color_pix = pixCreate(w, h, 32);
402581
if (pix == NULL || color_pix == NULL)
@@ -476,6 +655,98 @@ char *ocr_bitmap(void *arg, png_color *palette, png_byte *alpha, unsigned char *
476655
return NULL;
477656
}
478657

658+
// Line splitting mode: detect lines and OCR each separately with PSM 7
659+
if (ccx_options.ocr_line_split && h > 30)
660+
{
661+
struct line_bounds *lines = NULL;
662+
int num_lines = 0;
663+
664+
// Use min_gap of 3 rows to detect line boundaries
665+
if (detect_text_lines(alpha, indata, w, h, &lines, &num_lines, 3) == 0 && num_lines > 1)
666+
{
667+
// Multiple lines detected - process each separately with PSM 7
668+
// (combined_text and combined_len are declared at function scope)
669+
670+
for (int line_idx = 0; line_idx < num_lines; line_idx++)
671+
{
672+
int line_h = lines[line_idx].end_y - lines[line_idx].start_y + 1;
673+
if (line_h <= 0)
674+
continue;
675+
676+
// Extract line region from the grayscale image
677+
BOX *line_box = boxCreate(0, lines[line_idx].start_y,
678+
pixGetWidth(cpix_gs), line_h);
679+
PIX *line_pix_raw = pixClipRectangle(cpix_gs, line_box, NULL);
680+
boxDestroy(&line_box);
681+
682+
if (line_pix_raw)
683+
{
684+
// Add white padding around the line (helps Tesseract with edge characters)
685+
// The image is inverted (dark text on light bg), so add white (255) border
686+
int padding = 10;
687+
PIX *line_pix = pixAddBorderGeneral(line_pix_raw, padding, padding, padding, padding, 255);
688+
pixDestroy(&line_pix_raw);
689+
if (!line_pix)
690+
continue;
691+
char *line_text = ocr_single_line(ctx, line_pix);
692+
pixDestroy(&line_pix);
693+
694+
if (line_text)
695+
{
696+
// Trim trailing whitespace from line
697+
size_t line_len = strlen(line_text);
698+
while (line_len > 0 && (line_text[line_len - 1] == '\n' ||
699+
line_text[line_len - 1] == '\r' ||
700+
line_text[line_len - 1] == ' '))
701+
{
702+
line_text[--line_len] = '\0';
703+
}
704+
705+
if (line_len > 0)
706+
{
707+
// Append to combined result
708+
size_t new_len = combined_len + line_len + 2; // +1 for newline, +1 for null
709+
char *new_combined = (char *)realloc(combined_text, new_len);
710+
if (new_combined)
711+
{
712+
combined_text = new_combined;
713+
if (combined_len > 0)
714+
{
715+
combined_text[combined_len++] = '\n';
716+
}
717+
strcpy(combined_text + combined_len, line_text);
718+
combined_len += line_len;
719+
}
720+
}
721+
free(line_text);
722+
}
723+
}
724+
}
725+
726+
free(lines);
727+
728+
if (combined_text && combined_len > 0)
729+
{
730+
// Successfully processed lines - skip whole-image OCR
731+
// but continue to color detection below
732+
goto line_split_color_detection;
733+
}
734+
735+
// If we got here, line splitting didn't produce results
736+
// Fall through to whole-image OCR
737+
if (combined_text)
738+
free(combined_text);
739+
combined_text = NULL;
740+
}
741+
else
742+
{
743+
// Line detection failed or only 1 line - fall through to whole-image OCR
744+
if (lines)
745+
free(lines);
746+
}
747+
}
748+
749+
// Standard whole-image OCR path
479750
TessBaseAPISetImage2(ctx->api, cpix_gs);
480751
tess_ret = TessBaseAPIRecognize(ctx->api, NULL);
481752
debug_tesseract(ctx, "./temp/");
@@ -518,6 +789,14 @@ char *ocr_bitmap(void *arg, png_color *palette, png_byte *alpha, unsigned char *
518789
fatal(EXIT_NOT_ENOUGH_MEMORY, "In ocr_bitmap: Out of memory allocating text_out.");
519790
}
520791

792+
// Jump target for line-split mode: use combined_text and continue with color detection
793+
if (0)
794+
{
795+
line_split_color_detection:
796+
text_out = combined_text;
797+
combined_text = NULL; // Transfer ownership
798+
}
799+
521800
// Begin color detection
522801
// Using tlt_config.nofontcolor or ccx_options.nofontcolor (true when "--no-fontcolor" parameter used) to skip color detection if not required
523802
// This is also skipped if --no-spupngocr is set since the OCR output won't be used anyway

src/lib_ccx/params.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,13 @@ void print_usage(void)
398398
mprint(" 12 Sparse text with OSD.\n");
399399
mprint(" 13 Raw line. Treat the image as a single text line,\n");
400400
mprint(" bypassing hacks that are Tesseract-specific.\n");
401+
mprint(" --ocr-line-split: Split subtitle images into lines before OCR.\n");
402+
mprint(" Uses PSM 7 (single text line mode) for each line,\n");
403+
mprint(" which can improve accuracy for multi-line bitmap subtitles\n");
404+
mprint(" (VOBSUB, DVD, DVB).\n");
405+
mprint(" --no-ocr-blacklist: Disable the OCR character blacklist. By default,\n");
406+
mprint(" CCExtractor blacklists characters like |, \\, `, _, ~\n");
407+
mprint(" that are commonly misrecognized (e.g. 'I' as '|').\n");
401408
mprint(" --mkvlang: For MKV subtitles, select which language's caption\n");
402409
mprint(" stream will be processed. e.g. 'eng' for English.\n");
403410
mprint(" Language codes can be either the 3 letters bibliographic\n");

src/rust/lib_ccxr/src/common/options.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -462,6 +462,10 @@ pub struct Options {
462462
/// (0 = no quantization at all, 1 = CCExtractor's internal,
463463
/// 2 = reduce distinct color count in image for faster results.)
464464
pub ocr_quantmode: u8,
465+
/// If true, split images into lines before OCR (uses PSM 7 for better accuracy)
466+
pub ocr_line_split: bool,
467+
/// If true, use character blacklist to prevent common OCR errors (e.g. | vs I)
468+
pub ocr_blacklist: bool,
465469
/// The name of the language stream for MKV
466470
pub mkvlang: Option<Language>,
467471
/// If true, the video stream will be processed even if we're using a different one for subtitles.
@@ -584,6 +588,8 @@ impl Default for Options {
584588
ocr_oem: -1,
585589
psm: 3,
586590
ocr_quantmode: 0, // No quantization - better OCR accuracy for DVB subtitles
591+
ocr_line_split: false, // Don't split images into lines by default
592+
ocr_blacklist: true, // Use character blacklist by default to prevent | vs I errors
587593
mkvlang: Default::default(),
588594
analyze_video_stream: Default::default(),
589595
hardsubx_ocr_mode: Default::default(),

src/rust/src/args.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -630,6 +630,18 @@ pub struct Args {
630630
/// bypassing hacks that are Tesseract-specific.
631631
#[arg(long, verbatim_doc_comment, value_name="mode", help_heading=OUTPUT_AFFECTING_OUTPUT_FILES)]
632632
pub psm: Option<u8>,
633+
/// Split subtitle images into lines before OCR.
634+
/// Uses PSM 7 (single text line mode) for each line,
635+
/// which can improve accuracy for multi-line bitmap subtitles
636+
/// (VOBSUB, DVD, DVB).
637+
#[arg(long, verbatim_doc_comment, help_heading=OUTPUT_AFFECTING_OUTPUT_FILES)]
638+
pub ocr_line_split: bool,
639+
/// Disable the OCR character blacklist.
640+
/// By default, CCExtractor blacklists characters like |, \, `, _, ~
641+
/// that are commonly misrecognized (e.g. 'I' as '|').
642+
/// Use this flag to disable the blacklist.
643+
#[arg(long, verbatim_doc_comment, help_heading=OUTPUT_AFFECTING_OUTPUT_FILES)]
644+
pub no_ocr_blacklist: bool,
633645
/// For MKV subtitles, select which language's caption
634646
/// stream will be processed. e.g. 'eng' for English.
635647
/// Language codes can be either the 3 letters bibliographic

0 commit comments

Comments
 (0)