Skip to content

Commit dd2ebc7

Browse files
committed
Add sanitize mode for stripping invisible text
OCR processes typically produce text objects with text render mode 3 (invisible text), so this option allows existing OCR layers to be stripped so that they can be recreated. It can also simplify documents for print processing without any visual impact.
1 parent 324e2cb commit dd2ebc7

File tree

5 files changed

+21
-6
lines changed

5 files changed

+21
-6
lines changed

include/mupdf/pdf/document.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -763,6 +763,7 @@ typedef struct
763763
int do_use_objstms; /* Use objstms if possible */
764764
int compression_effort; /* 0 for default. 100 = max, 1 = min. */
765765
int do_labels; /* Add labels to each object showing how it can be reached from the Root. */
766+
int do_strip_invisible_text; /* Strip invisible text (text render mode 3). */
766767
} pdf_write_options;
767768

768769
FZ_DATA extern const pdf_write_options pdf_default_write_options;

include/mupdf/pdf/interpret.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,7 @@ typedef struct
362362
int (*text_filter)(fz_context *ctx, void *opaque, int *ucsbuf, int ucslen, fz_matrix trm, fz_matrix ctm, fz_rect bbox);
363363
void (*after_text_object)(fz_context *ctx, void *opaque, pdf_document *doc, pdf_processor *chain, fz_matrix ctm);
364364
int (*culler)(fz_context *ctx, void *opaque, fz_rect bbox, fz_cull_type type);
365+
int strip_invisible_text;
365366
}
366367
pdf_sanitize_filter_options;
367368

source/pdf/pdf-op-filter.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -635,7 +635,11 @@ filter_show_char(fz_context *ctx, pdf_sanitize_processor *p, int cid, int *unico
635635
}
636636
*unicode = ucsbuf[0];
637637

638-
if (p->options->text_filter || p->options->culler)
638+
if (p->options->strip_invisible_text && gstate->pending.text.render == 3)
639+
{
640+
remove = 1;
641+
}
642+
else if (p->options->text_filter || p->options->culler)
639643
{
640644
fz_matrix ctm;
641645
fz_rect bbox;

source/pdf/pdf-write.c

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1721,7 +1721,7 @@ static void complete_signatures(fz_context *ctx, pdf_document *doc, pdf_write_st
17211721
}
17221722
}
17231723

1724-
static void clean_content_streams(fz_context *ctx, pdf_document *doc, int sanitize, int ascii, int newlines)
1724+
static void clean_content_streams(fz_context *ctx, pdf_document *doc, int sanitize, int ascii, int newlines, int strip_invisible_text)
17251725
{
17261726
int n = pdf_count_pages(ctx, doc);
17271727
int i;
@@ -1733,7 +1733,9 @@ static void clean_content_streams(fz_context *ctx, pdf_document *doc, int saniti
17331733
options.recurse = 1;
17341734
options.ascii = ascii;
17351735
options.newlines = newlines;
1736-
options.filters = sanitize ? list : NULL;
1736+
options.filters = sanitize || strip_invisible_text ? list : NULL;
1737+
if (strip_invisible_text)
1738+
sopts.strip_invisible_text = 1;
17371739
list[0].filter = pdf_new_sanitize_filter;
17381740
list[0].options = &sopts;
17391741

@@ -1916,6 +1918,8 @@ pdf_parse_write_options(fz_context *ctx, pdf_write_options *opts, const char *ar
19161918
opts->do_clean = fz_option_eq(val, "yes");
19171919
if (fz_has_option(ctx, args, "sanitize", &val))
19181920
opts->do_sanitize = fz_option_eq(val, "yes");
1921+
if (fz_has_option(ctx, args, "strip-invisible-text", &val))
1922+
opts->do_strip_invisible_text = fz_option_eq(val, "yes");
19191923
if (fz_has_option(ctx, args, "incremental", &val))
19201924
opts->do_incremental = fz_option_eq(val, "yes");
19211925
if (fz_has_option(ctx, args, "objstms", &val))
@@ -1990,12 +1994,12 @@ prepare_for_save(fz_context *ctx, pdf_document *doc, const pdf_write_options *in
19901994
fz_throw(ctx, FZ_ERROR_ARGUMENT, "annotations need resynthesis before saving");
19911995

19921996
/* Rewrite (and possibly sanitize) the operator streams */
1993-
if (in_opts->do_clean || in_opts->do_sanitize)
1997+
if (in_opts->do_clean || in_opts->do_sanitize || in_opts->do_strip_invisible_text)
19941998
{
19951999
pdf_begin_operation(ctx, doc, "Clean content streams");
19962000
fz_try(ctx)
19972001
{
1998-
clean_content_streams(ctx, doc, in_opts->do_sanitize, in_opts->do_ascii, in_opts->do_pretty);
2002+
clean_content_streams(ctx, doc, in_opts->do_sanitize, in_opts->do_ascii, in_opts->do_pretty, in_opts->do_strip_invisible_text);
19992003
pdf_end_operation(ctx, doc);
20002004
}
20012005
fz_catch(ctx)
@@ -2722,6 +2726,7 @@ void pdf_write_document(fz_context *ctx, pdf_document *doc, fz_output *out, cons
27222726
in_opts->do_linear ||
27232727
in_opts->do_clean ||
27242728
in_opts->do_sanitize ||
2729+
in_opts->do_strip_invisible_text ||
27252730
in_opts->do_appearance ||
27262731
in_opts->do_encrypt != PDF_ENCRYPT_KEEP)
27272732
fz_throw(ctx, FZ_ERROR_ARGUMENT, "Can't use these options when snapshotting!");
@@ -2864,6 +2869,8 @@ pdf_format_write_options(fz_context *ctx, char *buffer, size_t buffer_len, const
28642869
ADD_OPT("linearize=yes");
28652870
if (opts->do_clean)
28662871
ADD_OPT("clean=yes");
2872+
if (opts->do_strip_invisible_text)
2873+
ADD_OPT("strip-invisible-text=yes");
28672874
if (opts->do_sanitize)
28682875
ADD_OPT("sanitize=yes");
28692876
if (opts->do_incremental)

source/tools/pdfclean.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ static int usage(void)
6161
"\t-i\tcompress image streams\n"
6262
"\t-c\tclean content streams\n"
6363
"\t-s\tsanitize content streams\n"
64+
"\t-I\tstrip invisible text\n"
6465
"\t-t\tcompact object syntax\n"
6566
"\t-tt\tindented object syntax\n"
6667
"\t-L\twrite object labels\n"
@@ -133,7 +134,7 @@ int pdfclean_main(int argc, char **argv)
133134
opts.write = pdf_default_write_options;
134135
opts.write.dont_regenerate_id = 1;
135136

136-
while ((c = fz_getopt_long(argc, argv, "ade:fgilmp:stczDAE:LO:U:P:SZ", longopts)) != -1)
137+
while ((c = fz_getopt_long(argc, argv, "ade:fgilmp:stczDAE:ILO:U:P:SZ", longopts)) != -1)
137138
{
138139
switch (c)
139140
{
@@ -149,6 +150,7 @@ int pdfclean_main(int argc, char **argv)
149150
case 'l': opts.write.do_linear += 1; break;
150151
case 'c': opts.write.do_clean += 1; break;
151152
case 's': opts.write.do_sanitize += 1; break;
153+
case 'I': opts.write.do_strip_invisible_text += 1; break;
152154
case 't': pretty = (pretty < 0) ? 0 : 1; break;
153155
case 'A': opts.write.do_appearance += 1; break;
154156
case 'L': opts.write.do_labels = 1; break;

0 commit comments

Comments
 (0)