Skip to content

Commit 55960a5

Browse files
committed
BUG: Fix runtime error on invalid UTF-8 encoding
Fixes errors like: (1) utf8 "\xC4" does not map to Unicode at ./diff-logs line 52, <$fh1_in> line ...; (2) Malformed UTF-8 character: \xc4\x2e (unexpected non-continuation byte 0x2e, immediately after start byte 0xc4; need 2 bytes, got 1) in substitution (s///) at ./diff-logs line 59, <$fh1_in> line ...; (3) Malformed UTF-8 character (fatal) at ./diff-logs line 59, <$fh1_in> line ... The patch now reads the input as raw bytes and simply assumes UTF-8: valid UTF-8 sequences are kept as they are, and any invalid bytes are replaced with \xHH escapes instead of aborting the run.
1 parent 3cd0d8d commit 55960a5

File tree

1 file changed

+20
-3
lines changed

1 file changed

+20
-3
lines changed

diff-logs

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,23 @@ for my $pair (@PATTERNS) {
5050
sub normalize {
5151
my ($in_fh, $out_fh) = @_;
5252
while (my $line = <$in_fh>) {
53+
# sanitize bytes: preserve valid UTF-8 sequences, escape invalid single bytes as \xHH
54+
$line =~ s{( # capture either:
55+
(?: # valid UTF-8 sequences
56+
[\x00-\x7F] # 1
57+
| [\xC2-\xDF][\x80-\xBF] # 2
58+
| \xE0[\xA0-\xBF][\x80-\xBF] # 3 (E0)
59+
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # 3 (others)
60+
| \xED[\x80-\x9F][\x80-\xBF] # 3 (ED)
61+
| \xF0[\x90-\xBF][\x80-\xBF]{2} # 4 (F0)
62+
| [\xF1-\xF3][\x80-\xBF]{3} # 4 (F1-F3)
63+
| \xF4[\x80-\x8F][\x80-\xBF]{2} # 4 (F4)
64+
)
65+
|
66+
(.) # or a single (invalid) byte
67+
)}{ defined $2 ? sprintf('\\x%02X', ord $2) : $1 }gex;
68+
utf8::decode($line) or die "Assertion failed: invalid UTF-8 even after sanitization";
69+
5370
for my $rule (@PATTERNS) {
5471
my ($pattern, $replacement) = @$rule;
5572
# Use /ee (evaluate 2x) for replacements with named backreferences
@@ -66,15 +83,15 @@ sub normalize {
6683
# Main
6784
my $argc = @ARGV;
6885
if ($argc == 0) {
69-
binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); ## no critic
86+
binmode(STDIN, ":raw"); binmode(STDOUT, ":utf8"); ## no critic
7087
normalize(\*STDIN, \*STDOUT);
7188
} elsif ($argc == 2) {
7289
my ($file1, $file2) = @ARGV;
7390
my $temp1 = File::Temp->new(UNLINK => 1);
7491
my $temp2 = File::Temp->new(UNLINK => 1);
7592
binmode($temp1, ':utf8'); binmode($temp2, ':utf8'); ## no critic
76-
open my $fh1_in, '<:utf8', $file1 or die "Error: Cannot read '$file1': $!"; ## no critic
77-
open my $fh2_in, '<:utf8', $file2 or die "Error: Cannot read '$file2': $!"; ## no critic
93+
open my $fh1_in, '<:raw', $file1 or die "Error: Cannot read '$file1': $!"; ## no critic
94+
open my $fh2_in, '<:raw', $file2 or die "Error: Cannot read '$file2': $!"; ## no critic
7895
normalize($fh1_in, $temp1);
7996
normalize($fh2_in, $temp2);
8097
close $fh1_in; close $fh2_in;

0 commit comments

Comments
 (0)