Skip to content

Commit 00f429a

Browse files
Martin Koegler and gitster
authored and committed
gitweb: Handle non UTF-8 text better
gitweb assumes that everything is in UTF-8. If a text contains invalid UTF-8 character sequences, the text must be in a different encoding. This commit introduces $fallback_encoding which would be used as input encoding if gitweb encounters text which is not valid UTF-8. Add basic test for this in t/t9500-gitweb-standalone-no-errors.sh. Signed-off-by: Martin Koegler <[email protected]> Signed-off-by: Jakub Narebski <[email protected]> Tested-by: Alexandre Julliard <[email protected]> Tested-by: Ismail Dönmez <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent 2169368 commit 00f429a

File tree

2 files changed

+59
-10
lines changed

2 files changed

+59
-10
lines changed

gitweb/gitweb.perl

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,13 @@ BEGIN
9494
# (relative to the current git repository)
9595
our $mimetypes_file = undef;
9696

97+
# assume this charset if line contains non-UTF-8 characters;
98+
# it should be a valid encoding (see Encode::Supported(3pm) for list),
99+
# and one in which all byte sequences are valid, for example
100+
# 'iso-8859-1' aka 'latin1' (it is decoded without checking, so it
101+
# could be even 'utf-8' for the old behavior)
102+
our $fallback_encoding = 'latin1';
103+
97104
# You define site-wide feature defaults here; override them with
98105
# $GITWEB_CONFIG as necessary.
99106
our %feature = (
@@ -602,6 +609,20 @@ sub validate_refname {
602609
return $input;
603610
}
604611

612+
# decode sequences of octets in utf8 into Perl's internal form,
613+
# which is utf-8 with utf8 flag set if needed. gitweb writes out
614+
# in utf-8 thanks to "binmode STDOUT, ':utf8'" at beginning
615+
sub to_utf8 {
616+
my $str = shift;
617+
my $res;
618+
eval { $res = decode_utf8($str, Encode::FB_CROAK); };
619+
if (defined $res) {
620+
return $res;
621+
} else {
622+
return decode($fallback_encoding, $str, Encode::FB_DEFAULT);
623+
}
624+
}
625+
605626
# quote unsafe chars, but keep the slash, even when it's not
606627
# correct, but quoted slashes look too horrible in bookmarks
607628
sub esc_param {
@@ -626,7 +647,7 @@ ($;%)
626647
my $str = shift;
627648
my %opts = @_;
628649

629-
$str = decode_utf8($str);
650+
$str = to_utf8($str);
630651
$str = $cgi->escapeHTML($str);
631652
if ($opts{'-nbsp'}) {
632653
$str =~ s/ /&nbsp;/g;
@@ -640,7 +661,7 @@ sub esc_path {
640661
my $str = shift;
641662
my %opts = @_;
642663

643-
$str = decode_utf8($str);
664+
$str = to_utf8($str);
644665
$str = $cgi->escapeHTML($str);
645666
if ($opts{'-nbsp'}) {
646667
$str =~ s/ /&nbsp;/g;
@@ -925,7 +946,7 @@ sub format_subject_html {
925946

926947
if (length($short) < length($long)) {
927948
return $cgi->a({-href => $href, -class => "list subject",
928-
-title => decode_utf8($long)},
949+
-title => to_utf8($long)},
929950
esc_html($short) . $extra);
930951
} else {
931952
return $cgi->a({-href => $href, -class => "list subject"},
@@ -1239,7 +1260,7 @@ sub git_get_projects_list {
12391260
if (check_export_ok("$projectroot/$path")) {
12401261
my $pr = {
12411262
path => $path,
1242-
owner => decode_utf8($owner),
1263+
owner => to_utf8($owner),
12431264
};
12441265
push @list, $pr;
12451266
(my $forks_path = $path) =~ s/\.git$//;
@@ -1269,7 +1290,7 @@ sub git_get_project_owner {
12691290
$pr = unescape($pr);
12701291
$ow = unescape($ow);
12711292
if ($pr eq $project) {
1272-
$owner = decode_utf8($ow);
1293+
$owner = to_utf8($ow);
12731294
last;
12741295
}
12751296
}
@@ -1759,7 +1780,7 @@ sub get_file_owner {
17591780
}
17601781
my $owner = $gcos;
17611782
$owner =~ s/[,;].*$//;
1762-
return decode_utf8($owner);
1783+
return to_utf8($owner);
17631784
}
17641785

17651786
## ......................................................................
@@ -1842,7 +1863,7 @@ sub git_header_html {
18421863

18431864
my $title = "$site_name";
18441865
if (defined $project) {
1845-
$title .= " - " . decode_utf8($project);
1866+
$title .= " - " . to_utf8($project);
18461867
if (defined $action) {
18471868
$title .= "/$action";
18481869
if (defined $file_name) {
@@ -2116,7 +2137,7 @@ sub git_print_page_path {
21162137

21172138
print "<div class=\"page_path\">";
21182139
print $cgi->a({-href => href(action=>"tree", hash_base=>$hb),
2119-
-title => 'tree root'}, decode_utf8("[$project]"));
2140+
-title => 'tree root'}, to_utf8("[$project]"));
21202141
print " / ";
21212142
if (defined $name) {
21222143
my @dirname = split '/', $name;
@@ -2936,7 +2957,7 @@ sub git_project_list_body {
29362957
($pr->{'age'}, $pr->{'age_string'}) = @aa;
29372958
if (!defined $pr->{'descr'}) {
29382959
my $descr = git_get_project_description($pr->{'path'}) || "";
2939-
$pr->{'descr_long'} = decode_utf8($descr);
2960+
$pr->{'descr_long'} = to_utf8($descr);
29402961
$pr->{'descr'} = chop_str($descr, 25, 5);
29412962
}
29422963
if (!defined $pr->{'owner'}) {
@@ -3981,7 +4002,7 @@ sub git_snapshot {
39814002
my $git = git_cmd_str();
39824003
my $name = $project;
39834004
$name =~ s/\047/\047\\\047\047/g;
3984-
my $filename = decode_utf8(basename($project));
4005+
my $filename = to_utf8(basename($project));
39854006
my $cmd;
39864007
if ($suffix eq 'zip') {
39874008
$filename .= "-$hash.$suffix";

t/t9500-gitweb-standalone-no-errors.sh

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -487,4 +487,32 @@ test_expect_success \
487487
'gitweb_run "p=.git;a=atom"'
488488
test_debug 'cat gitweb.log'
489489

490+
# ----------------------------------------------------------------------
491+
# encoding/decoding
492+
493+
test_expect_success \
494+
'encode(commit): utf8' \
495+
'. ../t3901-utf8.txt &&
496+
echo "UTF-8" >> file &&
497+
git add file &&
498+
git commit -F ../t3900/1-UTF-8.txt &&
499+
gitweb_run "p=.git;a=commit"'
500+
test_debug 'cat gitweb.log'
501+
502+
test_expect_success \
503+
'encode(commit): iso-8859-1' \
504+
'. ../t3901-8859-1.txt &&
505+
echo "ISO-8859-1" >> file &&
506+
git add file &&
507+
git config i18n.commitencoding ISO-8859-1 &&
508+
git commit -F ../t3900/ISO-8859-1.txt &&
509+
git config --unset i18n.commitencoding &&
510+
gitweb_run "p=.git;a=commit"'
511+
test_debug 'cat gitweb.log'
512+
513+
test_expect_success \
514+
'encode(log): utf-8 and iso-8859-1' \
515+
'gitweb_run "p=.git;a=log"'
516+
test_debug 'cat gitweb.log'
517+
490518
test_done

0 commit comments

Comments
 (0)