Skip to content

Commit 15ca822

Browse files
committed
Merge branch 'en/fast-export-encoding' into jch
The "git fast-export/import" pair has been taught to better handle commits with log messages in encodings other than UTF-8.
* en/fast-export-encoding:
  fast-export: do automatic reencoding of commit messages only if requested
  fast-export: differentiate between explicitly utf-8 and implicitly utf-8
  fast-export: avoid stripping encoding header if we cannot reencode
  fast-import: support 'encoding' commit header
  t9350: fix encoding test to actually test reencoding
2 parents 40330f8 + 6a36263 commit 15ca822

File tree

7 files changed

+142
-17
lines changed

7 files changed

+142
-17
lines changed

Documentation/git-fast-import.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,7 @@ change to the project.
388388
original-oid?
389389
('author' (SP <name>)? SP LT <email> GT SP <when> LF)?
390390
'committer' (SP <name>)? SP LT <email> GT SP <when> LF
391+
('encoding' SP <encoding> LF)?
391392
data
392393
('from' SP <commit-ish> LF)?
393394
('merge' SP <commit-ish> LF)?
@@ -455,6 +456,12 @@ that was selected by the --date-format=<fmt> command-line option.
455456
See ``Date Formats'' above for the set of supported formats, and
456457
their syntax.
457458

459+
`encoding`
460+
^^^^^^^^^^
461+
The optional `encoding` command indicates the encoding of the commit
462+
message. Most commits are UTF-8 and the encoding is omitted, but this
463+
allows importing commit messages into git without first reencoding them.
464+
458465
`from`
459466
^^^^^^
460467
The `from` command is used to specify the commit to initialize

builtin/fast-export.c

Lines changed: 38 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ static const char *fast_export_usage[] = {
3333
static int progress;
3434
static enum { SIGNED_TAG_ABORT, VERBATIM, WARN, WARN_STRIP, STRIP } signed_tag_mode = SIGNED_TAG_ABORT;
3535
static enum { TAG_FILTERING_ABORT, DROP, REWRITE } tag_of_filtered_mode = TAG_FILTERING_ABORT;
36+
static enum { REENCODE_ABORT, REENCODE_PLEASE, REENCODE_NEVER } reencode_mode = REENCODE_ABORT;
3637
static int fake_missing_tagger;
3738
static int use_done_feature;
3839
static int no_data;
@@ -77,6 +78,20 @@ static int parse_opt_tag_of_filtered_mode(const struct option *opt,
7778
return 0;
7879
}
7980

81+
/*
 * Option callback for --reencode=<mode>: record in the file-scope
 * reencode_mode how commit messages carrying an explicit encoding are
 * to be handled.
 *
 *   unset, or arg "abort" -> REENCODE_ABORT
 *   arg "yes"             -> REENCODE_PLEASE
 *   arg "no"              -> REENCODE_NEVER
 *
 * The `opt` parameter is not consulted. Returns 0 when the mode was
 * recognized; for any other value of arg, reencode_mode is left untouched
 * and the result of error() is returned.
 */
static int parse_opt_reencode_mode(const struct option *opt,
82+
const char *arg, int unset)
83+
{
84+
if (unset || !strcmp(arg, "abort"))
85+
reencode_mode = REENCODE_ABORT;
86+
else if (!strcmp(arg, "yes"))
87+
reencode_mode = REENCODE_PLEASE;
88+
else if (!strcmp(arg, "no"))
89+
reencode_mode = REENCODE_NEVER;
90+
else
91+
return error("Unknown reencoding mode: %s", arg);
92+
return 0;
93+
}
94+
8095
static struct decoration idnums;
8196
static uint32_t last_idnum;
8297

@@ -453,7 +468,7 @@ static const char *find_encoding(const char *begin, const char *end)
453468
bol = memmem(begin, end ? end - begin : strlen(begin),
454469
needle, strlen(needle));
455470
if (!bol)
456-
return git_commit_encoding;
471+
return NULL;
457472
bol += strlen(needle);
458473
eol = strchrnul(bol, '\n');
459474
*eol = '\0';
@@ -633,18 +648,32 @@ static void handle_commit(struct commit *commit, struct rev_info *rev,
633648
}
634649

635650
mark_next_object(&commit->object);
636-
if (anonymize)
651+
if (anonymize) {
637652
reencoded = anonymize_commit_message(message);
638-
else if (!is_encoding_utf8(encoding))
639-
reencoded = reencode_string(message, "UTF-8", encoding);
653+
} else if (encoding) {
654+
switch(reencode_mode) {
655+
case REENCODE_PLEASE:
656+
reencoded = reencode_string(message, "UTF-8", encoding);
657+
break;
658+
case REENCODE_NEVER:
659+
break;
660+
case REENCODE_ABORT:
661+
die("Encountered commit-specific encoding %s in commit "
662+
"%s; use --reencode=<mode> to handle it",
663+
encoding, oid_to_hex(&commit->object.oid));
664+
}
665+
}
640666
if (!commit->parents)
641667
printf("reset %s\n", refname);
642668
printf("commit %s\nmark :%"PRIu32"\n", refname, last_idnum);
643669
if (show_original_ids)
644670
printf("original-oid %s\n", oid_to_hex(&commit->object.oid));
645-
printf("%.*s\n%.*s\ndata %u\n%s",
671+
printf("%.*s\n%.*s\n",
646672
(int)(author_end - author), author,
647-
(int)(committer_end - committer), committer,
673+
(int)(committer_end - committer), committer);
674+
if (!reencoded && encoding)
675+
printf("encoding %s\n", encoding);
676+
printf("data %u\n%s",
648677
(unsigned)(reencoded
649678
? strlen(reencoded) : message
650679
? strlen(message) : 0),
@@ -1088,6 +1117,9 @@ int cmd_fast_export(int argc, const char **argv, const char *prefix)
10881117
OPT_CALLBACK(0, "tag-of-filtered-object", &tag_of_filtered_mode, N_("mode"),
10891118
N_("select handling of tags that tag filtered objects"),
10901119
parse_opt_tag_of_filtered_mode),
1120+
OPT_CALLBACK(0, "reencode", &reencode_mode, N_("mode"),
1121+
N_("select handling of commit messages in an alternate encoding"),
1122+
parse_opt_reencode_mode),
10911123
OPT_STRING(0, "export-marks", &export_filename, N_("file"),
10921124
N_("Dump marks to this file")),
10931125
OPT_STRING(0, "import-marks", &import_filename, N_("file"),

fast-import.c

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2585,6 +2585,7 @@ static void parse_new_commit(const char *arg)
25852585
struct branch *b;
25862586
char *author = NULL;
25872587
char *committer = NULL;
2588+
const char *encoding = NULL;
25882589
struct hash_list *merge_list = NULL;
25892590
unsigned int merge_count;
25902591
unsigned char prev_fanout, new_fanout;
@@ -2607,6 +2608,8 @@ static void parse_new_commit(const char *arg)
26072608
}
26082609
if (!committer)
26092610
die("Expected committer but didn't get one");
2611+
if (skip_prefix(command_buf.buf, "encoding ", &encoding))
2612+
read_next_command();
26102613
parse_data(&msg, 0, NULL);
26112614
read_next_command();
26122615
parse_from(b);
@@ -2670,9 +2673,13 @@ static void parse_new_commit(const char *arg)
26702673
}
26712674
strbuf_addf(&new_data,
26722675
"author %s\n"
2673-
"committer %s\n"
2674-
"\n",
2676+
"committer %s\n",
26752677
author ? author : committer, committer);
2678+
if (encoding)
2679+
strbuf_addf(&new_data,
2680+
"encoding %s\n",
2681+
encoding);
2682+
strbuf_addch(&new_data, '\n');
26762683
strbuf_addbuf(&new_data, &msg);
26772684
free(author);
26782685
free(committer);

t/t9300-fast-import.sh

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3299,4 +3299,24 @@ test_expect_success !MINGW 'W: get-mark & empty orphan commit with erroneous thi
32993299
sed -e s/LFs/LLL/ W-input | tr L "\n" | test_must_fail git fast-import
33003300
'
33013301

3302+
###
3303+
### series X (other new features)
3304+
###
3305+
3306+
test_expect_success 'X: handling encoding' '
3307+
test_tick &&
3308+
cat >input <<-INPUT_END &&
3309+
commit refs/heads/encoding
3310+
committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE
3311+
encoding iso-8859-7
3312+
data <<COMMIT
3313+
INPUT_END
3314+
3315+
printf "Pi: \360\nCOMMIT\n" >>input &&
3316+
3317+
git fast-import <input &&
3318+
git cat-file -p encoding | grep $(printf "\360") &&
3319+
git log -1 --format=%B encoding | grep $(printf "\317\200")
3320+
'
3321+
33023322
test_done

t/t9350-fast-export.sh

Lines changed: 66 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -94,22 +94,80 @@ test_expect_success 'fast-export --show-original-ids | git fast-import' '
9494
test $MUSS = $(git rev-parse --verify refs/tags/muss)
9595
'
9696

97-
test_expect_success 'iso-8859-1' '
97+
test_expect_success 'reencoding iso-8859-7' '
9898
99-
git config i18n.commitencoding ISO8859-1 &&
100-
# use author and committer name in ISO-8859-1 to match it.
101-
. "$TEST_DIRECTORY"/t3901/8859-1.txt &&
99+
test_when_finished "git reset --hard HEAD~1" &&
100+
test_config i18n.commitencoding iso-8859-7 &&
102101
test_tick &&
103102
echo rosten >file &&
104-
git commit -s -m den file &&
105-
git fast-export wer^..wer >iso8859-1.fi &&
106-
sed "s/wer/i18n/" iso8859-1.fi |
103+
git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
104+
git fast-export --reencode=yes wer^..wer >iso-8859-7.fi &&
105+
sed "s/wer/i18n/" iso-8859-7.fi |
107106
(cd new &&
108107
git fast-import &&
108+
# The commit object, if not re-encoded, would be 240 bytes.
109+
# Removing the "encoding iso-8859-7\n" header drops 20 bytes.
110+
# Re-encoding the Pi character from \xF0 in iso-8859-7 to
111+
# \xCF\x80 in utf-8 adds a byte. Grepping for specific bytes
112+
# would be nice, but Windows apparently munges user data
113+
# in the form of bytes on the command line to force them to
114+
# be characters instead, so we are limited for portability
115+
# reasons in subsequent similar tests in this file to check
116+
# for size rather than what bytes are present.
117+
test 221 -eq "$(git cat-file -s i18n)" &&
118+
# Also make sure the commit does not have the "encoding" header
109119
git cat-file commit i18n >actual &&
110-
grep "Áéí óú" actual)
120+
! grep ^encoding actual)
121+
'
122+
123+
test_expect_success 'aborting on iso-8859-7' '
111124
125+
test_when_finished "git reset --hard HEAD~1" &&
126+
test_config i18n.commitencoding iso-8859-7 &&
127+
echo rosten >file &&
128+
git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
129+
test_must_fail git fast-export --reencode=abort wer^..wer >iso-8859-7.fi
112130
'
131+
132+
test_expect_success 'preserving iso-8859-7' '
133+
134+
test_when_finished "git reset --hard HEAD~1" &&
135+
test_config i18n.commitencoding iso-8859-7 &&
136+
echo rosten >file &&
137+
git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
138+
git fast-export --reencode=no wer^..wer >iso-8859-7.fi &&
139+
sed "s/wer/i18n-no-recoding/" iso-8859-7.fi |
140+
(cd new &&
141+
git fast-import &&
142+
# The commit object, if not re-encoded, is 240 bytes.
143+
# Removing the "encoding iso-8859-7\n" header would drop 20
144+
# bytes. Re-encoding the Pi character from \xF0 in
145+
# iso-8859-7 to \xCF\x80 in utf-8 would add a byte. I would
146+
# grep for the specific bytes, but Windows lamely does not
147+
# allow that, so just search for the expected size.
148+
test 240 -eq "$(git cat-file -s i18n-no-recoding)" &&
149+
# Also make sure the commit has the "encoding" header
150+
git cat-file commit i18n-no-recoding >actual &&
151+
grep ^encoding actual)
152+
'
153+
154+
test_expect_success 'encoding preserved if reencoding fails' '
155+
156+
test_when_finished "git reset --hard HEAD~1" &&
157+
test_config i18n.commitencoding iso-8859-7 &&
158+
echo rosten >file &&
159+
git commit -s -F "$TEST_DIRECTORY/t9350/broken-iso-8859-7-commit-message.txt" file &&
160+
git fast-export --reencode=yes wer^..wer >iso-8859-7.fi &&
161+
sed "s/wer/i18n-invalid/" iso-8859-7.fi |
162+
(cd new &&
163+
git fast-import &&
164+
git cat-file commit i18n-invalid >actual &&
165+
grep ^encoding actual &&
166+
# Also verify that the commit has the expected size; i.e.
167+
# that no bytes were re-encoded to a different encoding.
168+
test 252 -eq "$(git cat-file -s i18n-invalid)")
169+
'
170+
113171
test_expect_success 'import/export-marks' '
114172
115173
git checkout -b marks master &&
@@ -224,7 +282,6 @@ GIT_COMMITTER_NAME='C O Mitter'; export GIT_COMMITTER_NAME
224282

225283
test_expect_success 'setup copies' '
226284
227-
git config --unset i18n.commitencoding &&
228285
git checkout -b copy rein &&
229286
git mv file file3 &&
230287
git commit -m move1 &&
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Pi: �; Invalid: �
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Pi: �

0 commit comments

Comments
 (0)