Skip to content

Commit e6658b9

Browse files
committed
Merge branch 'ks/rfc2047-one-char-at-a-time' into maint
When "format-patch" quoted non-ASCII strings in header fields, it applied RFC 2047 encoding incorrectly and could split a single multi-octet character in the middle. * ks/rfc2047-one-char-at-a-time: format-patch: RFC 2047 says multi-octet character may not be split
2 parents a9dc3b6 + 6cd3c05 commit e6658b9

File tree

4 files changed

+77
-25
lines changed

4 files changed

+77
-25
lines changed

pretty.c

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@ static int needs_rfc2047_encoding(const char *line, int len,
345345
return 0;
346346
}
347347

348-
static void add_rfc2047(struct strbuf *sb, const char *line, int len,
348+
static void add_rfc2047(struct strbuf *sb, const char *line, size_t len,
349349
const char *encoding, enum rfc2047_type type)
350350
{
351351
static const int max_encoded_length = 76; /* per rfc2047 */
@@ -355,9 +355,22 @@ static void add_rfc2047(struct strbuf *sb, const char *line, int len,
355355
strbuf_grow(sb, len * 3 + strlen(encoding) + 100);
356356
strbuf_addf(sb, "=?%s?q?", encoding);
357357
line_len += strlen(encoding) + 5; /* 5 for =??q? */
358-
for (i = 0; i < len; i++) {
359-
unsigned ch = line[i] & 0xFF;
360-
int is_special = is_rfc2047_special(ch, type);
358+
359+
while (len) {
360+
/*
361+
* RFC 2047, section 5 (3):
362+
*
363+
* Each 'encoded-word' MUST represent an integral number of
364+
* characters. A multi-octet character may not be split across
365+
* adjacent 'encoded-word's.
366+
*/
367+
const unsigned char *p = (const unsigned char *)line;
368+
int chrlen = mbs_chrlen(&line, &len, encoding);
369+
int is_special = (chrlen > 1) || is_rfc2047_special(*p, type);
370+
371+
/* "=%02X" * chrlen, or the byte itself */
372+
const char *encoded_fmt = is_special ? "=%02X" : "%c";
373+
int encoded_len = is_special ? 3 * chrlen : 1;
361374

362375
/*
363376
* According to RFC 2047, we could encode the special character
@@ -367,18 +380,15 @@ static void add_rfc2047(struct strbuf *sb, const char *line, int len,
367380
* causes ' ' to be encoded as '=20', avoiding this problem.
368381
*/
369382

370-
if (line_len + 2 + (is_special ? 3 : 1) > max_encoded_length) {
383+
if (line_len + encoded_len + 2 > max_encoded_length) {
384+
/* It won't fit with trailing "?=" --- break the line */
371385
strbuf_addf(sb, "?=\n =?%s?q?", encoding);
372386
line_len = strlen(encoding) + 5 + 1; /* =??q? plus SP */
373387
}
374388

375-
if (is_special) {
376-
strbuf_addf(sb, "=%02X", ch);
377-
line_len += 3;
378-
} else {
379-
strbuf_addch(sb, ch);
380-
line_len++;
381-
}
389+
for (i = 0; i < chrlen; i++)
390+
strbuf_addf(sb, encoded_fmt, p[i]);
391+
line_len += encoded_len;
382392
}
383393
strbuf_addstr(sb, "?=");
384394
}

t/t4014-format-patch.sh

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -837,25 +837,26 @@ Subject: [PATCH] =?UTF-8?q?f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f?=
837837
=?UTF-8?q?=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar?=
838838
=?UTF-8?q?=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20?=
839839
=?UTF-8?q?bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6?=
840-
=?UTF-8?q?=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3?=
841-
=?UTF-8?q?=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6?=
842-
=?UTF-8?q?=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3?=
843-
=?UTF-8?q?=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f?=
840+
=?UTF-8?q?=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6?=
841+
=?UTF-8?q?=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f?=
844842
=?UTF-8?q?=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar?=
845843
=?UTF-8?q?=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20?=
846844
=?UTF-8?q?bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6?=
847-
=?UTF-8?q?=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3?=
848-
=?UTF-8?q?=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6?=
849-
=?UTF-8?q?=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3?=
850-
=?UTF-8?q?=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f?=
845+
=?UTF-8?q?=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6?=
846+
=?UTF-8?q?=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f?=
851847
=?UTF-8?q?=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar?=
852848
=?UTF-8?q?=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20?=
853849
=?UTF-8?q?bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6?=
854-
=?UTF-8?q?=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3?=
855-
=?UTF-8?q?=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6?=
856-
=?UTF-8?q?=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3?=
857-
=?UTF-8?q?=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f?=
858-
=?UTF-8?q?=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar?=
850+
=?UTF-8?q?=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6?=
851+
=?UTF-8?q?=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f?=
852+
=?UTF-8?q?=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar?=
853+
=?UTF-8?q?=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20?=
854+
=?UTF-8?q?bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6?=
855+
=?UTF-8?q?=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6?=
856+
=?UTF-8?q?=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f?=
857+
=?UTF-8?q?=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar?=
858+
=?UTF-8?q?=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20bar=20f=C3=B6=C3=B6=20?=
859+
=?UTF-8?q?bar?=
859860
EOF
860861
test_expect_success 'format-patch wraps extremely long subject (rfc2047)' '
861862
rm -rf patches/ &&

utf8.c

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -531,3 +531,42 @@ char *reencode_string(const char *in, const char *out_encoding, const char *in_e
531531
return out;
532532
}
533533
#endif
534+
535+
/*
536+
* Returns the length in bytes of the first character of the multi-byte
537+
* string `text`, interpreted according to `encoding`.
538+
*
539+
* - The `text` pointer is updated to point at the next character.
540+
* - When `remainder_p` is not NULL, on entry `*remainder_p` is how many bytes
541+
* we can consume from text, and on exit `*remainder_p` is reduced by the returned
542+
* character length. Otherwise `text` is treated as limited by NUL.
543+
*/
544+
int mbs_chrlen(const char **text, size_t *remainder_p, const char *encoding)
545+
{
546+
int chrlen;
547+
const char *p = *text;
548+
size_t r = (remainder_p ? *remainder_p : SIZE_MAX); /* SIZE_MAX: no explicit byte limit given */
549+
550+
if (r < 1)
551+
return 0; /* no bytes left to consume; `text` and `*remainder_p` stay untouched */
552+
553+
if (is_encoding_utf8(encoding)) {
554+
pick_one_utf8_char(&p, &r);
555+
556+
chrlen = p ? (p - *text)
557+
: 1 /* not valid UTF-8 -> raw byte sequence */;
558+
}
559+
else {
560+
/*
561+
* TODO: use iconv to decode one char and obtain its chrlen;
562+
* for now, treat every encoding other than UTF-8 as single-byte
563+
*/
564+
chrlen = 1;
565+
}
566+
567+
/* advance the caller's cursor past the character just measured */
*text += chrlen;
568+
if (remainder_p)
569+
*remainder_p -= chrlen;
570+
571+
return chrlen;
572+
}

utf8.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,6 @@ char *reencode_string(const char *in, const char *out_encoding, const char *in_e
2222
#define reencode_string(a,b,c) NULL
2323
#endif
2424

25+
int mbs_chrlen(const char **text, size_t *remainder_p, const char *encoding);
26+
2527
#endif

0 commit comments

Comments
 (0)