@@ -1112,8 +1112,92 @@ int commit_tree(const struct strbuf *msg, unsigned char *tree,
1112
1112
return result ;
1113
1113
}
1114
1114
1115
/*
 * Scan the first 'len' bytes of 'buf' for a byte sequence that is not
 * well-formed UTF-8 (per RFC 3629).
 *
 * Returns the offset of the lead byte of the first offending sequence,
 * or -1 if the whole range is valid.  A returned offset always refers
 * to a byte with the high bit set; plain US-ASCII is never flagged.
 *
 * A sequence is rejected when:
 *  - the lead byte is a stray continuation byte (10xxxxxx),
 *  - the lead byte announces more than 3 continuation bytes (such
 *    sequences can only encode values beyond U+10FFFF),
 *  - the buffer ends before the announced continuation bytes,
 *  - a continuation byte does not look like 10xxxxxx,
 *  - the decoded value could have been encoded in fewer bytes
 *    (overlong form) or exceeds U+10FFFF,
 *  - the decoded value is a UTF-16 surrogate (U+D800..U+DFFF).
 */
static int find_invalid_utf8(const char *buf, int len)
{
	/* Largest code point representable with N continuation bytes. */
	static const unsigned int max_codepoint[] = {
		0x7f, 0x7ff, 0xffff, 0x10ffff
	};
	int offset = 0;

	/* "> 0" rather than "!= 0" so a negative length cannot run away. */
	while (len > 0) {
		unsigned char c = *buf++;
		int bytes, bad_offset;
		unsigned int codepoint;
		unsigned int min_val, max_val;

		len--;
		offset++;

		/* Simple US-ASCII? No worries. */
		if (c < 0x80)
			continue;

		bad_offset = offset - 1;

		/*
		 * Count how many more high bits are set: that's how
		 * many more bytes this sequence should have.  'c' is
		 * an unsigned char, so the shifted-out bits are lost
		 * and the loop ends after at most seven iterations.
		 */
		bytes = 0;
		while (c & 0x40) {
			c <<= 1;
			bytes++;
		}

		/*
		 * Must be between 1 and 3 more bytes.  Zero means a
		 * stray continuation byte; four or more can only
		 * encode values beyond U+10FFFF.
		 */
		if (bytes < 1 || bytes > 3)
			return bad_offset;

		/* Do we *have* that many bytes? */
		if (len < bytes)
			return bad_offset;

		/*
		 * The lead byte's payload bits were shifted left
		 * 'bytes' times above; move them back to the bottom
		 * and look up the valid range for this length.
		 */
		codepoint = (c & 0x7f) >> bytes;
		min_val = max_codepoint[bytes - 1] + 1;
		max_val = max_codepoint[bytes];

		offset += bytes;
		len -= bytes;

		/* Verify the continuation bytes and accumulate their bits */
		do {
			unsigned char cont = *buf++;

			if ((cont & 0xc0) != 0x80)
				return bad_offset;
			codepoint = (codepoint << 6) | (cont & 0x3f);
		} while (--bytes);

		/* Overlong form, or beyond the end of Unicode? */
		if (codepoint < min_val || codepoint > max_val)
			return bad_offset;

		/* Surrogates belong to UTF-16 and must not appear in UTF-8 */
		if ((codepoint & 0x1ff800) == 0xd800)
			return bad_offset;
	}
	return -1;
}
1163
+
1164
+ /*
1165
+ * This verifies that the buffer is in proper utf8 format.
1166
+ *
1167
+ * If it isn't, it assumes any non-utf8 characters are Latin1,
1168
+ * and does the conversion.
1169
+ *
1170
+ * Fixme: we should probably also disallow overlong forms and
1171
+ * invalid characters. But we don't do that currently.
1172
+ */
1173
+ static int verify_utf8 (struct strbuf * buf )
1174
+ {
1175
+ int ok = 1 ;
1176
+ long pos = 0 ;
1177
+
1178
+ for (;;) {
1179
+ int bad ;
1180
+ unsigned char c ;
1181
+ unsigned char replace [2 ];
1182
+
1183
+ bad = find_invalid_utf8 (buf -> buf + pos , buf -> len - pos );
1184
+ if (bad < 0 )
1185
+ return ok ;
1186
+ pos += bad ;
1187
+ ok = 0 ;
1188
+ c = buf -> buf [pos ];
1189
+ strbuf_remove (buf , pos , 1 );
1190
+
1191
+ /* We know 'c' must be in the range 128-255 */
1192
+ replace [0 ] = 0xc0 + (c >> 6 );
1193
+ replace [1 ] = 0x80 + (c & 0x3f );
1194
+ strbuf_insert (buf , pos , replace , 2 );
1195
+ pos += 2 ;
1196
+ }
1197
+ }
1198
+
1115
1199
/*
 * Warning text handed to fprintf(stderr, ...) by commit_tree_extended()
 * when the commit encoding is UTF-8 and verify_utf8() returns 0 for the
 * assembled commit buffer.
 */
static const char commit_utf8_warn [] =
1116
- "Warning: commit message does not conform to UTF-8.\n"
1200
+ "Warning: commit message did not conform to UTF-8.\n"
1117
1201
"You may want to amend it after fixing the message, or set the config\n"
1118
1202
"variable i18n.commitencoding to the encoding your project uses.\n" ;
1119
1203
@@ -1170,7 +1254,7 @@ int commit_tree_extended(const struct strbuf *msg, unsigned char *tree,
1170
1254
strbuf_addbuf (& buffer , msg );
1171
1255
1172
1256
/* And check the encoding */
1173
- if (encoding_is_utf8 && !is_utf8 ( buffer . buf ))
1257
+ if (encoding_is_utf8 && !verify_utf8 ( & buffer ))
1174
1258
fprintf (stderr , commit_utf8_warn );
1175
1259
1176
1260
if (sign_commit && do_sign_commit (& buffer , sign_commit ))
0 commit comments