Skip to content

Commit 31dfe0d

Browse files
committed
syslogd: fix UTF-8 handling with -8 flag, for RFC5424 compliance
The -8 flag was designed to preserve 8-bit data but failed with multi-byte UTF-8 sequences like em-dash (—). The parsemsg_remove_unsafe_characters() function processed UTF-8 byte-by-byte, corrupting sequences even with -8.

Changes:
- Add UTF-8 sequence detection and validation functions
- Preserve complete valid UTF-8 sequences when -8 flag is used
- Support UTF-8 BOM per RFC5424 requirements
- Maintain backward compatibility and security filtering

Fixes #105

Signed-off-by: Joachim Wiberg <[email protected]>
1 parent c6b48b2 commit 31dfe0d

File tree

1 file changed

+93
-2
lines changed

1 file changed

+93
-2
lines changed

src/syslogd.c

Lines changed: 93 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -997,18 +997,109 @@ void untty(void)
997997
#endif
998998
}
999999

1000+
/*
 * Returns the length in bytes (1-4) of the UTF-8 sequence introduced by
 * the lead byte at *in, or 0 if that byte cannot start a sequence.
 *
 * Only the lead byte is inspected; continuation bytes are not checked
 * here.  Lead bytes that can never occur in well-formed UTF-8 (RFC 3629)
 * are rejected: stray continuation bytes 0x80-0xBF, the always-overlong
 * 0xC0 and 0xC1, and 0xF5-0xFF which would encode beyond U+10FFFF.
 */
static size_t
utf8_len(const unsigned char *in)
{
	unsigned char c = *in;

	if (c < 0x80)
		return 1;	/* ASCII: 0xxxxxxx */
	if (c >= 0xC2 && c <= 0xDF)
		return 2;	/* 110xxxxx, excluding overlong C0/C1 */
	if (c >= 0xE0 && c <= 0xEF)
		return 3;	/* 1110xxxx */
	if (c >= 0xF0 && c <= 0xF4)
		return 4;	/* 11110xxx, capped at lead byte F4 */

	return 0;		/* Invalid start byte */
}
1016+
1017+
/*
 * Validates a complete UTF-8 sequence of the given length.
 *
 * in  - pointer to the lead byte; in[0] .. in[len - 1] are read, so the
 *       caller must make sure that many bytes are available.
 * len - expected sequence length in bytes; anything other than 1-4 is
 *       rejected.
 *
 * Rejects lead bytes that do not match len, malformed continuation
 * bytes, overlong encodings, UTF-16 surrogates (U+D800-U+DFFF) and code
 * points above U+10FFFF.
 *
 * Returns 1 if valid, 0 if invalid.
 */
static int
utf8_valid(const unsigned char *in, size_t len)
{
	size_t i;

	switch (len) {
	case 1:
		if ((*in & 0x80) != 0x00)
			return 0;
		break;
	case 2:
		if ((*in & 0xE0) != 0xC0)
			return 0;
		if (*in < 0xC2)
			return 0;	/* Overlong encoding */
		break;
	case 3:
		if ((*in & 0xF0) != 0xE0)
			return 0;
		break;
	case 4:
		if ((*in & 0xF8) != 0xF0)
			return 0;
		if (*in > 0xF4)
			return 0;	/* Beyond Unicode range */
		break;
	default:
		return 0;
	}

	/* Check continuation bytes, each must be 10xxxxxx */
	for (i = 1; i < len; i++) {
		if ((in[i] & 0xC0) != 0x80)
			return 0;
	}

	/*
	 * The second byte narrows the range for a few lead bytes.  It is
	 * already known to be in 0x80-0xBF from the loop above.
	 */
	if (len == 3 && *in == 0xE0 && in[1] < 0xA0)
		return 0;	/* Overlong: would encode < U+0800 */
	if (len == 3 && *in == 0xED && in[1] > 0x9F)
		return 0;	/* Surrogates U+D800-U+DFFF */
	if (len == 4 && *in == 0xF0 && in[1] < 0x90)
		return 0;	/* Overlong: would encode < U+10000 */
	if (len == 4 && *in == 0xF4 && in[1] > 0x8F)
		return 0;	/* Beyond U+10FFFF */

	return 1;
}
1064+
10001065
/*
10011066
* Removes characters from log messages that are unsafe to display.
1002-
* TODO: Permit UTF-8 strings that include a BOM per RFC 5424?
1067+
* Preserves valid UTF-8 sequences, including BOM, with -8 flag.
10031068
*/
10041069
static void
10051070
parsemsg_remove_unsafe_characters(const char *in, char *out, size_t outlen)
10061071
{
1072+
const unsigned char *p = (const unsigned char *)in;
10071073
char *q;
10081074
int c;
10091075

10101076
q = out;
1011-
while ((c = (unsigned char)*in++) != '\0' && q < out + outlen - 4) {
1077+
while (*p && q < out + outlen - 4) {
1078+
/* When -8 flag is used, try UTF-8 processing first */
1079+
if (!mask_C1 && (*p & 0x80)) {
1080+
size_t len = utf8_len(p);
1081+
1082+
/* Check if we have a complete UTF-8 sequence */
1083+
if (len > 0 && q + len < out + outlen) {
1084+
size_t i = 0;
1085+
1086+
/* Ensure we have enough input bytes */
1087+
while (i < len && p[i] != '\0')
1088+
i++;
1089+
1090+
if (i == len && utf8_valid(p, len)) {
1091+
/* Copy the entire valid UTF-8 sequence */
1092+
for (i = 0; i < len; i++)
1093+
*q++ = *p++;
1094+
continue;
1095+
}
1096+
}
1097+
/* Fall through to byte-by-byte processing for invalid UTF-8 */
1098+
}
1099+
1100+
/* Byte-by-byte processing */
1101+
c = *p++;
1102+
10121103
if (mask_C1 && (c & 0x80) && c < 0xA0) {
10131104
c &= 0x7F;
10141105
*q++ = 'M';

0 commit comments

Comments
 (0)