From 91629c66c444ae095d837d8f8630b848c1eb81c2 Mon Sep 17 00:00:00 2001 From: German Osin Date: Wed, 14 May 2025 14:21:51 +0200 Subject: [PATCH 1/4] ISSUE-919 Best efforts to convert header into string --- .../ui/serdes/ConsumerRecordDeserializer.java | 3 +- .../java/io/kafbat/ui/util/ContentUtils.java | 115 ++++++++++++++++++ .../io/kafbat/ui/util/ContentUtilsTest.java | 64 ++++++++++ 3 files changed, 181 insertions(+), 1 deletion(-) create mode 100644 api/src/main/java/io/kafbat/ui/util/ContentUtils.java create mode 100644 api/src/test/java/io/kafbat/ui/util/ContentUtilsTest.java diff --git a/api/src/main/java/io/kafbat/ui/serdes/ConsumerRecordDeserializer.java b/api/src/main/java/io/kafbat/ui/serdes/ConsumerRecordDeserializer.java index b4eb51493..854f57b9e 100644 --- a/api/src/main/java/io/kafbat/ui/serdes/ConsumerRecordDeserializer.java +++ b/api/src/main/java/io/kafbat/ui/serdes/ConsumerRecordDeserializer.java @@ -3,6 +3,7 @@ import io.kafbat.ui.model.TopicMessageDTO; import io.kafbat.ui.model.TopicMessageDTO.TimestampTypeEnum; import io.kafbat.ui.serde.api.Serde; +import io.kafbat.ui.util.ContentUtils; import java.time.Instant; import java.time.OffsetDateTime; import java.time.ZoneId; @@ -68,7 +69,7 @@ private void fillHeaders(TopicMessageDTO message, ConsumerRecord r .forEachRemaining(header -> headers.put( header.key(), - header.value() != null ? new String(header.value()) : null + ContentUtils.convertToString(header.value()) )); message.setHeaders(headers); } diff --git a/api/src/main/java/io/kafbat/ui/util/ContentUtils.java b/api/src/main/java/io/kafbat/ui/util/ContentUtils.java new file mode 100644 index 000000000..5ff20b359 --- /dev/null +++ b/api/src/main/java/io/kafbat/ui/util/ContentUtils.java @@ -0,0 +1,115 @@ +package io.kafbat.ui.util; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.regex.Pattern; + +/** + * Inspired by: https://github.com/tchiotludo/akhq/blob/dev/src/main/java/org/akhq/utils/ContentUtils.java + */ +public class ContentUtils { + private static final byte[] HEX_ARRAY = "0123456789ABCDEF".getBytes(StandardCharsets.US_ASCII); + + private static final Pattern UTF8_PATTERN = Pattern.compile("\\A(\n" + + " [\\x09\\x0A\\x0D\\x20-\\x7E] # ASCII\\n" + + "| [\\xC2-\\xDF][\\x80-\\xBF] # non-overlong 2-byte\n" + + "| \\xE0[\\xA0-\\xBF][\\x80-\\xBF] # excluding overlongs\n" + + "| [\\xE1-\\xEC\\xEE\\xEF][\\x80-\\xBF]{2} # straight 3-byte\n" + + "| \\xED[\\x80-\\x9F][\\x80-\\xBF] # excluding surrogates\n" + + "| \\xF0[\\x90-\\xBF][\\x80-\\xBF]{2} # planes 1-3\n" + + "| [\\xF1-\\xF3][\\x80-\\xBF]{3} # planes 4-15\n" + + "| \\xF4[\\x80-\\x8F][\\x80-\\xBF]{2} # plane 16\n" + + ")*\\z", Pattern.COMMENTS); + + private ContentUtils() { + } + + /** + * Detects if bytes contain a UTF-8 string or something else + * Source: https://stackoverflow.com/questions/1193200/how-can-i-check-whether-a-byte-array-contains-a-unicode-string-in-java + * @param value the bytes to test for a UTF-8 encoded {@code java.lang.String} value + * @return true, if the byte[] contains a UTF-8 encode {@code java.lang.String} + */ + public static boolean isValidUtf8(byte[] value) { + //If the array is too long, it throws a StackOverflowError due to the regex, so we assume it is a String. + if (value.length <= 1000) { + String phonyString = new String(value, StandardCharsets.ISO_8859_1); + return UTF8_PATTERN.matcher(phonyString).matches(); + } + return true; + } + + /** + * Converts bytes to long. + * + * @param value the bytes to convert in to a long + * @return the long build from the given bytes + */ + public static Long asLong(byte[] value) { + return value != null ? ByteBuffer.wrap(value).getLong() : null; + } + + /** + * Converts the given bytes to {@code int}. + * + * @param value the bytes to convert into a {@code int} + * @return the {@code int} build from the given bytes + */ + public static Integer asInt(byte[] value) { + return value != null ? ByteBuffer.wrap(value).getInt() : null; + } + + /** + * Converts the given bytes to {@code short}. + * + * @param value the bytes to convert into a {@code short} + * @return the {@code short} build from the given bytes + */ + public static Short asShort(byte[] value) { + return value != null ? ByteBuffer.wrap(value).getShort() : null; + } + + /** + * Converts the given bytes either into a {@code java.lang.string}, {@code int}, + * {@code long} or {@code short} depending on the content it contains. + * @param value the bytes to convert + * @return the value as an {@code java.lang.string}, {@code int}, {@code long} or {@code short} + */ + public static String convertToString(byte[] value) { + String valueAsString = null; + + if (value != null) { + try { + if (ContentUtils.isValidUtf8(value)) { + valueAsString = new String(value); + } else { + try { + valueAsString = String.valueOf(ContentUtils.asLong(value)); + } catch (Exception e) { + try { + valueAsString = String.valueOf(ContentUtils.asInt(value)); + } catch (Exception ex) { + valueAsString = String.valueOf(ContentUtils.asShort(value)); + } + } + } + } catch (Exception ex) { + // Show the header as hexadecimal string + valueAsString = bytesToHex(value); + } + } + return valueAsString; + } + + // https://stackoverflow.com/questions/9655181/java-convert-a-byte-array-to-a-hex-string + public static String bytesToHex(byte[] bytes) { + byte[] hexChars = new byte[bytes.length * 2]; + for (int j = 0; j < bytes.length; j++) { + int v = bytes[j] & 0xFF; + hexChars[j * 2] = HEX_ARRAY[v >>> 4]; + hexChars[j * 2 + 1] = HEX_ARRAY[v & 0x0F]; + } + return new String(hexChars, StandardCharsets.UTF_8); + } + +} diff --git a/api/src/test/java/io/kafbat/ui/util/ContentUtilsTest.java b/api/src/test/java/io/kafbat/ui/util/ContentUtilsTest.java new file mode 100644 index 000000000..fa2926e3d --- /dev/null +++ b/api/src/test/java/io/kafbat/ui/util/ContentUtilsTest.java @@ -0,0 +1,64 @@ +package io.kafbat.ui.util; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import org.apache.commons.lang3.RandomStringUtils; +import org.junit.jupiter.api.Test; + +public class ContentUtilsTest { + + private static byte[] toBytes(Short value) { + ByteBuffer buffer = ByteBuffer.allocate(Short.BYTES); + buffer.putShort(value); + return buffer.array(); + } + + private static byte[] toBytes(Integer value) { + ByteBuffer buffer = ByteBuffer.allocate(Integer.BYTES); + buffer.putInt(value); + return buffer.array(); + } + + private static byte[] toBytes(Long value) { + ByteBuffer buffer = ByteBuffer.allocate(Long.BYTES); + buffer.putLong(value); + return buffer.array(); + } + + @Test + void testHeaderValueStringUtf8() { + String testValue = "Test"; + + assertEquals(testValue, ContentUtils.convertToString(testValue.getBytes(StandardCharsets.UTF_8))); + } + + @Test + void testHeaderValueInteger() { + int testValue = 1; + assertEquals(String.valueOf(testValue), ContentUtils.convertToString(toBytes(testValue))); + } + + @Test + void testHeaderValueLong() { + long testValue = 111L; + + assertEquals(String.valueOf(testValue), ContentUtils.convertToString(toBytes(testValue))); + } + + @Test + void testHeaderValueShort() { + short testValue = 10; + + assertEquals(String.valueOf(testValue), ContentUtils.convertToString(toBytes(testValue))); + } + + @Test + void testHeaderValueLongStringUtf8() { + String testValue = RandomStringUtils.random(10000, true, false); + + assertEquals(testValue, ContentUtils.convertToString(testValue.getBytes(StandardCharsets.UTF_8))); + } + +} From 02959bbd82a963ff0ec85a126d50f9836da28526 Mon Sep 17 00:00:00 2001 From: German Osin Date: Wed, 14 May 2025 14:36:24 +0200 Subject: [PATCH 2/4] Fixed sonar issues --- .../java/io/kafbat/ui/util/ContentUtils.java | 37 ++++++++++--------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/api/src/main/java/io/kafbat/ui/util/ContentUtils.java b/api/src/main/java/io/kafbat/ui/util/ContentUtils.java index 5ff20b359..100de80d1 100644 --- a/api/src/main/java/io/kafbat/ui/util/ContentUtils.java +++ b/api/src/main/java/io/kafbat/ui/util/ContentUtils.java @@ -10,16 +10,19 @@ public class ContentUtils { private static final byte[] HEX_ARRAY = "0123456789ABCDEF".getBytes(StandardCharsets.US_ASCII); - private static final Pattern UTF8_PATTERN = Pattern.compile("\\A(\n" - + " [\\x09\\x0A\\x0D\\x20-\\x7E] # ASCII\\n" - + "| [\\xC2-\\xDF][\\x80-\\xBF] # non-overlong 2-byte\n" - + "| \\xE0[\\xA0-\\xBF][\\x80-\\xBF] # excluding overlongs\n" - + "| [\\xE1-\\xEC\\xEE\\xEF][\\x80-\\xBF]{2} # straight 3-byte\n" - + "| \\xED[\\x80-\\x9F][\\x80-\\xBF] # excluding surrogates\n" - + "| \\xF0[\\x90-\\xBF][\\x80-\\xBF]{2} # planes 1-3\n" - + "| [\\xF1-\\xF3][\\x80-\\xBF]{3} # planes 4-15\n" - + "| \\xF4[\\x80-\\x8F][\\x80-\\xBF]{2} # plane 16\n" - + ")*\\z", Pattern.COMMENTS); + private static final String UTF8_REGEX = """ + \\A([\\x09\\x0A\\x0D\\x20-\\x7E] # ASCII + | [\\xC2-\\xDF][\\x80-\\xBF] # non-overlong 2-byte + | \\xE0[\\xA0-\\xBF][\\x80-\\xBF] # excluding overlongs + | [\\xE1-\\xEC\\xEE\\xEF][\\x80-\\xBF]{2} # straight 3-byte + | \\xED[\\x80-\\x9F][\\x80-\\xBF] # excluding surrogates + | \\xF0[\\x90-\\xBF][\\x80-\\xBF]{2} # planes 1-3 + | [\\xF1-\\xF3][\\x80-\\xBF]{3} # planes 4-15 + | \\xF4[\\x80-\\x8F][\\x80-\\xBF]{2} # plane 16 + )*\\z + """.trim(); + + private static final Pattern UTF8_PATTERN = Pattern.compile(UTF8_REGEX, Pattern.COMMENTS); private ContentUtils() { } @@ -83,14 +86,14 @@ public static String convertToString(byte[] value) { if (ContentUtils.isValidUtf8(value)) { valueAsString = new String(value); } else { - try { + if (value.length == 8) { valueAsString = String.valueOf(ContentUtils.asLong(value)); - } catch (Exception e) { - try { - valueAsString = String.valueOf(ContentUtils.asInt(value)); - } catch (Exception ex) { - valueAsString = String.valueOf(ContentUtils.asShort(value)); - } + } else if (value.length == 4) { + valueAsString = String.valueOf(ContentUtils.asInt(value)); + } else if (value.length == 2) { + valueAsString = String.valueOf(ContentUtils.asShort(value)); + } else { + valueAsString = bytesToHex(value); } } } catch (Exception ex) { From 1a841d76de8e3b8387f8429583c6c482afc22241 Mon Sep 17 00:00:00 2001 From: German Osin Date: Wed, 14 May 2025 14:45:58 +0200 Subject: [PATCH 3/4] Performance improvements --- .../java/io/kafbat/ui/util/ContentUtils.java | 61 ++++++++++++------- 1 file changed, 38 insertions(+), 23 deletions(-) diff --git a/api/src/main/java/io/kafbat/ui/util/ContentUtils.java b/api/src/main/java/io/kafbat/ui/util/ContentUtils.java index 100de80d1..fbd81092c 100644 --- a/api/src/main/java/io/kafbat/ui/util/ContentUtils.java +++ b/api/src/main/java/io/kafbat/ui/util/ContentUtils.java @@ -10,34 +10,49 @@ public class ContentUtils { private static final byte[] HEX_ARRAY = "0123456789ABCDEF".getBytes(StandardCharsets.US_ASCII); - private static final String UTF8_REGEX = """ - \\A([\\x09\\x0A\\x0D\\x20-\\x7E] # ASCII - | [\\xC2-\\xDF][\\x80-\\xBF] # non-overlong 2-byte - | \\xE0[\\xA0-\\xBF][\\x80-\\xBF] # excluding overlongs - | [\\xE1-\\xEC\\xEE\\xEF][\\x80-\\xBF]{2} # straight 3-byte - | \\xED[\\x80-\\x9F][\\x80-\\xBF] # excluding surrogates - | \\xF0[\\x90-\\xBF][\\x80-\\xBF]{2} # planes 1-3 - | [\\xF1-\\xF3][\\x80-\\xBF]{3} # planes 4-15 - | \\xF4[\\x80-\\x8F][\\x80-\\xBF]{2} # plane 16 - )*\\z - """.trim(); - - private static final Pattern UTF8_PATTERN = Pattern.compile(UTF8_REGEX, Pattern.COMMENTS); - private ContentUtils() { } /** - * Detects if bytes contain a UTF-8 string or something else - * Source: https://stackoverflow.com/questions/1193200/how-can-i-check-whether-a-byte-array-contains-a-unicode-string-in-java + * Detects if bytes contain a UTF-8 string or something else. * @param value the bytes to test for a UTF-8 encoded {@code java.lang.String} value * @return true, if the byte[] contains a UTF-8 encode {@code java.lang.String} */ public static boolean isValidUtf8(byte[] value) { - //If the array is too long, it throws a StackOverflowError due to the regex, so we assume it is a String. - if (value.length <= 1000) { - String phonyString = new String(value, StandardCharsets.ISO_8859_1); - return UTF8_PATTERN.matcher(phonyString).matches(); + // Any data exceeding 10KB will be treated as a string. + if (value.length > 10_000) { + return true; + } + int i = 0; + while (i < value.length) { + int b = value[i] & 0xFF; + int numBytes; + if ((b & 0x80) == 0) { + // 1-byte (ASCII) + numBytes = 1; + } else if ((b & 0xE0) == 0xC0) { + // 2-byte sequence + numBytes = 2; + } else if ((b & 0xF0) == 0xE0) { + // 3-byte sequence + numBytes = 3; + } else if ((b & 0xF8) == 0xF0) { + // 4-byte sequence + numBytes = 4; + } else { + // Invalid first byte + return false; + } + if (i + numBytes > value.length) { + return false; + } + // Check continuation bytes + for (int j = 1; j < numBytes; j++) { + if ((value[i + j] & 0xC0) != 0x80) { + return false; + } + } + i += numBytes; } return true; } @@ -86,11 +101,11 @@ public static String convertToString(byte[] value) { if (ContentUtils.isValidUtf8(value)) { valueAsString = new String(value); } else { - if (value.length == 8) { + if (value.length == Long.BYTES) { valueAsString = String.valueOf(ContentUtils.asLong(value)); - } else if (value.length == 4) { + } else if (value.length == Integer.BYTES) { valueAsString = String.valueOf(ContentUtils.asInt(value)); - } else if (value.length == 2) { + } else if (value.length == Short.BYTES) { valueAsString = String.valueOf(ContentUtils.asShort(value)); } else { valueAsString = bytesToHex(value); From d25b87d8b7196bdecc57fe1535cef6bd29460419 Mon Sep 17 00:00:00 2001 From: German Osin Date: Wed, 14 May 2025 15:51:20 +0200 Subject: [PATCH 4/4] fixes in the code --- .../java/io/kafbat/ui/util/ContentUtils.java | 58 +++++++++---------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/api/src/main/java/io/kafbat/ui/util/ContentUtils.java b/api/src/main/java/io/kafbat/ui/util/ContentUtils.java index fbd81092c..a3eafc2db 100644 --- a/api/src/main/java/io/kafbat/ui/util/ContentUtils.java +++ b/api/src/main/java/io/kafbat/ui/util/ContentUtils.java @@ -1,7 +1,10 @@ package io.kafbat.ui.util; import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharsetDecoder; import java.nio.charset.StandardCharsets; +import java.util.List; import java.util.regex.Pattern; /** @@ -10,6 +13,8 @@ public class ContentUtils { private static final byte[] HEX_ARRAY = "0123456789ABCDEF".getBytes(StandardCharsets.US_ASCII); + private static final CharsetDecoder UTF8_DECODER = StandardCharsets.UTF_8.newDecoder(); + private ContentUtils() { } @@ -23,38 +28,29 @@ public static boolean isValidUtf8(byte[] value) { if (value.length > 10_000) { return true; } - int i = 0; - while (i < value.length) { - int b = value[i] & 0xFF; - int numBytes; - if ((b & 0x80) == 0) { - // 1-byte (ASCII) - numBytes = 1; - } else if ((b & 0xE0) == 0xC0) { - // 2-byte sequence - numBytes = 2; - } else if ((b & 0xF0) == 0xE0) { - // 3-byte sequence - numBytes = 3; - } else if ((b & 0xF8) == 0xF0) { - // 4-byte sequence - numBytes = 4; - } else { - // Invalid first byte - return false; - } - if (i + numBytes > value.length) { - return false; - } - // Check continuation bytes - for (int j = 1; j < numBytes; j++) { - if ((value[i + j] & 0xC0) != 0x80) { - return false; - } - } - i += numBytes; + try { + CharBuffer decode = UTF8_DECODER.decode(ByteBuffer.wrap(value)); + return decode.chars().allMatch(ContentUtils::isValidUtf8); + } catch (Exception e) { + return false; + } + } + + public static boolean isValidUtf8(int c) { + // SKIP NULL Symbols + if (c == 0) { + return false; + } + // Well known symbols + if (Character.isAlphabetic(c) + || Character.isDigit(c) + || Character.isWhitespace(c) + || Character.isEmoji(c) + ) { + return true; } - return true; + // We could read only whitespace controls like + return !Character.isISOControl(c); } /**