|
10 | 10 | public class ContentUtils { |
11 | 11 | private static final byte[] HEX_ARRAY = "0123456789ABCDEF".getBytes(StandardCharsets.US_ASCII); |
12 | 12 |
|
13 | | - private static final String UTF8_REGEX = """ |
14 | | - \\A([\\x09\\x0A\\x0D\\x20-\\x7E] # ASCII |
15 | | - | [\\xC2-\\xDF][\\x80-\\xBF] # non-overlong 2-byte |
16 | | - | \\xE0[\\xA0-\\xBF][\\x80-\\xBF] # excluding overlongs |
17 | | - | [\\xE1-\\xEC\\xEE\\xEF][\\x80-\\xBF]{2} # straight 3-byte |
18 | | - | \\xED[\\x80-\\x9F][\\x80-\\xBF] # excluding surrogates |
19 | | - | \\xF0[\\x90-\\xBF][\\x80-\\xBF]{2} # planes 1-3 |
20 | | - | [\\xF1-\\xF3][\\x80-\\xBF]{3} # planes 4-15 |
21 | | - | \\xF4[\\x80-\\x8F][\\x80-\\xBF]{2} # plane 16 |
22 | | - )*\\z |
23 | | - """.trim(); |
24 | | - |
25 | | - private static final Pattern UTF8_PATTERN = Pattern.compile(UTF8_REGEX, Pattern.COMMENTS); |
26 | | - |
27 | 13 | private ContentUtils() { |
28 | 14 | } |
29 | 15 |
|
30 | 16 | /** |
31 | | - * Detects if bytes contain a UTF-8 string or something else |
32 | | - * Source: https://stackoverflow.com/questions/1193200/how-can-i-check-whether-a-byte-array-contains-a-unicode-string-in-java |
| 17 | + * Detects if bytes contain a UTF-8 string or something else. |
33 | 18 | * @param value the bytes to test for a UTF-8 encoded {@code java.lang.String} value |
34 | 19 | * @return true, if the byte[] contains a UTF-8 encoded {@code java.lang.String} |
35 | 20 | */ |
36 | 21 | public static boolean isValidUtf8(byte[] value) { |
37 | | - //If the array is too long, it throws a StackOverflowError due to the regex, so we assume it is a String. |
38 | | - if (value.length <= 1000) { |
39 | | - String phonyString = new String(value, StandardCharsets.ISO_8859_1); |
40 | | - return UTF8_PATTERN.matcher(phonyString).matches(); |
| 22 | + // Any data longer than 10,000 bytes is assumed to be a string and is not validated. |
| 23 | + if (value.length > 10_000) { |
| 24 | + return true; |
| 25 | + } |
| 26 | + int i = 0; |
| 27 | + while (i < value.length) { |
| 28 | + int b = value[i] & 0xFF; |
| 29 | + int numBytes; |
| 30 | + if ((b & 0x80) == 0) { |
| 31 | + // 1-byte (ASCII) |
| 32 | + numBytes = 1; |
| 33 | + } else if ((b & 0xE0) == 0xC0) { |
| 34 | + // 2-byte sequence |
| 35 | + numBytes = 2; |
| 36 | + } else if ((b & 0xF0) == 0xE0) { |
| 37 | + // 3-byte sequence |
| 38 | + numBytes = 3; |
| 39 | + } else if ((b & 0xF8) == 0xF0) { |
| 40 | + // 4-byte sequence |
| 41 | + numBytes = 4; |
| 42 | + } else { |
| 43 | + // Invalid first byte |
| 44 | + return false; |
| 45 | + } |
| 46 | + if (i + numBytes > value.length) { |
| 47 | + return false; |
| 48 | + } |
| 49 | + // Check continuation bytes |
| 50 | + for (int j = 1; j < numBytes; j++) { |
| 51 | + if ((value[i + j] & 0xC0) != 0x80) { |
| 52 | + return false; |
| 53 | + } |
| 54 | + } |
| 55 | + i += numBytes; |
41 | 56 | } |
42 | 57 | return true; |
43 | 58 | } |
@@ -86,11 +101,11 @@ public static String convertToString(byte[] value) { |
86 | 101 | if (ContentUtils.isValidUtf8(value)) { |
87 | 102 | valueAsString = new String(value); |
88 | 103 | } else { |
89 | | - if (value.length == 8) { |
| 104 | + if (value.length == Long.BYTES) { |
90 | 105 | valueAsString = String.valueOf(ContentUtils.asLong(value)); |
91 | | - } else if (value.length == 4) { |
| 106 | + } else if (value.length == Integer.BYTES) { |
92 | 107 | valueAsString = String.valueOf(ContentUtils.asInt(value)); |
93 | | - } else if (value.length == 2) { |
| 108 | + } else if (value.length == Short.BYTES) { |
94 | 109 | valueAsString = String.valueOf(ContentUtils.asShort(value)); |
95 | 110 | } else { |
96 | 111 | valueAsString = bytesToHex(value); |
|
0 commit comments