From a1d34bb426db1fcb91a61d228c19c4f13df955ec Mon Sep 17 00:00:00 2001 From: Valentyn Kolesnikov Date: Wed, 14 May 2025 10:29:14 +0300 Subject: [PATCH 1/7] Added method U.fileXmlToJson(xmlFileName, jsonFileName, identStep) --- src/main/java/com/github/underscore/U.java | 85 ++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/src/main/java/com/github/underscore/U.java b/src/main/java/com/github/underscore/U.java index 3b64f736..606829d6 100644 --- a/src/main/java/com/github/underscore/U.java +++ b/src/main/java/com/github/underscore/U.java @@ -2781,6 +2781,91 @@ public static String xmlToJson(String xml, XmlToJsonMode mode) { return xmlToJson(xml, Json.JsonStringBuilder.Step.TWO_SPACES, mode); } + public static void fileXmlToJson(String xmlFileName, String jsonFileName, Json.JsonStringBuilder.Step identStep) + throws IOException { + final byte[] bytes = Files.readAllBytes(Paths.get(xmlFileName)); + String xmlText = new String(removeBom(bytes), detectEncoding(bytes)); + Files.write(Paths.get(jsonFileName), formatString(xmlToJson(xmlText, identStep), + System.lineSeparator()).getBytes(StandardCharsets.UTF_8)); + } + + public static void fileXmlToJson(String xmlFileName, String jsonFileName) throws IOException { + fileXmlToJson(xmlFileName, jsonFileName, Json.JsonStringBuilder.Step.TWO_SPACES); + } + + public static byte[] removeBom(byte[] bytes) { + if ((bytes.length >= 3) && (bytes[0] == -17) && (bytes[1] == -69) && (bytes[2] == -65)) { + return Arrays.copyOfRange(bytes, 3, bytes.length); + } + if ((bytes.length >= 2) && (bytes[0] == -1) && (bytes[1] == -2)) { + return Arrays.copyOfRange(bytes, 2, bytes.length); + } + if ((bytes.length >= 2) && (bytes[0] == -2) && (bytes[1] == -1)) { + return Arrays.copyOfRange(bytes, 2, bytes.length); + } + return bytes; + } + + public static String detectEncoding(byte[] buffer) { + if (buffer.length < 4) { + return "UTF8"; + } + String encoding = null; + int n = ((buffer[0] & 0xFF) << 24) + | ((buffer[1] & 0xFF) << 
16) + | ((buffer[2] & 0xFF) << 8) + | (buffer[3] & 0xFF); + switch (n) { + case 0x0000FEFF: + case 0x0000003C: + encoding = "UTF_32BE"; + break; + case 0x003C003F: + encoding = "UnicodeBigUnmarked"; + break; + case 0xFFFE0000: + encoding = "UTF_32LE"; + break; + case 0x3C000000: + encoding = "UTF_32LE"; + break; + // <? + case 0x3C003F00: + encoding = "UnicodeLittleUnmarked"; + break; + // <?xm + case 0x3C3F786D: + encoding = "UTF8"; + break; + default: + if ((n >>> 8) == 0xEFBBBF) { + encoding = "UTF8"; + break; + } + if ((n >>> 24) == 0x3C) { + break; + } + switch (n >>> 16) { + case 0xFFFE: + encoding = "UnicodeLittleUnmarked"; + break; + case 0xFEFF: + encoding = "UnicodeBigUnmarked"; + break; + default: + break; + } + } + return encoding == null ? "UTF8" : encoding; + } + + public static String formatString(String data, String lineSeparator) { + if ("\n".equals(lineSeparator)) { + return data; + } + return data.replace("\n", lineSeparator); + } + public static String xmlOrJsonToJson(String xmlOrJson, Json.JsonStringBuilder.Step identStep) { TextType textType = getTextType(xmlOrJson); final String result; From c3d40ad9d8e91d9e08f184bb18d6ca5d2c56d44b Mon Sep 17 00:00:00 2001 From: Valentyn Kolesnikov Date: Wed, 14 May 2025 10:55:43 +0300 Subject: [PATCH 2/7] Added tests --- .../com/github/underscore/UnderscoreTest.java | 178 ++++++++++++++++++ 1 file changed, 178 insertions(+) diff --git a/src/test/java/com/github/underscore/UnderscoreTest.java b/src/test/java/com/github/underscore/UnderscoreTest.java index 03f16238..5a53513c 100644 --- a/src/test/java/com/github/underscore/UnderscoreTest.java +++ b/src/test/java/com/github/underscore/UnderscoreTest.java @@ -25,15 +25,22 @@ import static java.util.Arrays.asList; import static java.util.Collections.singletonList; +import static org.junit.jupiter.api.Assertions.assertAll; import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static 
org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNotSame; import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; @@ -45,6 +52,7 @@ import java.util.function.Function; import java.util.function.Predicate; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; /** * Underscore library unit test. @@ -837,4 +845,174 @@ void testMapToPropertiesWithNullValues() { Properties properties2 = U.mapToProperties(null); assertEquals(0, properties2.size()); } + + @Test + void testRemoveBom() { + // Test UTF-8 BOM + byte[] utf8Bom = new byte[]{(byte)-17, (byte)-69, (byte)-65, 'a', 'b', 'c'}; + assertArrayEquals( + new byte[]{'a', 'b', 'c'}, + U.removeBom(utf8Bom), + "Should remove UTF-8 BOM (EF BB BF) and keep content" + ); + + // Test UTF-16 LE BOM + byte[] utf16LeBom = new byte[]{(byte)-1, (byte)-2, 'a', 'b'}; + assertArrayEquals( + new byte[]{'a', 'b'}, + U.removeBom(utf16LeBom), + "Should remove UTF-16 LE BOM (FF FE) and keep content" + ); + + // Test UTF-16 BE BOM + byte[] utf16BeBom = new byte[]{(byte)-2, (byte)-1, 'a', 'b'}; + assertArrayEquals( + new byte[]{'a', 'b'}, + U.removeBom(utf16BeBom), + "Should remove UTF-16 BE BOM (FE FF) and keep content" + ); + + // Test no BOM + byte[] noBom = new byte[]{'a', 'b', 'c'}; + assertArrayEquals( + noBom, + U.removeBom(noBom), + "Should return original array when no BOM is present" + ); + + // Test empty array + byte[] empty = new byte[]{}; + assertArrayEquals( + empty, + U.removeBom(empty), + "Should handle empty byte array correctly" + ); + } + + @Test + void 
testDetectEncoding() { + // Test UTF-32BE + byte[] utf32be = new byte[]{0x00, 0x00, (byte)0xFE, (byte)0xFF, 'a'}; + assertEquals( + "UTF_32BE", + U.detectEncoding(utf32be), + "Should detect UTF-32BE encoding from BOM" + ); + + // Test UTF-32LE + byte[] utf32le = new byte[]{(byte)0xFF, (byte)0xFE, 0x00, 0x00, 'a'}; + assertEquals( + "UTF_32LE", + U.detectEncoding(utf32le), + "Should detect UTF-32LE encoding from BOM" + ); + + // Test Unicode Big Unmarked + byte[] unicodeBig = new byte[]{0x00, 0x3C, 0x00, 0x3F}; + assertEquals( + "UnicodeBigUnmarked", + U.detectEncoding(unicodeBig), + "Should detect Unicode Big Unmarked encoding" + ); + + // Test UTF-8 XML declaration + byte[] utf8Xml = new byte[]{0x3C, 0x3F, 0x78, 0x6D}; + assertEquals( + "UTF8", + U.detectEncoding(utf8Xml), + "Should detect UTF-8 encoding from XML declaration" + ); + + // Test UTF-8 with BOM + byte[] utf8Bom = new byte[]{(byte)0xEF, (byte)0xBB, (byte)0xBF, 'a'}; + assertEquals( + "UTF8", + U.detectEncoding(utf8Bom), + "Should detect UTF-8 encoding from BOM" + ); + + // Test small buffer + byte[] small = new byte[]{0x3C, 0x3F}; + assertEquals( + "UTF8", + U.detectEncoding(small), + "Should default to UTF-8 for buffers smaller than 4 bytes" + ); + } + + @Test + void testFormatString() { + // Test with \n line separator + String input1 = "line1\nline2\nline3"; + assertEquals( + input1, + U.formatString(input1, "\n"), + "Should not modify string when line separator is already \\n" + ); + + // Test with different line separator + String input2 = "line1\nline2\nline3"; + String expected2 = "line1\r\nline2\r\nline3"; + assertEquals( + expected2, + U.formatString(input2, "\r\n"), + "Should replace \\n with specified line separator" + ); + + // Test with empty string + assertTrue( + U.formatString("", "\n").isEmpty(), + "Should handle empty string correctly" + ); + + // Test with no line breaks + String noBreaks = "text without breaks"; + assertEquals( + noBreaks, + U.formatString(noBreaks, "\r\n"), + 
"Should not modify string without line breaks" + ); + } + + @Test + void testFileXmlToJson(@TempDir Path tempDir) throws IOException { + // Create temporary files + Path xmlPath = tempDir.resolve("test.xml"); + Path jsonPath = tempDir.resolve("test.json"); + + // Write test XML content + String xml = "value"; + Files.write(xmlPath, xml.getBytes(StandardCharsets.UTF_8)); + + // Test file conversion + assertDoesNotThrow( + () -> U.fileXmlToJson(xmlPath.toString(), jsonPath.toString()), + "File conversion should not throw exceptions" + ); + + // Verify the JSON file + assertTrue( + Files.exists(jsonPath), + "JSON file should be created" + ); + + String jsonContent = Files.readString(jsonPath); + assertAll("JSON file content verification", + () -> assertNotNull(jsonContent, "JSON content should not be null"), + () -> assertTrue(jsonContent.contains("\"item\": \"value\""), + "JSON should contain converted XML content") + ); + } + + @Test + void testFileXmlToJsonWithInvalidInput(@TempDir Path tempDir) { + Path nonExistentXml = tempDir.resolve("nonexistent.xml"); + Path outputJson = tempDir.resolve("output.json"); + + assertThrows( + IOException.class, + () -> U.fileXmlToJson(nonExistentXml.toString(), outputJson.toString()), + "Should throw IOException when input file doesn't exist" + ); + } } From e50d914b0e51d440cacff0a063c3caa9f732cf80 Mon Sep 17 00:00:00 2001 From: Valentyn Kolesnikov Date: Wed, 14 May 2025 11:03:44 +0300 Subject: [PATCH 3/7] Improved tests --- .../com/github/underscore/UnderscoreTest.java | 85 +++++++++++-------- 1 file changed, 49 insertions(+), 36 deletions(-) diff --git a/src/test/java/com/github/underscore/UnderscoreTest.java b/src/test/java/com/github/underscore/UnderscoreTest.java index 5a53513c..18a29dbf 100644 --- a/src/test/java/com/github/underscore/UnderscoreTest.java +++ b/src/test/java/com/github/underscore/UnderscoreTest.java @@ -847,46 +847,59 @@ void testMapToPropertiesWithNullValues() { } @Test - void testRemoveBom() { - // Test 
UTF-8 BOM - byte[] utf8Bom = new byte[]{(byte)-17, (byte)-69, (byte)-65, 'a', 'b', 'c'}; - assertArrayEquals( - new byte[]{'a', 'b', 'c'}, - U.removeBom(utf8Bom), - "Should remove UTF-8 BOM (EF BB BF) and keep content" - ); + void testRemoveUtf8Bom() { + // UTF-8 BOM: 0xEF,0xBB,0xBF == -17, -69, -65 + byte[] withBom = new byte[] {-17, -69, -65, 1, 2, 3}; + byte[] expected = new byte[] {1, 2, 3}; + assertArrayEquals(expected, U.removeBom(withBom), "UTF-8 BOM should be removed"); + } - // Test UTF-16 LE BOM - byte[] utf16LeBom = new byte[]{(byte)-1, (byte)-2, 'a', 'b'}; - assertArrayEquals( - new byte[]{'a', 'b'}, - U.removeBom(utf16LeBom), - "Should remove UTF-16 LE BOM (FF FE) and keep content" - ); + @Test + void testRemoveUtf16BeBom() { + // UTF-16BE BOM: 0xFE,0xFF == -2, -1 + byte[] withBom = new byte[] {-2, -1, 4, 5}; + byte[] expected = new byte[] {4, 5}; + assertArrayEquals(expected, U.removeBom(withBom), "UTF-16BE BOM should be removed"); + } - // Test UTF-16 BE BOM - byte[] utf16BeBom = new byte[]{(byte)-2, (byte)-1, 'a', 'b'}; - assertArrayEquals( - new byte[]{'a', 'b'}, - U.removeBom(utf16BeBom), - "Should remove UTF-16 BE BOM (FE FF) and keep content" - ); + @Test + void testRemoveUtf16LeBom() { + // UTF-16LE BOM: 0xFF,0xFE == -1, -2 + byte[] withBom = new byte[] {-1, -2, 9}; + byte[] expected = new byte[] {9}; + assertArrayEquals(expected, U.removeBom(withBom), "UTF-16LE BOM should be removed"); + } - // Test no BOM - byte[] noBom = new byte[]{'a', 'b', 'c'}; - assertArrayEquals( - noBom, - U.removeBom(noBom), - "Should return original array when no BOM is present" - ); + @Test + void testNotShortBytesNoBom() { + // Less than 2 bytes (not possible to have BOM) + byte[] input = new byte[] {42}; + assertArrayEquals(input, U.removeBom(input), "Short arrays with no BOM should be unchanged"); + } - // Test empty array - byte[] empty = new byte[]{}; - assertArrayEquals( - empty, - U.removeBom(empty), - "Should handle empty byte array correctly" - ); + @Test + 
void testNoBomPresent() { + // No BOM present + byte[] input = new byte[] {3, 5, 0, -1, -7}; + assertArrayEquals(input, U.removeBom(input), "Arrays without BOM should be unchanged"); + } + + @Test + void testAlmostBomButNotEnoughBytes() { + byte[] input = new byte[] {-17, -69}; // only 2 bytes, not enough for UTF-8 BOM + assertArrayEquals(input, U.removeBom(input), "Arrays with too few BOM bytes should be unchanged"); + } + + @Test + void testPrefixSimilarButNotABom() { + byte[] input = new byte[] {-1, 0, 1}; + assertArrayEquals(input, U.removeBom(input), "Array starting with -1,0 is not a BOM, should be unchanged"); + + input = new byte[] {-2, 0, 1}; + assertArrayEquals(input, U.removeBom(input), "Array starting with -2,0 is not a BOM, should be unchanged"); + + input = new byte[] {-17, -69, 0}; // 3 bytes but third is not -65 + assertArrayEquals(input, U.removeBom(input), "Array with -17,-69, is not a BOM"); } @Test From a7b5c253970bca63bdc0a4cc9b8d8b59c08333a3 Mon Sep 17 00:00:00 2001 From: Valentyn Kolesnikov Date: Wed, 14 May 2025 11:05:48 +0300 Subject: [PATCH 4/7] Fixed style --- src/test/java/com/github/underscore/UnderscoreTest.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/test/java/com/github/underscore/UnderscoreTest.java b/src/test/java/com/github/underscore/UnderscoreTest.java index 18a29dbf..ea5759e5 100644 --- a/src/test/java/com/github/underscore/UnderscoreTest.java +++ b/src/test/java/com/github/underscore/UnderscoreTest.java @@ -886,7 +886,8 @@ void testNoBomPresent() { @Test void testAlmostBomButNotEnoughBytes() { - byte[] input = new byte[] {-17, -69}; // only 2 bytes, not enough for UTF-8 BOM + // only 2 bytes, not enough for UTF-8 BOM + byte[] input = new byte[] {-17, -69}; assertArrayEquals(input, U.removeBom(input), "Arrays with too few BOM bytes should be unchanged"); } @@ -898,7 +899,8 @@ void testPrefixSimilarButNotABom() { input = new byte[] {-2, 0, 1}; assertArrayEquals(input, U.removeBom(input), 
"Array starting with -2,0 is not a BOM, should be unchanged"); - input = new byte[] {-17, -69, 0}; // 3 bytes but third is not -65 + // 3 bytes but third is not -65 + input = new byte[] {-17, -69, 0}; assertArrayEquals(input, U.removeBom(input), "Array with -17,-69, is not a BOM"); } From d4ec500510330c846ff395287634460b01336758 Mon Sep 17 00:00:00 2001 From: Valentyn Kolesnikov Date: Wed, 14 May 2025 11:12:40 +0300 Subject: [PATCH 5/7] Added tests --- .../java/com/github/underscore/UnderscoreTest.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/test/java/com/github/underscore/UnderscoreTest.java b/src/test/java/com/github/underscore/UnderscoreTest.java index ea5759e5..4961dae1 100644 --- a/src/test/java/com/github/underscore/UnderscoreTest.java +++ b/src/test/java/com/github/underscore/UnderscoreTest.java @@ -895,13 +895,19 @@ void testAlmostBomButNotEnoughBytes() { void testPrefixSimilarButNotABom() { byte[] input = new byte[] {-1, 0, 1}; assertArrayEquals(input, U.removeBom(input), "Array starting with -1,0 is not a BOM, should be unchanged"); - input = new byte[] {-2, 0, 1}; assertArrayEquals(input, U.removeBom(input), "Array starting with -2,0 is not a BOM, should be unchanged"); - // 3 bytes but third is not -65 input = new byte[] {-17, -69, 0}; assertArrayEquals(input, U.removeBom(input), "Array with -17,-69, is not a BOM"); + input = new byte[] { -17, -69 }; + assertArrayEquals(input, U.removeBom(input), "Should not remove BOM for length < 3"); + input = new byte[] { 0, -69, -65, 33 }; + assertArrayEquals(input, U.removeBom(input), "Should not remove BOM if first byte is not -17"); + input = new byte[] { -17, 0, -65, 13 }; + assertArrayEquals(input, U.removeBom(input), "Should not remove BOM if second byte is not -69"); + input = new byte[] { -17, -69, 0, 14 }; + assertArrayEquals(input, U.removeBom(input), "Should not remove BOM if third byte is not -65"); } @Test From c9518257c5b8cb7ab47ce9ad88d13017e11d7083 Mon Sep 17 
00:00:00 2001 From: Valentyn Kolesnikov Date: Wed, 14 May 2025 11:19:51 +0300 Subject: [PATCH 6/7] Added tests --- .../com/github/underscore/UnderscoreTest.java | 123 ++++++++++++------ 1 file changed, 80 insertions(+), 43 deletions(-) diff --git a/src/test/java/com/github/underscore/UnderscoreTest.java b/src/test/java/com/github/underscore/UnderscoreTest.java index 4961dae1..8294aaed 100644 --- a/src/test/java/com/github/underscore/UnderscoreTest.java +++ b/src/test/java/com/github/underscore/UnderscoreTest.java @@ -911,54 +911,91 @@ void testPrefixSimilarButNotABom() { } @Test - void testDetectEncoding() { - // Test UTF-32BE - byte[] utf32be = new byte[]{0x00, 0x00, (byte)0xFE, (byte)0xFF, 'a'}; - assertEquals( - "UTF_32BE", - U.detectEncoding(utf32be), - "Should detect UTF-32BE encoding from BOM" - ); + void testLengthLessThan4() { + byte[] buf0 = {}; + byte[] buf1 = {1}; + byte[] buf3 = {1, 2, 3}; + assertEquals("UTF8", U.detectEncoding(buf0), "Should return UTF8 for empty array"); + assertEquals("UTF8", U.detectEncoding(buf1), "Should return UTF8 for buffer length 1"); + assertEquals("UTF8", U.detectEncoding(buf3), "Should return UTF8 for buffer length 3"); + } - // Test UTF-32LE - byte[] utf32le = new byte[]{(byte)0xFF, (byte)0xFE, 0x00, 0x00, 'a'}; - assertEquals( - "UTF_32LE", - U.detectEncoding(utf32le), - "Should detect UTF-32LE encoding from BOM" - ); + @Test + void testCase_0x0000FEFF() { + byte[] buf = {(byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF}; + assertEquals("UTF_32BE", U.detectEncoding(buf), "Should return UTF_32BE for BOM 0x0000FEFF"); + } - // Test Unicode Big Unmarked - byte[] unicodeBig = new byte[]{0x00, 0x3C, 0x00, 0x3F}; - assertEquals( - "UnicodeBigUnmarked", - U.detectEncoding(unicodeBig), - "Should detect Unicode Big Unmarked encoding" - ); + @Test + void testCase_0x0000003C() { + byte[] buf = {(byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x3C}; + assertEquals("UTF_32BE", U.detectEncoding(buf), "Should return UTF_32BE for 
0x0000003C"); + } - // Test UTF-8 XML declaration - byte[] utf8Xml = new byte[]{0x3C, 0x3F, 0x78, 0x6D}; - assertEquals( - "UTF8", - U.detectEncoding(utf8Xml), - "Should detect UTF-8 encoding from XML declaration" - ); + @Test + void testCase_0x003C003F() { + byte[] buf = {(byte) 0x00, (byte) 0x3C, (byte) 0x00, (byte) 0x3F}; + assertEquals("UnicodeBigUnmarked", U.detectEncoding(buf), "Should return UnicodeBigUnmarked for 0x003C003F"); + } - // Test UTF-8 with BOM - byte[] utf8Bom = new byte[]{(byte)0xEF, (byte)0xBB, (byte)0xBF, 'a'}; - assertEquals( - "UTF8", - U.detectEncoding(utf8Bom), - "Should detect UTF-8 encoding from BOM" - ); + @Test + void testCase_0xFFFE0000() { + byte[] buf = {(byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00}; + assertEquals("UTF_32LE", U.detectEncoding(buf), "Should return UTF_32LE for BOM 0xFFFE0000"); + } - // Test small buffer - byte[] small = new byte[]{0x3C, 0x3F}; - assertEquals( - "UTF8", - U.detectEncoding(small), - "Should default to UTF-8 for buffers smaller than 4 bytes" - ); + @Test + void testCase_0x3C000000() { + byte[] buf = {(byte) 0x3C, (byte) 0x00, (byte) 0x00, (byte) 0x00}; + assertEquals("UTF_32LE", U.detectEncoding(buf), "Should return UTF_32LE for 0x3C000000"); + } + + @Test + void testCase_0x3C003F00() { + byte[] buf = {(byte) 0x3C, (byte) 0x00, (byte) 0x3F, (byte) 0x00}; + assertEquals("UnicodeLittleUnmarked", U.detectEncoding(buf), "Should return UnicodeLittleUnmarked for 0x3C003F00"); + } + + @Test + void testCase_0x3C3F786D() { + byte[] buf = {(byte) 0x3C, (byte) 0x3F, (byte) 0x78, (byte) 0x6D}; + assertEquals("UTF8", U.detectEncoding(buf), "Should return UTF8 for 0x3C3F786D"); + } + + @Test + void testEfBbBf_UTF8() { + // 0xEFBBBF??, so n >>> 8 == 0xEFBBBF + // Let's set: [0xEF, 0xBB, 0xBF, 0x42] (0x42 is arbitrary) + byte[] buf = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF, (byte) 0x42}; + assertEquals("UTF8", U.detectEncoding(buf), "Should return UTF8 for buffer with UTF-8 BOM"); + } + + @Test + void 
test_nShift24_0x3C() { + // (n >>> 24) == 0x3C, but not matching any above case + byte[] buf = {(byte) 0x3C, 1, 2, 3}; + assertEquals("UTF8", U.detectEncoding(buf), "Should return UTF8 when (n >>> 24) == 0x3C and no previous case matches"); + } + + @Test + void test_nShift16_0xFFFE() { + // (n >>> 16) == 0xFFFE (UnicodeLittleUnmarked branch) + byte[] buf = {(byte) 0xFF, (byte) 0xFE, (byte) 0x21, (byte) 0x22}; + assertEquals("UnicodeLittleUnmarked", U.detectEncoding(buf), "Should return UnicodeLittleUnmarked when (n >> 16) == 0xFFFE"); + } + + @Test + void test_nShift16_0xFEFF() { + // (n >>> 16) == 0xFEFF (UnicodeBigUnmarked branch) + byte[] buf = {(byte) 0xFE, (byte) 0xFF, (byte) 0x99, (byte) 0x88}; + assertEquals("UnicodeBigUnmarked", U.detectEncoding(buf), "Should return UnicodeBigUnmarked when (n >> 16) == 0xFEFF"); + } + + @Test + void testDefaultCase() { + // Random data, not matching any case nor any shift checks. Should default to UTF8 + byte[] buf = {(byte) 0x01, (byte) 0x23, (byte) 0x45, (byte) 0x67}; + assertEquals("UTF8", U.detectEncoding(buf), "Should default to UTF8 for unknown byte patterns"); } @Test From abefd6c2aa1f17533d15e3876cb2b75d7f4b9048 Mon Sep 17 00:00:00 2001 From: Valentyn Kolesnikov Date: Wed, 14 May 2025 11:21:05 +0300 Subject: [PATCH 7/7] Fixed style --- .../com/github/underscore/UnderscoreTest.java | 33 ++++++++++++------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/src/test/java/com/github/underscore/UnderscoreTest.java b/src/test/java/com/github/underscore/UnderscoreTest.java index 8294aaed..63ed0238 100644 --- a/src/test/java/com/github/underscore/UnderscoreTest.java +++ b/src/test/java/com/github/underscore/UnderscoreTest.java @@ -923,43 +923,50 @@ void testLengthLessThan4() { @Test void testCase_0x0000FEFF() { byte[] buf = {(byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF}; - assertEquals("UTF_32BE", U.detectEncoding(buf), "Should return UTF_32BE for BOM 0x0000FEFF"); + assertEquals("UTF_32BE", 
U.detectEncoding(buf), + "Should return UTF_32BE for BOM 0x0000FEFF"); } @Test void testCase_0x0000003C() { byte[] buf = {(byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x3C}; - assertEquals("UTF_32BE", U.detectEncoding(buf), "Should return UTF_32BE for 0x0000003C"); + assertEquals("UTF_32BE", U.detectEncoding(buf), + "Should return UTF_32BE for 0x0000003C"); } @Test void testCase_0x003C003F() { byte[] buf = {(byte) 0x00, (byte) 0x3C, (byte) 0x00, (byte) 0x3F}; - assertEquals("UnicodeBigUnmarked", U.detectEncoding(buf), "Should return UnicodeBigUnmarked for 0x003C003F"); + assertEquals("UnicodeBigUnmarked", U.detectEncoding(buf), + "Should return UnicodeBigUnmarked for 0x003C003F"); } @Test void testCase_0xFFFE0000() { byte[] buf = {(byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00}; - assertEquals("UTF_32LE", U.detectEncoding(buf), "Should return UTF_32LE for BOM 0xFFFE0000"); + assertEquals("UTF_32LE", U.detectEncoding(buf), + "Should return UTF_32LE for BOM 0xFFFE0000"); } @Test void testCase_0x3C000000() { byte[] buf = {(byte) 0x3C, (byte) 0x00, (byte) 0x00, (byte) 0x00}; - assertEquals("UTF_32LE", U.detectEncoding(buf), "Should return UTF_32LE for 0x3C000000"); + assertEquals("UTF_32LE", U.detectEncoding(buf), + "Should return UTF_32LE for 0x3C000000"); } @Test void testCase_0x3C003F00() { byte[] buf = {(byte) 0x3C, (byte) 0x00, (byte) 0x3F, (byte) 0x00}; - assertEquals("UnicodeLittleUnmarked", U.detectEncoding(buf), "Should return UnicodeLittleUnmarked for 0x3C003F00"); + assertEquals("UnicodeLittleUnmarked", U.detectEncoding(buf), + "Should return UnicodeLittleUnmarked for 0x3C003F00"); } @Test void testCase_0x3C3F786D() { byte[] buf = {(byte) 0x3C, (byte) 0x3F, (byte) 0x78, (byte) 0x6D}; - assertEquals("UTF8", U.detectEncoding(buf), "Should return UTF8 for 0x3C3F786D"); + assertEquals("UTF8", U.detectEncoding(buf), + "Should return UTF8 for 0x3C3F786D"); } @Test @@ -967,28 +974,32 @@ void testEfBbBf_UTF8() { // 0xEFBBBF??, so n >>> 8 == 0xEFBBBF // Let's 
set: [0xEF, 0xBB, 0xBF, 0x42] (0x42 is arbitrary) byte[] buf = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF, (byte) 0x42}; - assertEquals("UTF8", U.detectEncoding(buf), "Should return UTF8 for buffer with UTF-8 BOM"); + assertEquals("UTF8", U.detectEncoding(buf), + "Should return UTF8 for buffer with UTF-8 BOM"); } @Test void test_nShift24_0x3C() { // (n >>> 24) == 0x3C, but not matching any above case byte[] buf = {(byte) 0x3C, 1, 2, 3}; - assertEquals("UTF8", U.detectEncoding(buf), "Should return UTF8 when (n >>> 24) == 0x3C and no previous case matches"); + assertEquals("UTF8", U.detectEncoding(buf), + "Should return UTF8 when (n >>> 24) == 0x3C and no previous case matches"); } @Test void test_nShift16_0xFFFE() { // (n >>> 16) == 0xFFFE (UnicodeLittleUnmarked branch) byte[] buf = {(byte) 0xFF, (byte) 0xFE, (byte) 0x21, (byte) 0x22}; - assertEquals("UnicodeLittleUnmarked", U.detectEncoding(buf), "Should return UnicodeLittleUnmarked when (n >> 16) == 0xFFFE"); + assertEquals("UnicodeLittleUnmarked", U.detectEncoding(buf), + "Should return UnicodeLittleUnmarked when (n >> 16) == 0xFFFE"); } @Test void test_nShift16_0xFEFF() { // (n >>> 16) == 0xFEFF (UnicodeBigUnmarked branch) byte[] buf = {(byte) 0xFE, (byte) 0xFF, (byte) 0x99, (byte) 0x88}; - assertEquals("UnicodeBigUnmarked", U.detectEncoding(buf), "Should return UnicodeBigUnmarked when (n >> 16) == 0xFEFF"); + assertEquals("UnicodeBigUnmarked", U.detectEncoding(buf), + "Should return UnicodeBigUnmarked when (n >> 16) == 0xFEFF"); } @Test