Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 85 additions & 0 deletions src/main/java/com/github/underscore/U.java
Original file line number Diff line number Diff line change
Expand Up @@ -2781,6 +2781,91 @@ public static String xmlToJson(String xml, XmlToJsonMode mode) {
return xmlToJson(xml, Json.JsonStringBuilder.Step.TWO_SPACES, mode);
}

/**
 * Converts the XML document stored in one file into JSON text stored in another file.
 *
 * <p>The whole input file is read into memory, a leading byte order mark is dropped via
 * {@code removeBom}, and the remaining bytes are decoded with the charset name reported by
 * {@code detectEncoding} for the original bytes. The generated JSON has every {@code \n}
 * replaced by the platform line separator and is written out as UTF-8.
 *
 * @param xmlFileName path of the XML file to read
 * @param jsonFileName path of the JSON file to write
 * @param identStep indentation step handed to {@code xmlToJson}
 * @throws IOException if the input cannot be read or decoded, or the output cannot be written
 */
public static void fileXmlToJson(String xmlFileName, String jsonFileName, Json.JsonStringBuilder.Step identStep)
        throws IOException {
    final byte[] rawXml = Files.readAllBytes(Paths.get(xmlFileName));
    final byte[] xmlWithoutBom = removeBom(rawXml);
    // The encoding is sniffed from the untouched bytes so a BOM can still be seen.
    final String charsetName = detectEncoding(rawXml);
    final String xmlText = new String(xmlWithoutBom, charsetName);

    final java.nio.file.Path target = Paths.get(jsonFileName);
    final String json = xmlToJson(xmlText, identStep);
    final String platformJson = formatString(json, System.lineSeparator());
    Files.write(target, platformJson.getBytes(StandardCharsets.UTF_8));
}

/**
 * Converts an XML file to a JSON file using a two-space indentation step.
 *
 * @param xmlFileName path of the XML file to read
 * @param jsonFileName path of the JSON file to write
 * @throws IOException if the three-argument overload reports an I/O failure
 */
public static void fileXmlToJson(String xmlFileName, String jsonFileName) throws IOException {
    final Json.JsonStringBuilder.Step defaultStep = Json.JsonStringBuilder.Step.TWO_SPACES;
    fileXmlToJson(xmlFileName, jsonFileName, defaultStep);
}

/**
 * Strips a leading Unicode byte order mark (BOM) from the given bytes.
 *
 * <p>Recognised marks are UTF-8 (EF BB BF), UTF-32BE (00 00 FE FF), UTF-32LE (FF FE 00 00),
 * UTF-16LE (FF FE) and UTF-16BE (FE FF). The four-byte UTF-32 marks are tested before the
 * two-byte UTF-16 marks because the UTF-32LE mark starts with the UTF-16LE mark; stripping
 * only two bytes there would leave the remaining data misaligned for a UTF-32 decoder.
 *
 * @param bytes raw content to inspect; must not be {@code null}
 * @return a new array without the BOM, or {@code bytes} itself when no BOM is present
 */
public static byte[] removeBom(byte[] bytes) {
    // Signed byte values: 0xEF = -17, 0xBB = -69, 0xBF = -65, 0xFF = -1, 0xFE = -2.
    if ((bytes.length >= 3) && (bytes[0] == -17) && (bytes[1] == -69) && (bytes[2] == -65)) {
        return Arrays.copyOfRange(bytes, 3, bytes.length);
    }
    if (bytes.length >= 4) {
        final boolean utf32Be = (bytes[0] == 0) && (bytes[1] == 0) && (bytes[2] == -2) && (bytes[3] == -1);
        final boolean utf32Le = (bytes[0] == -1) && (bytes[1] == -2) && (bytes[2] == 0) && (bytes[3] == 0);
        if (utf32Be || utf32Le) {
            return Arrays.copyOfRange(bytes, 4, bytes.length);
        }
    }
    if ((bytes.length >= 2) && (bytes[0] == -1) && (bytes[1] == -2)) {
        return Arrays.copyOfRange(bytes, 2, bytes.length);
    }
    if ((bytes.length >= 2) && (bytes[0] == -2) && (bytes[1] == -1)) {
        return Arrays.copyOfRange(bytes, 2, bytes.length);
    }
    return bytes;
}

/**
 * Guesses the charset name of XML content from its first four bytes.
 *
 * <p>The bytes are combined big-endian into one 32-bit value and compared against known
 * byte order marks and encodings of the leading {@code <} / {@code <?} characters. Anything
 * that is not recognised, including input shorter than four bytes, is reported as
 * {@code "UTF8"}.
 *
 * @param buffer leading bytes of the document; must not be {@code null}
 * @return one of {@code "UTF8"}, {@code "UTF_32BE"}, {@code "UTF_32LE"},
 *     {@code "UnicodeBigUnmarked"} or {@code "UnicodeLittleUnmarked"}
 */
public static String detectEncoding(byte[] buffer) {
    if (buffer.length < 4) {
        return "UTF8";
    }
    int head = 0;
    for (int index = 0; index < 4; index++) {
        head = (head << 8) | (buffer[index] & 0xFF);
    }
    // UTF-32 BOM, or '<' encoded as four bytes.
    if (head == 0x0000FEFF || head == 0x0000003C) {
        return "UTF_32BE";
    }
    if (head == 0xFFFE0000 || head == 0x3C000000) {
        return "UTF_32LE";
    }
    // "<?" encoded as two 16-bit units without a BOM.
    if (head == 0x003C003F) {
        return "UnicodeBigUnmarked";
    }
    if (head == 0x3C003F00) {
        return "UnicodeLittleUnmarked";
    }
    // UTF-16 BOM in the first two bytes (the UTF-32LE BOM was already handled above).
    final int firstTwoBytes = head >>> 16;
    if (firstTwoBytes == 0xFFFE) {
        return "UnicodeLittleUnmarked";
    }
    if (firstTwoBytes == 0xFEFF) {
        return "UnicodeBigUnmarked";
    }
    // Covers "<?xm", the UTF-8 BOM, any other '<'-prefixed input and unknown data.
    return "UTF8";
}

/**
 * Replaces every {@code \n} in the text with the requested line separator.
 *
 * @param data text that uses {@code \n} as its line break
 * @param lineSeparator separator to use instead; when it is exactly {@code "\n"} the
 *     input is returned unchanged
 * @return the text with its line breaks replaced
 */
public static String formatString(String data, String lineSeparator) {
    final boolean alreadyNormalized = "\n".equals(lineSeparator);
    return alreadyNormalized ? data : data.replace("\n", lineSeparator);
}

public static String xmlOrJsonToJson(String xmlOrJson, Json.JsonStringBuilder.Step identStep) {
TextType textType = getTextType(xmlOrJson);
final String result;
Expand Down
247 changes: 247 additions & 0 deletions src/test/java/com/github/underscore/UnderscoreTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,22 @@

import static java.util.Arrays.asList;
import static java.util.Collections.singletonList;
import static org.junit.jupiter.api.Assertions.assertAll;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNotSame;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
Expand All @@ -45,6 +52,7 @@
import java.util.function.Function;
import java.util.function.Predicate;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;

/**
* Underscore library unit test.
Expand Down Expand Up @@ -837,4 +845,243 @@ void testMapToPropertiesWithNullValues() {
Properties properties2 = U.mapToProperties(null);
assertEquals(0, properties2.size());
}

@Test
void testRemoveUtf8Bom() {
    // The UTF-8 BOM EF BB BF is -17, -69, -65 when written as signed Java bytes.
    final byte[] input = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 1, 2, 3};
    final byte[] stripped = U.removeBom(input);
    assertArrayEquals(new byte[] {1, 2, 3}, stripped, "UTF-8 BOM should be removed");
}

@Test
void testRemoveUtf16BeBom() {
    // The UTF-16BE BOM FE FF is -2, -1 when written as signed Java bytes.
    final byte[] input = {(byte) 0xFE, (byte) 0xFF, 4, 5};
    final byte[] stripped = U.removeBom(input);
    assertArrayEquals(new byte[] {4, 5}, stripped, "UTF-16BE BOM should be removed");
}

@Test
void testRemoveUtf16LeBom() {
    // The UTF-16LE BOM FF FE is -1, -2 when written as signed Java bytes.
    final byte[] input = {(byte) 0xFF, (byte) 0xFE, 9};
    final byte[] stripped = U.removeBom(input);
    assertArrayEquals(new byte[] {9}, stripped, "UTF-16LE BOM should be removed");
}

@Test
void testNotShortBytesNoBom() {
    // A single byte is too short to hold any of the recognised BOMs.
    final byte[] singleByte = {42};
    final byte[] result = U.removeBom(singleByte);
    assertArrayEquals(singleByte, result, "Short arrays with no BOM should be unchanged");
}

@Test
void testNoBomPresent() {
    // Arbitrary data whose first bytes do not form a BOM.
    final byte[] plain = {3, 5, 0, -1, -7};
    final byte[] result = U.removeBom(plain);
    assertArrayEquals(plain, result, "Arrays without BOM should be unchanged");
}

@Test
void testAlmostBomButNotEnoughBytes() {
    // First two bytes of the UTF-8 BOM only; the third byte is missing.
    final byte[] truncatedBom = {(byte) 0xEF, (byte) 0xBB};
    final byte[] result = U.removeBom(truncatedBom);
    assertArrayEquals(truncatedBom, result, "Arrays with too few BOM bytes should be unchanged");
}

@Test
void testPrefixSimilarButNotABom() {
    // A lone 0xFF or 0xFE followed by another byte is not a UTF-16 mark.
    final byte[] ffThenZero = {-1, 0, 1};
    assertArrayEquals(ffThenZero, U.removeBom(ffThenZero),
            "Array starting with -1,0 is not a BOM, should be unchanged");
    final byte[] feThenZero = {-2, 0, 1};
    assertArrayEquals(feThenZero, U.removeBom(feThenZero),
            "Array starting with -2,0 is not a BOM, should be unchanged");

    // Three bytes where only the last one differs from the UTF-8 mark.
    final byte[] wrongThirdOfThree = {-17, -69, 0};
    assertArrayEquals(wrongThirdOfThree, U.removeBom(wrongThirdOfThree),
            "Array with -17,-69,<not -65> is not a BOM");

    // Too short to contain the full UTF-8 mark.
    final byte[] twoBytesOnly = {-17, -69};
    assertArrayEquals(twoBytesOnly, U.removeBom(twoBytesOnly), "Should not remove BOM for length < 3");

    // Each position of the UTF-8 mark broken in turn.
    final byte[] wrongFirst = {0, -69, -65, 33};
    assertArrayEquals(wrongFirst, U.removeBom(wrongFirst), "Should not remove BOM if first byte is not -17");
    final byte[] wrongSecond = {-17, 0, -65, 13};
    assertArrayEquals(wrongSecond, U.removeBom(wrongSecond), "Should not remove BOM if second byte is not -69");
    final byte[] wrongThird = {-17, -69, 0, 14};
    assertArrayEquals(wrongThird, U.removeBom(wrongThird), "Should not remove BOM if third byte is not -65");
}

@Test
void testLengthLessThan4() {
    // Fewer than four bytes always yields the UTF8 fallback.
    final String forEmpty = U.detectEncoding(new byte[0]);
    final String forOneByte = U.detectEncoding(new byte[] {1});
    final String forThreeBytes = U.detectEncoding(new byte[] {1, 2, 3});
    assertEquals("UTF8", forEmpty, "Should return UTF8 for empty array");
    assertEquals("UTF8", forOneByte, "Should return UTF8 for buffer length 1");
    assertEquals("UTF8", forThreeBytes, "Should return UTF8 for buffer length 3");
}

@Test
void testCase_0x0000FEFF() {
    // Bytes 00 00 FE FF: UTF-32 big-endian BOM.
    final byte[] head = {0x00, 0x00, (byte) 0xFE, (byte) 0xFF};
    final String detected = U.detectEncoding(head);
    assertEquals("UTF_32BE", detected, "Should return UTF_32BE for BOM 0x0000FEFF");
}

@Test
void testCase_0x0000003C() {
    // Bytes 00 00 00 3C: '<' as one big-endian 32-bit unit.
    final byte[] head = {0x00, 0x00, 0x00, 0x3C};
    final String detected = U.detectEncoding(head);
    assertEquals("UTF_32BE", detected, "Should return UTF_32BE for 0x0000003C");
}

@Test
void testCase_0x003C003F() {
    // Bytes 00 3C 00 3F: "<?" as two big-endian 16-bit units.
    final byte[] head = {0x00, 0x3C, 0x00, 0x3F};
    final String detected = U.detectEncoding(head);
    assertEquals("UnicodeBigUnmarked", detected, "Should return UnicodeBigUnmarked for 0x003C003F");
}

@Test
void testCase_0xFFFE0000() {
    // Bytes FF FE 00 00: UTF-32 little-endian BOM.
    final byte[] head = {(byte) 0xFF, (byte) 0xFE, 0x00, 0x00};
    final String detected = U.detectEncoding(head);
    assertEquals("UTF_32LE", detected, "Should return UTF_32LE for BOM 0xFFFE0000");
}

@Test
void testCase_0x3C000000() {
    // Bytes 3C 00 00 00: '<' as one little-endian 32-bit unit.
    final byte[] head = {0x3C, 0x00, 0x00, 0x00};
    final String detected = U.detectEncoding(head);
    assertEquals("UTF_32LE", detected, "Should return UTF_32LE for 0x3C000000");
}

@Test
void testCase_0x3C003F00() {
    // Bytes 3C 00 3F 00: "<?" as two little-endian 16-bit units.
    final byte[] head = {0x3C, 0x00, 0x3F, 0x00};
    final String detected = U.detectEncoding(head);
    assertEquals("UnicodeLittleUnmarked", detected, "Should return UnicodeLittleUnmarked for 0x3C003F00");
}

@Test
void testCase_0x3C3F786D() {
    // Bytes 3C 3F 78 6D: the ASCII characters "<?xm".
    final byte[] head = {0x3C, 0x3F, 0x78, 0x6D};
    final String detected = U.detectEncoding(head);
    assertEquals("UTF8", detected, "Should return UTF8 for 0x3C3F786D");
}

@Test
void testEfBbBf_UTF8() {
    // UTF-8 BOM EF BB BF followed by an arbitrary fourth byte (0x42),
    // so the top three bytes of the combined value equal 0xEFBBBF.
    final byte[] head = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 0x42};
    final String detected = U.detectEncoding(head);
    assertEquals("UTF8", detected, "Should return UTF8 for buffer with UTF-8 BOM");
}

@Test
void test_nShift24_0x3C() {
    // First byte is '<' (0x3C) but the four bytes match none of the explicit patterns.
    final byte[] head = {0x3C, 1, 2, 3};
    final String detected = U.detectEncoding(head);
    assertEquals("UTF8", detected,
            "Should return UTF8 when (n >>> 24) == 0x3C and no previous case matches");
}

@Test
void test_nShift16_0xFFFE() {
    // FF FE followed by non-zero bytes: UTF-16 little-endian BOM branch.
    final byte[] head = {(byte) 0xFF, (byte) 0xFE, 0x21, 0x22};
    final String detected = U.detectEncoding(head);
    assertEquals("UnicodeLittleUnmarked", detected,
            "Should return UnicodeLittleUnmarked when (n >> 16) == 0xFFFE");
}

@Test
void test_nShift16_0xFEFF() {
    // FE FF followed by arbitrary bytes: UTF-16 big-endian BOM branch.
    final byte[] head = {(byte) 0xFE, (byte) 0xFF, (byte) 0x99, (byte) 0x88};
    final String detected = U.detectEncoding(head);
    assertEquals("UnicodeBigUnmarked", detected,
            "Should return UnicodeBigUnmarked when (n >> 16) == 0xFEFF");
}

@Test
void testDefaultCase() {
    // Bytes that match no explicit pattern and no BOM prefix fall back to UTF8.
    final byte[] head = {0x01, 0x23, 0x45, 0x67};
    final String detected = U.detectEncoding(head);
    assertEquals("UTF8", detected, "Should default to UTF8 for unknown byte patterns");
}

@Test
void testFormatString() {
    final String multiLine = "line1\nline2\nline3";

    // Separator "\n": the text comes back untouched.
    final String unixResult = U.formatString(multiLine, "\n");
    assertEquals(multiLine, unixResult, "Should not modify string when line separator is already \\n");

    // Separator "\r\n": every "\n" is replaced.
    final String windowsResult = U.formatString(multiLine, "\r\n");
    assertEquals("line1\r\nline2\r\nline3", windowsResult, "Should replace \\n with specified line separator");

    // Empty input stays empty.
    final String emptyResult = U.formatString("", "\n");
    assertTrue(emptyResult.isEmpty(), "Should handle empty string correctly");

    // Input without any line break is returned with the same content.
    final String singleLine = "text without breaks";
    final String singleLineResult = U.formatString(singleLine, "\r\n");
    assertEquals(singleLine, singleLineResult, "Should not modify string without line breaks");
}

@Test
void testFileXmlToJson(@TempDir Path tempDir) throws IOException {
    // Input and output both live in the JUnit-managed temporary directory.
    final Path xmlPath = tempDir.resolve("test.xml");
    final Path jsonPath = tempDir.resolve("test.json");

    // Store a small XML document as UTF-8.
    final byte[] xmlBytes =
            "<?xml version=\"1.0\"?><root><item>value</item></root>".getBytes(StandardCharsets.UTF_8);
    Files.write(xmlPath, xmlBytes);

    // The conversion itself must complete without throwing.
    assertDoesNotThrow(
            () -> U.fileXmlToJson(xmlPath.toString(), jsonPath.toString()),
            "File conversion should not throw exceptions");

    // The output file must exist and contain the converted element.
    final boolean jsonWritten = Files.exists(jsonPath);
    assertTrue(jsonWritten, "JSON file should be created");

    final String jsonContent = Files.readString(jsonPath);
    assertAll(
            "JSON file content verification",
            () -> assertNotNull(jsonContent, "JSON content should not be null"),
            () -> assertTrue(
                    jsonContent.contains("\"item\": \"value\""),
                    "JSON should contain converted XML content"));
}

@Test
void testFileXmlToJsonWithInvalidInput(@TempDir Path tempDir) {
    // The input file is never created, so reading it has to fail.
    final String missingXml = tempDir.resolve("nonexistent.xml").toString();
    final String jsonTarget = tempDir.resolve("output.json").toString();

    assertThrows(
            IOException.class,
            () -> U.fileXmlToJson(missingXml, jsonTarget),
            "Should throw IOException when input file doesn't exist");
}
}
Loading