Skip to content

Commit a2ccd46

Browse files
davisusanibar and kou authored
GH-36421: [Java] Enable Support for reading JSON Datasets (#36422)
### Rationale for this change

Enable support for reading JSON Datasets (#33732) on the Java side.

### What changes are included in this PR?

Support for reading JSON Datasets.

### Are these changes tested?

Unit test added.

### Are there any user-facing changes?

No

* Closes: #36421

Lead-authored-by: david dali susanibar arce <[email protected]>
Co-authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
1 parent 3f6dfb3 commit a2ccd46

File tree

6 files changed

+44
-8
lines changed

6 files changed

+44
-8
lines changed

ci/scripts/java_jni_macos_build.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ cmake \
7676
-DARROW_DEPENDENCY_USE_SHARED=OFF \
7777
-DARROW_GANDIVA=${ARROW_GANDIVA} \
7878
-DARROW_GANDIVA_STATIC_LIBSTDCPP=ON \
79+
-DARROW_JSON=${ARROW_DATASET} \
7980
-DARROW_ORC=${ARROW_ORC} \
8081
-DARROW_PARQUET=${ARROW_PARQUET} \
8182
-DARROW_S3=${ARROW_S3} \

docs/source/java/dataset.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ Currently supported file formats are:
4343
- Apache ORC (``.orc``)
4444
- Apache Parquet (``.parquet``)
4545
- Comma-Separated Values (``.csv``)
46+
- Line-delimited JSON Values (``.json``)
4647

4748
Below is the simplest example of using Dataset to query a Parquet file in Java:
4849

java/dataset/src/main/cpp/jni_wrapper.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,10 @@ arrow::Result<std::shared_ptr<arrow::dataset::FileFormat>> GetFileFormat(
105105
#ifdef ARROW_CSV
106106
case 3:
107107
return std::make_shared<arrow::dataset::CsvFileFormat>();
108+
#endif
109+
#ifdef ARROW_JSON
110+
case 4:
111+
return std::make_shared<arrow::dataset::JsonFileFormat>();
108112
#endif
109113
default:
110114
std::string error_message =

java/dataset/src/main/java/org/apache/arrow/dataset/file/FileFormat.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ public enum FileFormat {
2525
ARROW_IPC(1),
2626
ORC(2),
2727
CSV(3),
28+
JSON(4),
2829
NONE(-1);
2930

3031
private final int id;

java/dataset/src/test/java/org/apache/arrow/dataset/CsvWriteSupport.java renamed to java/dataset/src/test/java/org/apache/arrow/dataset/TextBasedWriteSupport.java

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,17 +24,18 @@
2424
import java.net.URISyntaxException;
2525
import java.util.Random;
2626

27-
public class CsvWriteSupport {
27+
public class TextBasedWriteSupport {
2828
private final URI uri;
2929
private final Random random = new Random();
3030

31-
public CsvWriteSupport(File outputFolder) throws URISyntaxException {
32-
uri = new URI("file", outputFolder.getPath() + "/" + "generated-" + random.nextLong() + ".csv", null);
31+
public TextBasedWriteSupport(File outputFolder, String fileExtension) throws URISyntaxException {
32+
uri = new URI("file", outputFolder.getPath() + File.separator +
33+
"generated-" + random.nextLong() + fileExtension, null);
3334
}
3435

35-
public static CsvWriteSupport writeTempFile(File outputFolder, String... values)
36+
public static TextBasedWriteSupport writeTempFile(File outputFolder, String fileExtension, String... values)
3637
throws URISyntaxException, IOException {
37-
CsvWriteSupport writer = new CsvWriteSupport(outputFolder);
38+
TextBasedWriteSupport writer = new TextBasedWriteSupport(outputFolder, fileExtension);
3839
try (FileWriter addValues = new FileWriter(new File(writer.uri), true)) {
3940
for (Object value : values) {
4041
addValues.write(value + "\n");

java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,9 @@
3737
import java.util.concurrent.Executors;
3838
import java.util.stream.Collectors;
3939

40-
import org.apache.arrow.dataset.CsvWriteSupport;
4140
import org.apache.arrow.dataset.OrcWriteSupport;
4241
import org.apache.arrow.dataset.ParquetWriteSupport;
42+
import org.apache.arrow.dataset.TextBasedWriteSupport;
4343
import org.apache.arrow.dataset.jni.NativeDataset;
4444
import org.apache.arrow.dataset.jni.NativeInstanceReleasedException;
4545
import org.apache.arrow.dataset.jni.NativeMemoryPool;
@@ -407,8 +407,8 @@ public void testBaseOrcRead() throws Exception {
407407

408408
@Test
409409
public void testBaseCsvRead() throws Exception {
410-
CsvWriteSupport writeSupport = CsvWriteSupport.writeTempFile(
411-
TMP.newFolder(), "Name,Language", "Juno,Java", "Peter,Python", "Celin,C++");
410+
TextBasedWriteSupport writeSupport = TextBasedWriteSupport.writeTempFile(
411+
TMP.newFolder(), ".csv", "Name,Language", "Juno,Java", "Peter,Python", "Celin,C++");
412412
String expectedJsonUnordered = "[[\"Juno\", \"Java\"], [\"Peter\", \"Python\"], [\"Celin\", \"C++\"]]";
413413
ScanOptions options = new ScanOptions(100);
414414
try (
@@ -429,6 +429,34 @@ public void testBaseCsvRead() throws Exception {
429429
}
430430
}
431431

432+
@Test
433+
public void testBaseJsonRead() throws Exception {
434+
TextBasedWriteSupport writeSupport = TextBasedWriteSupport.writeTempFile(
435+
TMP.newFolder(), ".json",
436+
"{\"Type\": \"Compiled\", \"Language\": \"Java\"}",
437+
"{\"Type\": \"Interpreted\", \"Language\": \"Python\"}");
438+
String expectedJsonUnordered = "[[\"Compiled\", \"Java\"], " +
439+
"[\"Interpreted\", \"Python\"]]";
440+
ScanOptions options = new ScanOptions(100);
441+
try (
442+
FileSystemDatasetFactory factory = new FileSystemDatasetFactory(rootAllocator(), NativeMemoryPool.getDefault(),
443+
FileFormat.JSON, writeSupport.getOutputURI())
444+
) {
445+
List<ArrowRecordBatch> datum = collectResultFromFactory(factory, options);
446+
Schema schema = inferResultSchemaFromFactory(factory, options);
447+
448+
assertScanBatchesProduced(factory, options);
449+
assertEquals(1, datum.size());
450+
assertEquals(2, schema.getFields().size());
451+
assertEquals("Type", schema.getFields().get(0).getName());
452+
assertEquals("Language", schema.getFields().get(1).getName());
453+
454+
checkParquetReadResult(schema, expectedJsonUnordered, datum);
455+
456+
AutoCloseables.close(datum);
457+
}
458+
}
459+
432460
private void checkParquetReadResult(Schema schema, String expectedJson, List<ArrowRecordBatch> actual)
433461
throws IOException {
434462
final ObjectMapper json = new ObjectMapper();

0 commit comments

Comments
 (0)