Skip to content

Commit d995e04

Browse files
authored
Merge pull request #541 from marklogic/feature/maybe-file-fix
MLE-23330 Trimming off scheme when writing URIs to files
2 parents 519b496 + 8b33ec4 commit d995e04

File tree

3 files changed

+75
-5
lines changed

3 files changed

+75
-5
lines changed

marklogic-spark-connector/src/main/java/com/marklogic/spark/writer/file/FileUtil.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,31 @@
33
*/
44
package com.marklogic.spark.writer.file;
55

6+
import com.marklogic.spark.Util;
67
import org.apache.hadoop.fs.Path;
78

9+
import java.net.URI;
10+
import java.net.URISyntaxException;
11+
812
public interface FileUtil {
913

1014
static Path makePathFromDocumentURI(String basePath, String documentURI) {
15+
// Mostly copied from MLCP.
16+
try {
17+
URI uri = new URI(documentURI);
18+
// As of 1.4.0, an opaque URI will not cause a problem due to the inclusion of "./" below.
19+
// We still parse the URI to get the path in case there is a scheme (e.g. file://).
20+
if (!uri.isOpaque()) {
21+
documentURI = uri.getPath();
22+
}
23+
} catch (URISyntaxException e) {
24+
// MLCP logs errors from parsing the URI at the "WARN" level. That seems noisy, as large numbers of URIs
25+
// could e.g. have spaces in them. So DEBUG is used instead.
26+
if (Util.MAIN_LOGGER.isDebugEnabled()) {
27+
Util.MAIN_LOGGER.debug("Unable to parse document URI: {}; will use unparsed URI as file path.", documentURI);
28+
}
29+
}
30+
1131
if (documentURI.charAt(0) == '/') {
1232
return new Path(basePath + documentURI);
1333
}

tests/src/test/java/com/marklogic/spark/writer/file/FileUtilTest.java

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,16 @@ void makePathFromRegularURI() {
1414
assertEquals("/base/path/to/doc.xml", makePath("/path/to/doc.xml"));
1515
}
1616

17+
@Test
18+
void noForwardSlash() {
19+
assertEquals("/base/path/to/doc.xml", makePath("path/to/doc.xml"));
20+
}
21+
22+
@Test
23+
void justFilename() {
24+
assertEquals("/base/doc.xml", makePath("doc.xml"));
25+
}
26+
1727
/**
1828
* This was altered in the 2.7.0 to fix a bug where a URI with two or more colons and
1929
* no leading slash causing a URISyntaxException when the Hadoop Path constructor was called. This
@@ -38,7 +48,20 @@ void uriWithSpace() {
3848
void allKindsOfStuff() {
3949
assertEquals("/base/has+lots of&/stuff_in-it.json", makePath("has+lots of&/stuff_in-it.json"));
4050
}
41-
51+
52+
@Test
53+
void fileBasedUri() {
54+
assertEquals("/base/hey.json", makePath("file://tmp/hey.json"), "Per the fixes for 1.4.0, " +
55+
"we're retaining the behavior where the scheme is removed for a non-opaque URI. This ensures we don't " +
56+
"try to files with e.g. 'http://' or 'file://' in the URI. This won't prevent issues like writing files " +
57+
"to Azure Storage, which doesn't allow colons - a colon can appear anywhere in the URI and still be valid.");
58+
}
59+
60+
@Test
61+
void httpBasedUri() {
62+
assertEquals("/base/path/to/file.xml", makePath("https://www.exampple.org/path/to/file.xml"));
63+
}
64+
4265
private String makePath(String uri) {
4366
return FileUtil.makePathFromDocumentURI("/base", uri).toString();
4467
}

tests/src/test/java/com/marklogic/spark/writer/file/WriteDocumentZipFilesTest.java

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -85,9 +85,36 @@ void opaqueURI(@TempDir Path tempDir) throws IOException {
8585
assertEquals(1, tempDir.toFile().listFiles().length);
8686
File file = tempDir.toFile().listFiles()[0];
8787
ZipFile zipFile = new ZipFile(file);
88-
assertNotNull(zipFile.getEntry("some:org:example/123.xml"), "some:org:example/123.xml is considered an 'opaque' URI per " +
89-
"the definition of java.net.URI:isOpaque. Per MLCP behavior, the URI is expected to be set to the " +
90-
"'schema-specific part', which is just example/123.xml.");
88+
assertNotNull(zipFile.getEntry(uri), "some:org:example/123.xml is considered an 'opaque' URI per " +
89+
"the definition of java.net.URI:isOpaque. As of 1.4.0 now, we no longer modify those when the document " +
90+
"is written to a zip, as zip entry names can be opaque URIs.");
91+
}
92+
93+
@Test
94+
void uriWithScheme(@TempDir Path tempDir) throws IOException {
95+
final String uri = "http://example.org/some/file.xml";
96+
97+
getDatabaseClient().newXMLDocumentManager().write(uri,
98+
TestUtil.withDefaultPermissions(new DocumentMetadataHandle()).withCollections("http-test"),
99+
new StringHandle("<hello>world</hello>"));
100+
101+
newSparkSession().read()
102+
.format(CONNECTOR_IDENTIFIER)
103+
.option(Options.CLIENT_URI, makeClientUri())
104+
.option(Options.READ_DOCUMENTS_COLLECTIONS, "http-test")
105+
.load()
106+
.repartition(1)
107+
.write()
108+
.format(CONNECTOR_IDENTIFIER)
109+
.option(Options.WRITE_FILES_COMPRESSION, "zip")
110+
.mode(SaveMode.Append)
111+
.save(tempDir.toFile().getAbsolutePath());
112+
113+
assertEquals(1, tempDir.toFile().listFiles().length);
114+
File file = tempDir.toFile().listFiles()[0];
115+
ZipFile zipFile = new ZipFile(file);
116+
assertNotNull(zipFile.getEntry(uri), "When writing URIs to a zip, we don't need to modify them at all, " +
117+
"unless we later find out there's some set of characters that are not allowed in zip entry names.");
91118
}
92119

93120
@Test
@@ -113,7 +140,7 @@ void uriWithSpaces(@TempDir Path tempDir) throws IOException {
113140
assertEquals(1, tempDir.toFile().listFiles().length);
114141
File file = tempDir.toFile().listFiles()[0];
115142
ZipFile zipFile = new ZipFile(file);
116-
assertNotNull(zipFile.getEntry("has multiple spaces.xml"));
143+
assertNotNull(zipFile.getEntry(uri));
117144
}
118145

119146
/**

0 commit comments

Comments
 (0)