Commit 0d2610d

MLE-24402 Removing deprecated fileRows.documentType option
Deprecated in 2.3.0, can be removed for 3.0.0
1 parent 97c0acc commit 0d2610d

10 files changed: 24 additions, 51 deletions

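For anyone upgrading, the change is a straight rename: the deprecated `spark.marklogic.write.fileRows.documentType` option is gone and its behavior is covered entirely by `spark.marklogic.write.documentType`, which accepts the same values of `JSON`, `XML`, and `TEXT`. A minimal before/after sketch (the surrounding DataFrame write is assumed and not part of this commit):

```java
// Before (deprecated since 2.3.0, removed by this commit):
.option("spark.marklogic.write.fileRows.documentType", "JSON")

// After:
.option("spark.marklogic.write.documentType", "JSON")
```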

docs/configuration.md

Lines changed: 1 addition & 1 deletion
@@ -189,8 +189,8 @@ The following options control how the connector writes rows as documents to Mark
 | spark.marklogic.write.abortOnFailure | Whether the Spark job should abort if a batch fails to be written; defaults to `true`. |
 | spark.marklogic.write.batchSize | The number of documents written in a call to MarkLogic; defaults to 100. |
 | spark.marklogic.write.collections | Comma-delimited string of collection names to add to each document. |
+| spark.marklogic.write.documentType | Forces a document type when MarkLogic does not recognize a URI extension; must be one of `JSON`, `XML`, or `TEXT`. |
 | spark.marklogic.write.permissions | Comma-delimited string of role names and capabilities to add to each document - e.g. role1,read,role2,update,role3,execute . |
-| spark.marklogic.write.fileRows.documentType | Forces a document type when MarkLogic does not recognize a URI extension; must be one of `JSON`, `XML`, or `TEXT`. |
 | spark.marklogic.write.jsonRootName | As of 2.3.0, specifies a root field name when writing JSON documents based on arbitrary rows. |
 | spark.marklogic.write.temporalCollection | Name of a temporal collection to assign each document to. |
 | spark.marklogic.write.threadCount | The number of threads used across all partitions to send documents to MarkLogic; defaults to 4. |
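
As a quick illustration of how several of the options in this table combine, here is a hedged sketch of a Spark write through the connector; the `marklogic` format identifier, connection string, input path, collection, and role names are placeholders rather than values taken from this commit:

```java
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class WriteOptionsExample {
    public static void main(String[] args) {
        SparkSession session = SparkSession.builder().master("local[*]").getOrCreate();

        // Read arbitrary rows from a placeholder JSON file, then write each row to
        // MarkLogic as a document using a handful of the write options listed above.
        session.read().json("path/to/input.json")
            .write()
            .format("marklogic") // assumed connector identifier
            .option("spark.marklogic.client.uri", "user:password@localhost:8000") // placeholder connection
            .option("spark.marklogic.write.collections", "example-data")
            .option("spark.marklogic.write.permissions", "rest-reader,read,rest-writer,update")
            .option("spark.marklogic.write.batchSize", 200)
            .option("spark.marklogic.write.threadCount", 8)
            .mode(SaveMode.Append)
            .save();
    }
}
```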

docs/reading-data/reading-files/generic-file-support.md

Lines changed: 2 additions & 2 deletions
@@ -92,10 +92,10 @@ If you are writing files with extensions that MarkLogic does not recognize based
 you can force a document type for each file with an unrecognized extension:
 
 ```
-.option("spark.marklogic.write.fileRows.documentType", "JSON")
+.option("spark.marklogic.write.documentType", "JSON")
 ```
 
-The `spark.marklogic.write.fileRows.documentType` option supports values of `JSON`, `XML`, and `TEXT`.
+The `spark.marklogic.write.documentType` option supports values of `JSON`, `XML`, and `TEXT`.
 
 Please see [the guide on writing data](../../writing.md) for information on how "file rows" can then be written to
 MarkLogic as documents.
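
To make the option usage above concrete, here is a hedged end-to-end sketch built on Spark's binaryFile data source; the `.unknown` extension, the paths, the connection string, and the `marklogic` format identifier are illustrative assumptions, not values from this commit:

```java
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class ForceDocumentTypeExample {
    public static void main(String[] args) {
        SparkSession session = SparkSession.builder().master("local[*]").getOrCreate();

        // Read files whose extension MarkLogic would not recognize, then force each
        // resulting document to be written as JSON.
        session.read().format("binaryFile")
            .load("path/to/files/*.unknown")
            .write()
            .format("marklogic") // assumed connector identifier
            .option("spark.marklogic.client.uri", "user:password@localhost:8000") // placeholder connection
            .option("spark.marklogic.write.documentType", "JSON")
            .option("spark.marklogic.write.collections", "forced-json")
            .mode(SaveMode.Append)
            .save();
    }
}
```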

docs/writing.md

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ The URI can then be further adjusted as described in the "Controlling document U
 This feature allows for ingesting files of any type. The MarkLogic REST API will
 [determine the document type](https://docs.marklogic.com/guide/rest-dev/intro#id_53367) based on the URI extension, if
 MarkLogic recognizes it. If MarkLogic does not recognize the extension, and you wish to force a document type on each of
-the documents, you can set the `spark.marklogic.write.fileRows.documentType` option to one of `XML`, `JSON`, or `TEXT`.
+the documents, you can set the `spark.marklogic.write.documentType` option to one of `XML`, `JSON`, or `TEXT`.
 
 ### Writing document rows
 
gradle.properties

Lines changed: 3 additions & 0 deletions
@@ -25,3 +25,6 @@ org.gradle.java.installations.paths=/users/ml/builder/java/jdk-11.0.2,/home/buil
 semaphoreHost=changeme
 semaphoreApiKey=changeme
 semaphorePath=/cls/dev/cs1/
+
+# Bumping this up as with Spark 4, Gradle often runs into heap space issues when trying to build the project.
+org.gradle.jvmargs=-Xmx4g

marklogic-spark-connector/src/main/java/com/marklogic/spark/Options.java

Lines changed: 0 additions & 10 deletions
@@ -355,16 +355,6 @@ public abstract class Options {
     public static final String WRITE_GRAPH = "spark.marklogic.write.graph";
     public static final String WRITE_GRAPH_OVERRIDE = "spark.marklogic.write.graphOverride";
 
-    /**
-     * For writing rows adhering to Spark's binaryFile schema - https://spark.apache.org/docs/latest/sql-data-sources-binaryFile.html .
-     *
-     * @deprecated since 2.3.0
-     */
-    @Deprecated(since = "2.3.0", forRemoval = true)
-    // We don't need Sonar to remind us of this deprecation.
-    @SuppressWarnings("java:S1133")
-    public static final String WRITE_FILE_ROWS_DOCUMENT_TYPE = "spark.marklogic.write.fileRows.documentType";
-
     // Forces a document type when writing rows corresponding to our document row schema. Used when the URI extension
     // does not result in MarkLogic choosing the correct document type.
     public static final String WRITE_DOCUMENT_TYPE = "spark.marklogic.write.documentType";
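
Because the option name is exposed as a constant, code inside the connector refers to it rather than the raw string; the updated `WriteFileRowsTest` below does exactly that. A small fragment in that style (the "jSoN" value mirrors the test, which verifies the value is upper-cased):

```java
.option(Options.WRITE_DOCUMENT_TYPE, "jSoN")
```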

marklogic-spark-connector/src/main/java/com/marklogic/spark/writer/FileRowConverter.java

Lines changed: 1 addition & 7 deletions
@@ -19,7 +19,7 @@
 import java.util.stream.Stream;
 
 /**
- * Knows how to build a document from a row corresponding to our {@code FileRowSchema}.
+ * Knows how to build a document from a row corresponding to a row from Spark's binaryFile data source.
  */
 class FileRowConverter implements RowConverter {
 
@@ -47,16 +47,10 @@ public Iterator<DocumentInputs> getRemainingDocumentInputs() {
         return Stream.<DocumentInputs>empty().iterator();
     }
 
-    @SuppressWarnings({"deprecation", "removal"})
     private void forceFormatIfNecessary(BytesHandle content) {
         Format format = writeContext.getDocumentFormat();
         if (format != null) {
             content.withFormat(format);
-        } else {
-            format = writeContext.getDeprecatedFileRowsDocumentFormat();
-            if (format != null) {
-                content.withFormat(format);
-            }
         }
     }
 
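With the deprecated fallback gone, `forceFormatIfNecessary` only consults `WriteContext.getDocumentFormat()` and, when a format is configured, applies it to the handle carrying the file's bytes. As a hedged standalone illustration of that MarkLogic Java Client call (the byte array is a placeholder):

```java
import com.marklogic.client.io.BytesHandle;
import com.marklogic.client.io.Format;

public class FormatExample {
    public static void main(String[] args) {
        byte[] content = "{\"hello\":\"world\"}".getBytes();
        // Mirrors what forceFormatIfNecessary does when a document type has been configured.
        BytesHandle handle = new BytesHandle(content).withFormat(Format.JSON);
        System.out.println(handle.getFormat());
    }
}
```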
marklogic-spark-connector/src/main/java/com/marklogic/spark/writer/WriteContext.java

Lines changed: 0 additions & 22 deletions
@@ -181,28 +181,6 @@ public Format getDocumentFormat() {
         return null;
     }
 
-    /**
-     * @deprecated since 2.3.0; users should use getDocumentFormat instead.
-     */
-    @Deprecated(since = "2.3.0")
-    // We don't need Sonar to remind us of this deprecation.
-    @SuppressWarnings({"java:S1133", "removal"})
-    Format getDeprecatedFileRowsDocumentFormat() {
-        final String deprecatedOption = Options.WRITE_FILE_ROWS_DOCUMENT_TYPE;
-        if (hasOption(deprecatedOption)) {
-            String value = getStringOption(deprecatedOption);
-            Objects.requireNonNull(value);
-            try {
-                return Format.valueOf(value.toUpperCase());
-            } catch (IllegalArgumentException e) {
-                String message = "Invalid value for %s: %s; must be one of 'JSON', 'XML', or 'TEXT'.";
-                String optionAlias = getOptionNameForMessage(deprecatedOption);
-                throw new ConnectorException(String.format(message, optionAlias, value));
-            }
-        }
-        return null;
-    }
-
     /**
      * The URI template approach will typically be used with rows with an "arbitrary" schema where each column value
      * may be useful in constructing a URI.

marklogic-spark-connector/src/main/resources/marklogic-spark-messages.properties

Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,4 @@
+# Copyright (c) 2023-2025 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
 # Defines various messages for the connector. Intended to be inherited and overridden by the ETL tool via
 # marklogic-spark-messages_en.properties, where each option name can be associated with a CLI option in the ETL tool.
 spark.marklogic.client.uri=
@@ -7,7 +8,6 @@ spark.marklogic.read.numPartitions=
 spark.marklogic.read.noOpticQuery=No Optic query found; must define spark.marklogic.read.opticQuery
 spark.marklogic.write.batchSize=
 spark.marklogic.write.documentType=
-spark.marklogic.write.fileRows.documentType=
 spark.marklogic.write.graph=
 spark.marklogic.write.graphOverride=
 spark.marklogic.write.jsonRootName=

marklogic-spark-connector/src/test/java/com/marklogic/spark/reader/document/ReadDocumentRowsTest.java

Lines changed: 12 additions & 2 deletions
@@ -315,11 +315,21 @@ void transformThrowsError() {
             .load();
 
         SparkException ex = assertThrows(SparkException.class, dataset::count);
-        assertTrue(ex.getMessage().contains("This is an intentional error for testing purposes."),
+        String message = ex.getMessage();
+
+        if (message.contains("java.io.InterruptedIOException")) {
+            // This test can sometimes fail due to a timeout while being run by Jenkins. No idea why that happens.
+            // In the event that this happens, we try again.
+            logger.warn("Unexpected timeout error, will try again: {}", message);
+            ex = assertThrows(SparkException.class, dataset::count);
+            message = ex.getMessage();
+        }
+
+        assertTrue(message.contains("This is an intentional error for testing purposes."),
             "When the transform throws an error, our connector throws a ConnectorException, but Spark seems to wrap " +
                 "its stacktrace into a SparkException, such that we can't access the original ConnectorException " +
                 "object. But the transform error should be in the error message. " +
-                "Actual message: " + ex.getMessage());
+                "Actual message: " + message);
     }
 
     private DataFrameReader startRead() {

marklogic-spark-connector/src/test/java/com/marklogic/spark/writer/WriteFileRowsTest.java

Lines changed: 3 additions & 5 deletions
@@ -114,7 +114,6 @@ void uriTemplate() {
     }
 
     @Test
-    @Deprecated
     void forceDocumentType() {
         newSparkSession()
             .read()
@@ -124,7 +123,7 @@ void forceDocumentType() {
             .format(CONNECTOR_IDENTIFIER)
             .options(defaultWriteOptions())
             // Verifies that the value gets capitalized.
-            .option(Options.WRITE_FILE_ROWS_DOCUMENT_TYPE, "jSoN")
+            .option(Options.WRITE_DOCUMENT_TYPE, "jSoN")
             .option(Options.WRITE_COLLECTIONS, "json-unrecognized-extension")
             .mode(SaveMode.Append)
             .save();
@@ -138,7 +137,6 @@ void forceDocumentType() {
     }
 
     @Test
-    @Deprecated
     void invalidDocumentType() {
         DataFrameWriter writer = newSparkSession()
             .read()
@@ -147,13 +145,13 @@ void invalidDocumentType() {
             .write()
             .format(CONNECTOR_IDENTIFIER)
             .option(Options.CLIENT_URI, makeClientUri())
-            .option(Options.WRITE_FILE_ROWS_DOCUMENT_TYPE, "not valid")
+            .option(Options.WRITE_DOCUMENT_TYPE, "not valid")
             .mode(SaveMode.Append);
 
         SparkException ex = assertThrows(SparkException.class, writer::save);
         assertTrue(ex.getCause() instanceof ConnectorException);
         ConnectorException ce = (ConnectorException) ex.getCause();
-        assertEquals("Invalid value for " + Options.WRITE_FILE_ROWS_DOCUMENT_TYPE + ": not valid; " +
+        assertEquals("Invalid value for " + Options.WRITE_DOCUMENT_TYPE + ": not valid; " +
             "must be one of 'JSON', 'XML', or 'TEXT'.", ce.getMessage());
     }
 