
Commit 5e51ef6

Merge pull request #484 from marklogic/feature/medium-polaris
Fixing medium Polaris issues
2 parents f8993b0 + b884e63

16 files changed: +61 -64 lines changed

marklogic-spark-api/src/main/java/com/marklogic/spark/core/extraction/TikaTextExtractor.java

Lines changed: 2 additions & 0 deletions
```diff
@@ -15,6 +15,7 @@
 import java.io.IOException;
 import java.util.LinkedHashMap;
 import java.util.Map;
+import java.util.Objects;
 import java.util.Optional;
 
 public class TikaTextExtractor implements TextExtractor {
@@ -29,6 +30,7 @@ public Optional<ExtractionResult> extractText(DocumentInputs inputs) {
             return Optional.empty();
         }
 
+        Objects.requireNonNull(inputs.getContentAsBytes());
         try (ByteArrayInputStream stream = new ByteArrayInputStream(inputs.getContentAsBytes())) {
             Metadata metadata = new Metadata();
             String extractedText = tika.parseToString(stream, metadata);
```
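Most of the changes in this commit follow the same pattern: a value that the static analyzer (Polaris) flags as possibly null is passed through `Objects.requireNonNull` before use, so a missing value fails fast at the guard rather than deeper inside library code. A minimal, self-contained sketch of the pattern (names are illustrative, not from the connector):

```java
import java.util.Objects;

public class NullGuardSketch {

    public static void main(String[] args) {
        byte[] content = loadContent();
        // Fails immediately with a NullPointerException if content is null,
        // rather than somewhere inside downstream parsing code. It also
        // documents the non-null expectation for static analysis.
        Objects.requireNonNull(content, "content must not be null");
        System.out.println(content.length + " bytes");
    }

    // Hypothetical stand-in for a call like inputs.getContentAsBytes().
    private static byte[] loadContent() {
        return new byte[]{1, 2, 3};
    }
}
```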

marklogic-spark-api/src/main/java/com/marklogic/spark/dom/DOMHelper.java

Lines changed: 2 additions & 0 deletions
```diff
@@ -26,6 +26,7 @@
 import javax.xml.xpath.XPathFactory;
 import java.io.ByteArrayOutputStream;
 import java.io.StringReader;
+import java.util.Objects;
 
 /**
  * Simplifies operations with the Java DOM API.
@@ -55,6 +56,7 @@ public Document extractDocument(AbstractWriteHandle handle, String sourceUri) {
         }
 
         String xml = HandleAccessor.contentAsString(handle);
+        Objects.requireNonNull(xml);
         return parseXmlString(xml, sourceUri);
     }
 
```

marklogic-spark-connector/src/main/java/com/marklogic/spark/reader/document/DocumentContext.java

Lines changed: 7 additions & 2 deletions
```diff
@@ -11,6 +11,7 @@
 import org.apache.spark.sql.util.CaseInsensitiveStringMap;
 
 import java.util.Map;
+import java.util.Objects;
 
 class DocumentContext extends ContextSupport {
 
@@ -42,7 +43,9 @@ SearchQueryDefinition buildSearchQuery(DatabaseClient client) {
         // REST API allows commas in URIs, but not newlines, so that's safe to use as a delimiter.
         String[] uris = null;
         if (hasOption(Options.READ_DOCUMENTS_URIS)) {
-            uris = getStringOption(Options.READ_DOCUMENTS_URIS).split("\n");
+            String value = getStringOption(Options.READ_DOCUMENTS_URIS);
+            Objects.requireNonNull(value);
+            uris = value.split("\n");
         }
         return new SearchQueryBuilder()
             .withStringQuery(props.get(Options.READ_DOCUMENTS_STRING_QUERY))
@@ -61,7 +64,9 @@ SearchQueryDefinition buildTriplesSearchQuery(DatabaseClient client) {
         final Map<String, String> props = getProperties();
         String[] uris = null;
         if (hasOption(Options.READ_TRIPLES_URIS)) {
-            uris = getStringOption(Options.READ_TRIPLES_URIS).split("\n");
+            String value = getStringOption(Options.READ_TRIPLES_URIS);
+            Objects.requireNonNull(value);
+            uris = value.split("\n");
         }
         return new SearchQueryBuilder()
             .withStringQuery(props.get(Options.READ_TRIPLES_STRING_QUERY))
```
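The comment in the first hunk explains the delimiter choice: commas are legal inside URIs, so a comma-separated list could split a single URI apart, while newlines cannot appear in URIs accepted by the REST API. A small sketch of the guarded split (values are hypothetical):

```java
import java.util.Objects;

public class UriSplitSketch {

    public static void main(String[] args) {
        // The comma inside the first URI survives because "\n" is the
        // delimiter; splitting on "," would break that URI apart.
        String value = "/docs/a,b.json\n/docs/c.json";
        Objects.requireNonNull(value);
        String[] uris = value.split("\n");
        for (String uri : uris) {
            System.out.println(uri); // "/docs/a,b.json" then "/docs/c.json"
        }
    }
}
```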

marklogic-spark-connector/src/main/java/com/marklogic/spark/reader/document/DocumentRowSchema.java

Lines changed: 8 additions & 2 deletions
```diff
@@ -11,6 +11,8 @@
 import org.apache.spark.sql.types.StructField;
 import org.apache.spark.sql.types.StructType;
 
+import java.util.Objects;
+
 public abstract class DocumentRowSchema {
 
     public static final StructType SCHEMA = new StructType()
@@ -71,16 +73,19 @@ public static DocumentMetadataHandle makeDocumentMetadata(InternalRow row) {
     private static void addCollectionsToMetadata(InternalRow row, DocumentMetadataHandle metadata) {
         if (!row.isNullAt(3)) {
             ArrayData collections = row.getArray(3);
+            Objects.requireNonNull(collections);
             for (int i = 0; i < collections.numElements(); i++) {
-                String value = collections.get(i, DataTypes.StringType).toString();
-                metadata.getCollections().add(value);
+                Object value = collections.get(i, DataTypes.StringType);
+                Objects.requireNonNull(value);
+                metadata.getCollections().add(value.toString());
             }
         }
     }
 
     private static void addPermissionsToMetadata(InternalRow row, DocumentMetadataHandle metadata) {
         if (!row.isNullAt(4)) {
             MapData permissions = row.getMap(4);
+            Objects.requireNonNull(permissions);
             ArrayData roles = permissions.keyArray();
             ArrayData capabilities = permissions.valueArray();
             for (int i = 0; i < roles.numElements(); i++) {
@@ -109,6 +114,7 @@ private static void addPropertiesToMetadata(InternalRow row, DocumentMetadataHan
     private static void addMetadataValuesToMetadata(InternalRow row, DocumentMetadataHandle metadata) {
         if (!row.isNullAt(7)) {
             MapData properties = row.getMap(7);
+            Objects.requireNonNull(properties);
             ArrayData keys = properties.keyArray();
             ArrayData values = properties.valueArray();
             for (int i = 0; i < keys.numElements(); i++) {
```
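The two-step rewrite in addCollectionsToMetadata matters because Catalyst's `ArrayData.get(i, DataTypes.StringType)` returns an `Object` (a `UTF8String` for string columns) that can in principle be null; splitting the call lets the guard run before `toString()`. A rough sketch, assuming Spark's catalyst classes are on the classpath:

```java
import org.apache.spark.sql.catalyst.util.ArrayData;
import org.apache.spark.sql.catalyst.util.GenericArrayData;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.unsafe.types.UTF8String;

import java.util.Objects;

public class ArrayDataSketch {

    public static void main(String[] args) {
        // Catalyst stores string values as UTF8String instances.
        ArrayData collections = new GenericArrayData(new Object[]{
            UTF8String.fromString("collection-1"),
            UTF8String.fromString("collection-2")
        });
        for (int i = 0; i < collections.numElements(); i++) {
            // Guard before calling toString(), mirroring the change above.
            Object value = collections.get(i, DataTypes.StringType);
            Objects.requireNonNull(value);
            System.out.println(value.toString());
        }
    }
}
```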

marklogic-spark-connector/src/main/java/com/marklogic/spark/reader/document/ForestReader.java

Lines changed: 4 additions & 1 deletion
```diff
@@ -24,6 +24,7 @@
 import org.slf4j.LoggerFactory;
 
 import java.util.List;
+import java.util.Objects;
 import java.util.Set;
 
 /**
@@ -116,7 +117,9 @@ public InternalRow get() {
         DocumentRecord document = this.currentDocumentPage.next();
         DocumentRowBuilder builder = new DocumentRowBuilder(requestedMetadata).withUri(document.getUri());
         if (this.contentWasRequested) {
-            builder.withContent(document.getContent(new BytesHandle()).get());
+            BytesHandle content = document.getContent(new BytesHandle());
+            Objects.requireNonNull(content);
+            builder.withContent(content.get());
             builder.withFormat(document.getFormat() != null ? document.getFormat().toString() : Format.UNKNOWN.toString());
         }
         if (!requestedMetadata.isEmpty()) {
```

marklogic-spark-connector/src/main/java/com/marklogic/spark/reader/document/OpticTriplesReader.java

Lines changed: 2 additions & 0 deletions
```diff
@@ -25,6 +25,7 @@
 import java.net.URISyntaxException;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Objects;
 
 /**
  * Reads triples from a batch of document URIs via the Optic fromTriples data accessor.
@@ -105,6 +106,7 @@ public void close() {
     }
 
     private void readNextBatchOfTriples(List<String> uris) {
+        Objects.requireNonNull(uris);
         PlanBuilder.ModifyPlan plan = op
             .fromTriples(op.pattern(op.col("subject"), op.col("predicate"), op.col(OBJECT_COLUMN), op.graphCol(GRAPH_COLUMN)))
             .where(op.cts.documentQuery(op.xs.stringSeq(uris.toArray(new String[0]))));
```

marklogic-spark-connector/src/main/java/com/marklogic/spark/reader/file/ArchiveFileReader.java

Lines changed: 2 additions & 1 deletion
```diff
@@ -16,6 +16,7 @@
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Objects;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipInputStream;
 
@@ -173,7 +174,7 @@ private boolean readMetadataFollowedByContent() throws IOException {
 
         // We still do this to get the stream ready to read the next entry.
         ZipEntry contentZipEntry = FileUtil.findNextFileEntry(currentZipInputStream);
-
+        Objects.requireNonNull(contentZipEntry);
         DocumentRowBuilder rowBuilder = new DocumentRowBuilder(this.metadataCategories)
             .withUri(contentZipEntry.getName())
             .withMetadata(metadata);
```

marklogic-spark-connector/src/main/java/com/marklogic/spark/reader/file/MlcpMetadataConverter.java

Lines changed: 5 additions & 0 deletions
```diff
@@ -17,6 +17,7 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Objects;
 
 /**
  * Handles converting an MLCP metadata document, generated when creating an MLCP archive, into a
@@ -54,6 +55,9 @@ private Format getFormat(Element mlcpMetadata) {
         Element format = mlcpMetadata.getChild("format");
         if (format != null && format.getChild("name") != null) {
             String value = format.getChildText("name");
+            if (value == null) {
+                return null;
+            }
             // MLCP uses "text()" for an unknown reason.
             if (value.startsWith("text")) {
                 value = "text";
@@ -137,6 +141,7 @@ private void addPermissions(Element mlcpMetadata, DocumentMetadataHandle restMet
         Element perms = this.saxBuilder.build(new StringReader(permString.getText())).getRootElement();
         for (Element perm : perms.getChildren("permission", SECURITY_NAMESPACE)) {
             String capability = perm.getChildText("capability", SECURITY_NAMESPACE);
+            Objects.requireNonNull(capability);
             DocumentMetadataHandle.Capability cap = DocumentMetadataHandle.Capability.valueOf(capability.toUpperCase());
             String roleId = perm.getChildText("role-id", SECURITY_NAMESPACE);
             String roleName = roleIdsToNames.get(roleId);
```
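The guard before `Capability.valueOf` is worth a note: `Enum.valueOf` throws NullPointerException on a null name and IllegalArgumentException on an unknown one, so `requireNonNull` surfaces the null case right where the MLCP metadata is read. A small sketch (the capability string is hypothetical):

```java
import com.marklogic.client.io.DocumentMetadataHandle;

import java.util.Objects;

public class CapabilitySketch {

    public static void main(String[] args) {
        // MLCP metadata stores lower-case names, so upper-casing is needed
        // before resolving the enum constant.
        String capability = "read";
        Objects.requireNonNull(capability);
        DocumentMetadataHandle.Capability cap =
            DocumentMetadataHandle.Capability.valueOf(capability.toUpperCase());
        System.out.println(cap); // READ
    }
}
```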

marklogic-spark-connector/src/main/java/com/marklogic/spark/reader/file/xml/UriElementExtractingReader.java

Lines changed: 8 additions & 8 deletions
```diff
@@ -13,24 +13,24 @@
  */
 class UriElementExtractingReader extends StreamReaderDelegate {
 
-    private XMLStreamReader source;
+    private XMLStreamReader reader;
     private final String uriNamespace;
     private final String uriElement;
 
     // Used to track when the URI element is detected.
     private boolean isReadingUriElement;
     private String uriValue;
 
-    UriElementExtractingReader(XMLStreamReader source, String uriNamespace, String uriElement) {
-        super(source);
-        this.source = source;
+    UriElementExtractingReader(XMLStreamReader reader, String uriNamespace, String uriElement) {
+        super(reader);
+        this.reader = reader;
         this.uriNamespace = uriNamespace;
         this.uriElement = uriElement;
     }
 
     @Override
     public int next() throws XMLStreamException {
-        int value = source.next();
+        int value = super.next();
         if (value == XMLStreamConstants.START_ELEMENT) {
             // Only use the first instance of the URI element that is found.
             if (matchesUriElement() && this.uriValue == null) {
@@ -39,7 +39,7 @@ public int next() throws XMLStreamException {
             }
         } else if (value == XMLStreamConstants.CHARACTERS) {
             if (this.isReadingUriElement) {
-                this.uriValue += source.getText();
+                this.uriValue += reader.getText();
             }
         } else if (value == XMLStreamConstants.END_ELEMENT && this.isReadingUriElement && matchesUriElement()) {
             this.isReadingUriElement = false;
@@ -48,8 +48,8 @@ public int next() throws XMLStreamException {
     }
 
     private boolean matchesUriElement() {
-        return source.getLocalName().equals(uriElement) &&
-            (this.uriNamespace == null || this.uriNamespace.equals(source.getNamespaceURI()));
+        return reader.getLocalName().equals(uriElement) &&
+            (this.uriNamespace == null || this.uriNamespace.equals(reader.getNamespaceURI()));
     }
 
     String getUriValue() {
```
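The rename from `source` to `reader` is cosmetic, but the switch from `source.next()` to `super.next()` is substantive: `StreamReaderDelegate` forwards each call to the reader it wraps, so going through `super` keeps the delegation chain intact if another delegate is ever layered in between. A minimal sketch of the same pattern, assuming only the JDK's StAX API:

```java
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.stream.util.StreamReaderDelegate;

// Counts START_ELEMENT events while leaving all other behavior to the
// wrapped reader. Calling super.next() rather than a captured reference
// preserves the delegation chain.
class ElementCountingReader extends StreamReaderDelegate {

    private int elementCount;

    ElementCountingReader(XMLStreamReader reader) {
        super(reader);
    }

    @Override
    public int next() throws XMLStreamException {
        int event = super.next();
        if (event == XMLStreamConstants.START_ELEMENT) {
            elementCount++;
        }
        return event;
    }

    int getElementCount() {
        return elementCount;
    }
}
```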

marklogic-spark-connector/src/main/java/com/marklogic/spark/reader/optic/PlanAnalyzer.java

Lines changed: 4 additions & 3 deletions
```diff
@@ -71,7 +71,7 @@ private PlanAnalysis readRowsInSingleCallToMarkLogic(String dslQuery) {
         return new PlanAnalysis(plan, Arrays.asList(PlanAnalysis.Partition.singleCallPartition()), 0);
     }
 
-    static List<PlanAnalysis.Partition> calculatePartitions(long rowCount, long userPartitionCount, long userBatchSize) {
+    static List<PlanAnalysis.Partition> calculatePartitions(final long rowCount, final long userPartitionCount, final long userBatchSize) {
         final long batchSize = userBatchSize > 0 ? userBatchSize : Long.parseLong("-1");
 
         long bucketsPerPartition = calculateBucketsPerPartition(rowCount, userPartitionCount, batchSize);
@@ -91,8 +91,9 @@ static List<PlanAnalysis.Partition> calculatePartitions(long rowCount, long user
      * The number of buckets per partition is always the same, as the random distribution of row IDs means we don't know
      * how rows will be distributed across buckets.
      */
-    private static long calculateBucketsPerPartition(long rowCount, long userPartitionCount, long batchSize) {
-        double rawBucketsPerPartition = ((double) rowCount / userPartitionCount) / batchSize;
+    private static long calculateBucketsPerPartition(final long rowCount, final long userPartitionCount, final long batchSize) {
+        final long divisor = userPartitionCount == 0 ? 1 : userPartitionCount;
+        double rawBucketsPerPartition = ((double) rowCount / divisor) / batchSize;
         // ceil is used here to ensure that given the batch size, a bucket typically will not have more rows in it
         // than the batch size. That's not guaranteed, as row IDs could have a distribution such that many rows are in
         // one particular bucket.
```
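The new divisor guard avoids a division by zero when no partition count is supplied. Because the dividend is cast to double, a zero divisor would not throw but would produce Infinity, which a ceiling and long cast then turn into a nonsense bucket count. A rough sketch of the arithmetic, assuming the method rounds up with Math.ceil as its comment describes:

```java
public class BucketMathSketch {

    static long calculateBucketsPerPartition(long rowCount, long userPartitionCount, long batchSize) {
        // Guard mirrors the fix above: treat a zero partition count as 1.
        final long divisor = userPartitionCount == 0 ? 1 : userPartitionCount;
        double rawBucketsPerPartition = ((double) rowCount / divisor) / batchSize;
        return (long) Math.ceil(rawBucketsPerPartition);
    }

    public static void main(String[] args) {
        // ceil((1,000,000 / 4) / 10,000) = 25 buckets per partition.
        System.out.println(calculateBucketsPerPartition(1_000_000, 4, 10_000)); // 25
        // With a zero partition count the guard substitutes 1:
        // ceil((1,000,000 / 1) / 10,000) = 100.
        System.out.println(calculateBucketsPerPartition(1_000_000, 0, 10_000)); // 100
    }
}
```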
