From 67dd26466f7d4fb6ac4da203f4f4006f03d1ea2a Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Mon, 30 Jun 2025 16:12:04 +0200 Subject: [PATCH 01/54] Add XmlProcessor initial implementation --- docs/reference/enrich-processor/index.md | 3 + docs/reference/enrich-processor/toc.yml | 1 + .../enrich-processor/xml-processor.md | 281 ++++++++ .../src/main/java/module-info.java | 2 + .../ingest/common/IngestCommonPlugin.java | 3 +- .../ingest/common/XmlProcessor.java | 340 ++++++++++ .../common/XmlProcessorFactoryTests.java | 82 +++ .../ingest/common/XmlProcessorTests.java | 626 ++++++++++++++++++ 8 files changed, 1337 insertions(+), 1 deletion(-) create mode 100644 docs/reference/enrich-processor/xml-processor.md create mode 100644 modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java create mode 100644 modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java create mode 100644 modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java diff --git a/docs/reference/enrich-processor/index.md b/docs/reference/enrich-processor/index.md index e220e763024e3..fb2cac99ee355 100644 --- a/docs/reference/enrich-processor/index.md +++ b/docs/reference/enrich-processor/index.md @@ -159,6 +159,9 @@ Refer to [Enrich your data](docs-content://manage-data/ingest/transform-enrich/d [`split` processor](/reference/enrich-processor/split-processor.md) : Splits a field into an array of values. +[`xml` processor](/reference/enrich-processor/xml-processor.md) +: Parses XML documents and converts them to JSON objects. + [`trim` processor](/reference/enrich-processor/trim-processor.md) : Trims whitespace from field. 
diff --git a/docs/reference/enrich-processor/toc.yml b/docs/reference/enrich-processor/toc.yml index 7da271e6f0554..370c020f5f393 100644 --- a/docs/reference/enrich-processor/toc.yml +++ b/docs/reference/enrich-processor/toc.yml @@ -46,3 +46,4 @@ toc: - file: urldecode-processor.md - file: uri-parts-processor.md - file: user-agent-processor.md + - file: xml-processor.md diff --git a/docs/reference/enrich-processor/xml-processor.md b/docs/reference/enrich-processor/xml-processor.md new file mode 100644 index 0000000000000..0621a66d8edc6 --- /dev/null +++ b/docs/reference/enrich-processor/xml-processor.md @@ -0,0 +1,281 @@ +--- +navigation_title: "XML" +mapped_pages: + - https://www.elastic.co/guide/en/elasticsearch/reference/current/xml-processor.html +--- + +# XML processor [xml-processor] + + +Parses XML documents and converts them to JSON objects using a streaming XML parser. This processor efficiently handles XML data by avoiding loading the entire document into memory. + +$$$xml-options$$$ + +| Name | Required | Default | Description | +| --- | --- | --- | --- | +| `field` | yes | - | The field containing the XML string to be parsed. | +| `target_field` | no | `field` | The field that the converted structured object will be written into. Any existing content in this field will be overwritten. | +| `ignore_missing` | no | `false` | If `true` and `field` does not exist, the processor quietly exits without modifying the document. | +| `ignore_failure` | no | `false` | Ignore failures for the processor. See [Handling pipeline failures](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#handling-pipeline-failures). | +| `to_lower` | no | `false` | Convert XML element names to lowercase. | +| `ignore_empty_value` | no | `false` | If `true`, the processor will filter out null and empty values from the parsed XML structure, including empty elements, elements with null values, and elements with whitespace-only content. 
| +| `description` | no | - | Description of the processor. Useful for describing the purpose of the processor or its configuration. | +| `if` | no | - | Conditionally execute the processor. See [Conditionally run a processor](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#conditionally-run-processor). | +| `on_failure` | no | - | Handle failures for the processor. See [Handling pipeline failures](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#handling-pipeline-failures). | +| `tag` | no | - | Identifier for the processor. Useful for debugging and metrics. | + +## Configuration + +```js +{ + "xml": { + "field": "xml_field", + "target_field": "parsed_xml", + "ignore_empty_value": true + } +} +``` + +## Examples + +### Basic XML parsing + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content" + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "<catalog><book><author>William H. Gaddis</author><title>The Recognitions</title><review>One of the great seminal American novels.</review></book></catalog>" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + "_index": "_index", + "_id": "_id", + "_version": "-3", + "_source": { + "xml_content": "<catalog><book><author>William H. Gaddis</author><title>The Recognitions</title><review>One of the great seminal American novels.</review></book></catalog>", + "catalog": { + "book": { + "author": "William H. Gaddis", + "title": "The Recognitions", + "review": "One of the great seminal American novels." + } + } + }, + "_ingest": { + "timestamp": "2019-03-11T21:54:37.909224Z" + } + } + } + ] +} +``` + +### Filtering empty values + +When `ignore_empty_value` is set to `true`, the processor will remove empty elements from the parsed XML: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "target_field": "parsed_xml", + "ignore_empty_value": true + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "William H. 
GaddisOne of the great seminal American novels. Some content" + } + } + ] +} +``` + +Result with empty elements filtered out: + +```console-result +{ + "docs": [ + { + "doc": { + "_index": "_index", + "_id": "_id", + "_version": "-3", + "_source": { + "xml_content": "William H. GaddisOne of the great seminal American novels. Some content", + "parsed_xml": { + "catalog": { + "book": { + "author": "William H. Gaddis", + "review": "One of the great seminal American novels.", + "nested": { + "valid_content": "Some content" + } + } + } + } + }, + "_ingest": { + "timestamp": "2019-03-11T21:54:37.909224Z" + } + } + } + ] +} +``` + +### Converting element names to lowercase + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "to_lower": true + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "William H. GaddisThe Recognitions" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + "_index": "_index", + "_id": "_id", + "_version": "-3", + "_source": { + "xml_content": "William H. GaddisThe Recognitions", + "catalog": { + "book": { + "author": "William H. Gaddis", + "title": "The Recognitions" + } + } + }, + "_ingest": { + "timestamp": "2019-03-11T21:54:37.909224Z" + } + } + } + ] +} +``` + +### Handling XML attributes + +XML attributes are included as properties in the resulting JSON object alongside element content: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content" + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "<catalog version=\"1.0\"><book id=\"123\" isbn=\"978-0-684-80335-9\"><title lang=\"en\">The Recognitions</title><author nationality=\"American\">William H. Gaddis</author></book></catalog>" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + "_index": "_index", + "_id": "_id", + "_version": "-3", + "_source": { + "xml_content": "<catalog version=\"1.0\"><book id=\"123\" isbn=\"978-0-684-80335-9\"><title lang=\"en\">The Recognitions</title><author nationality=\"American\">William H. 
Gaddis</author></book></catalog>", + "catalog": { + "version": "1.0", + "book": { + "id": "123", + "isbn": "978-0-684-80335-9", + "title": { + "lang": "en", + "#text": "The Recognitions" + }, + "author": { + "nationality": "American", + "#text": "William H. Gaddis" + } + } + } + }, + "_ingest": { + "timestamp": "2019-03-11T21:54:37.909224Z" + } + } + } + ] +} +``` + +## XML features + +The XML processor supports: + +- **Elements with text content**: Converted to key-value pairs where the element name is the key and text content is the value +- **Nested elements**: Converted to nested JSON objects +- **Empty elements**: Converted to `null` values (can be filtered with `ignore_empty_value`) +- **Repeated elements**: Converted to arrays when multiple elements with the same name exist at the same level +- **XML attributes**: Included as properties in the JSON object alongside element content. When an element has both attributes and text content, the text is stored under a special `#text` key +- **Mixed content**: Elements with both text and child elements include text under a special `#text` key while attributes and child elements become object properties +- **Namespaces**: Local names are used, namespace prefixes are ignored diff --git a/modules/ingest-common/src/main/java/module-info.java b/modules/ingest-common/src/main/java/module-info.java index c3b3ab90892d9..ee98b7515e733 100644 --- a/modules/ingest-common/src/main/java/module-info.java +++ b/modules/ingest-common/src/main/java/module-info.java @@ -19,6 +19,8 @@ requires org.apache.logging.log4j; requires org.apache.lucene.analysis.common; requires org.jruby.joni; + + requires java.xml; exports org.elasticsearch.ingest.common; // for painless diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java index 6e517d644cadb..0dc06e74af3bc 100644 --- 
a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java @@ -74,7 +74,8 @@ public Map getProcessors(Processor.Parameters paramet entry(TrimProcessor.TYPE, new TrimProcessor.Factory()), entry(URLDecodeProcessor.TYPE, new URLDecodeProcessor.Factory()), entry(UppercaseProcessor.TYPE, new UppercaseProcessor.Factory()), - entry(UriPartsProcessor.TYPE, new UriPartsProcessor.Factory()) + entry(UriPartsProcessor.TYPE, new UriPartsProcessor.Factory()), + entry(XmlProcessor.TYPE, new XmlProcessor.Factory()) ); } diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java new file mode 100644 index 0000000000000..897d36d7084b7 --- /dev/null +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -0,0 +1,340 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.ingest.common; + +import org.elasticsearch.cluster.metadata.ProjectId; +import org.elasticsearch.ingest.AbstractProcessor; +import org.elasticsearch.ingest.ConfigurationUtils; +import org.elasticsearch.ingest.IngestDocument; +import org.elasticsearch.ingest.Processor; + +import java.io.StringReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +import javax.xml.stream.XMLInputFactory; +import javax.xml.stream.XMLStreamConstants; +import javax.xml.stream.XMLStreamException; +import javax.xml.stream.XMLStreamReader; + +/** + * Processor that parses XML documents and converts them to JSON objects using streaming XML parser. + * This implementation uses XMLStreamReader for efficient parsing and avoids loading the entire document in memory. + */ +public final class XmlProcessor extends AbstractProcessor { + + public static final String TYPE = "xml"; + + private final String field; + private final String targetField; + private final boolean ignoreMissing; + private final boolean ignoreFailure; + private final boolean toLower; + private final boolean ignoreEmptyValue; + + XmlProcessor( + String tag, + String description, + String field, + String targetField, + boolean ignoreMissing, + boolean ignoreFailure, + boolean toLower, + boolean ignoreEmptyValue + ) { + super(tag, description); + this.field = field; + this.targetField = targetField; + this.ignoreMissing = ignoreMissing; + this.ignoreFailure = ignoreFailure; + this.toLower = toLower; + this.ignoreEmptyValue = ignoreEmptyValue; + } + + public String getField() { + return field; + } + + public String getTargetField() { + return targetField; + } + + public boolean isIgnoreMissing() { + return ignoreMissing; + } + + public boolean isIgnoreEmptyValue() { + return ignoreEmptyValue; + } + + @Override + public IngestDocument execute(IngestDocument document) { + Object fieldValue = 
document.getFieldValue(field, Object.class, ignoreMissing); + + if (fieldValue == null && ignoreMissing) { + return document; + } else if (fieldValue == null) { + throw new IllegalArgumentException("field [" + field + "] is null, cannot parse XML"); + } + + if (fieldValue instanceof String == false) { + if (ignoreFailure) { + return document; + } + throw new IllegalArgumentException("field [" + field + "] is not a string, cannot parse XML"); + } + + String xmlString = (String) fieldValue; + try { + Object parsedXml = parseXml(xmlString.trim()); + if (ignoreEmptyValue) { + parsedXml = filterEmptyValues(parsedXml); + } + document.setFieldValue(targetField, parsedXml); + } catch (Exception e) { + if (ignoreFailure) { + return document; + } + throw new IllegalArgumentException("field [" + field + "] contains invalid XML: " + e.getMessage(), e); + } + + return document; + } + + @Override + public String getType() { + return TYPE; + } + + /** + * Recursively removes null and empty values from the parsed XML structure + * when ignoreEmptyValue is enabled. + */ + @SuppressWarnings("unchecked") + private Object filterEmptyValues(Object obj) { + if (obj == null) { + return null; + } + + if (obj instanceof Map) { + Map map = (Map) obj; + Map filtered = new HashMap<>(); + + for (Map.Entry entry : map.entrySet()) { + Object filteredValue = filterEmptyValues(entry.getValue()); + if (filteredValue != null && isEmptyValue(filteredValue) == false) { + filtered.put(entry.getKey(), filteredValue); + } + } + + return filtered.isEmpty() ? null : filtered; + } + + if (obj instanceof List) { + List list = (List) obj; + List filtered = new ArrayList<>(); + + for (Object item : list) { + Object filteredItem = filterEmptyValues(item); + if (filteredItem != null && isEmptyValue(filteredItem) == false) { + filtered.add(filteredItem); + } + } + + return filtered.isEmpty() ? null : filtered; + } + + return isEmptyValue(obj) ? 
null : obj; + } + + /** + * Determines if a value should be considered empty for filtering purposes. + */ + private boolean isEmptyValue(Object value) { + if (value == null) { + return true; + } + if (value instanceof String) { + return ((String) value).trim().isEmpty(); + } + if (value instanceof Map) { + return ((Map) value).isEmpty(); + } + if (value instanceof List) { + return ((List) value).isEmpty(); + } + return false; + } + + private Object parseXml(String xmlString) throws XMLStreamException { + if (xmlString == null || xmlString.trim().isEmpty()) { + return null; + } + + XMLInputFactory factory = XMLInputFactory.newInstance(); + factory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false); + factory.setProperty(XMLInputFactory.IS_COALESCING, true); + factory.setProperty(XMLInputFactory.SUPPORT_DTD, false); + factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false); + + try (StringReader reader = new StringReader(xmlString)) { + XMLStreamReader xmlReader = factory.createXMLStreamReader(reader); + + // Skip to the first element + while (xmlReader.hasNext() && xmlReader.getEventType() != XMLStreamConstants.START_ELEMENT) { + xmlReader.next(); + } + + if (xmlReader.hasNext() == false) { + return null; + } + + Object result = parseElement(xmlReader); + xmlReader.close(); + return result; + } + } + + private Object parseElement(XMLStreamReader reader) throws XMLStreamException { + if (reader.getEventType() != XMLStreamConstants.START_ELEMENT) { + return null; + } + + String elementName = reader.getLocalName(); + if (toLower) { + elementName = elementName.toLowerCase(Locale.ROOT); + } + + Map element = new HashMap<>(); + Map> repeatedElements = new HashMap<>(); + + // Parse attributes - they are available in START_ELEMENT state + int attributeCount = reader.getAttributeCount(); + boolean hasAttributes = attributeCount > 0; + for (int i = 0; i < attributeCount; i++) { + String attrName = reader.getAttributeLocalName(i); + String attrValue = 
reader.getAttributeValue(i); + if (toLower) { + attrName = attrName.toLowerCase(Locale.ROOT); + } + element.put(attrName, attrValue); + } + + StringBuilder textContent = new StringBuilder(); + + while (reader.hasNext()) { + int eventType = reader.next(); + + switch (eventType) { + case XMLStreamConstants.START_ELEMENT: + Object childElementResult = parseElement(reader); + String childName = reader.getLocalName(); + if (toLower) { + childName = childName.toLowerCase(Locale.ROOT); + } + + // Extract the actual content from the child element result + Object childContent = null; + if (childElementResult instanceof Map) { + @SuppressWarnings("unchecked") + Map childMap = (Map) childElementResult; + // The child element returns {elementName: content}, we want just the content + childContent = childMap.get(childName); + } else { + childContent = childElementResult; + } + + if (element.containsKey(childName) || repeatedElements.containsKey(childName)) { + // Handle repeated elements + if (repeatedElements.containsKey(childName) == false) { + List list = new ArrayList<>(); + list.add(element.get(childName)); + repeatedElements.put(childName, list); + element.remove(childName); + } + repeatedElements.get(childName).add(childContent); + } else { + element.put(childName, childContent); + } + break; + + case XMLStreamConstants.CHARACTERS: + String text = reader.getText(); + if (text != null && text.trim().isEmpty() == false) { + textContent.append(text); + } + break; + + case XMLStreamConstants.END_ELEMENT: + // Add repeated elements as arrays + for (Map.Entry> entry : repeatedElements.entrySet()) { + element.put(entry.getKey(), entry.getValue()); + } + + // Determine what to return + String trimmedText = textContent.toString().trim(); + boolean hasText = trimmedText.isEmpty() == false; + boolean hasChildren = element.size() > attributeCount; // Children beyond attributes + + Map result = new HashMap<>(); + if (hasText == false && hasChildren == false && hasAttributes == false) 
{ + // Empty element + result.put(elementName, null); + return result; + } else if (hasText && hasChildren == false) { + // Only text content (and possibly attributes) + if (hasAttributes) { + element.put("#text", trimmedText); + result.put(elementName, element); + return result; + } else { + result.put(elementName, trimmedText); + return result; + } + } else if (hasText == false && hasChildren) { + // Only child elements (and possibly attributes) + result.put(elementName, element); + return result; + } else { + // Both text and children (and possibly attributes) + element.put("#text", trimmedText); + result.put(elementName, element); + return result; + } + } + } + + return null; + } + + public static final class Factory implements Processor.Factory { + + @Override + public XmlProcessor create( + Map registry, + String processorTag, + String description, + Map config, + ProjectId projectId + ) throws Exception { + String field = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "field"); + String targetField = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "target_field", field); + boolean ignoreMissing = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_missing", false); + boolean ignoreFailure = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_failure", false); + boolean toLower = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "to_lower", false); + boolean ignoreEmptyValue = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_empty_value", false); + + return new XmlProcessor(processorTag, description, field, targetField, ignoreMissing, ignoreFailure, toLower, ignoreEmptyValue); + } + } +} diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java new file mode 100644 index 
0000000000000..ba1ba839a9edc --- /dev/null +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java @@ -0,0 +1,82 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.ingest.common; + +import org.elasticsearch.ElasticsearchParseException; +import org.elasticsearch.test.ESTestCase; + +import java.util.HashMap; +import java.util.Map; + +import static org.hamcrest.Matchers.equalTo; + +public class XmlProcessorFactoryTests extends ESTestCase { + + public void testCreate() throws Exception { + XmlProcessor.Factory factory = new XmlProcessor.Factory(); + Map config = new HashMap<>(); + config.put("field", "field1"); + config.put("target_field", "target"); + config.put("ignore_missing", true); + config.put("ignore_failure", true); + config.put("to_lower", true); + config.put("ignore_empty_value", true); + + String processorTag = randomAlphaOfLength(10); + XmlProcessor processor = factory.create(null, processorTag, null, config, null); + + assertThat(processor.getTag(), equalTo(processorTag)); + assertThat(processor.getField(), equalTo("field1")); + assertThat(processor.getTargetField(), equalTo("target")); + assertThat(processor.isIgnoreMissing(), equalTo(true)); + assertThat(processor.isIgnoreEmptyValue(), equalTo(true)); + } + + public void testCreateWithDefaults() throws Exception { + XmlProcessor.Factory factory = new XmlProcessor.Factory(); + Map config = new HashMap<>(); + config.put("field", "field1"); + + String processorTag = randomAlphaOfLength(10); + XmlProcessor 
processor = factory.create(null, processorTag, null, config, null); + + assertThat(processor.getTag(), equalTo(processorTag)); + assertThat(processor.getField(), equalTo("field1")); + assertThat(processor.getTargetField(), equalTo("field1")); + assertThat(processor.isIgnoreMissing(), equalTo(false)); + assertThat(processor.isIgnoreEmptyValue(), equalTo(false)); + } + + public void testCreateMissingField() throws Exception { + XmlProcessor.Factory factory = new XmlProcessor.Factory(); + Map config = new HashMap<>(); + + String processorTag = randomAlphaOfLength(10); + ElasticsearchParseException exception = expectThrows( + ElasticsearchParseException.class, + () -> factory.create(null, processorTag, null, config, null) + ); + assertThat(exception.getMessage(), equalTo("[field] required property is missing")); + } + + public void testCreateWithIgnoreEmptyValueOnly() throws Exception { + XmlProcessor.Factory factory = new XmlProcessor.Factory(); + Map config = new HashMap<>(); + config.put("field", "field1"); + config.put("ignore_empty_value", true); + + String processorTag = randomAlphaOfLength(10); + XmlProcessor processor = factory.create(null, processorTag, null, config, null); + + assertThat(processor.getField(), equalTo("field1")); + assertThat(processor.isIgnoreEmptyValue(), equalTo(true)); + assertThat(processor.isIgnoreMissing(), equalTo(false)); // other flags should remain default + } +} diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java new file mode 100644 index 0000000000000..0b1ee4123f128 --- /dev/null +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java @@ -0,0 +1,626 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.ingest.common; + +import org.elasticsearch.ingest.IngestDocument; +import org.elasticsearch.ingest.RandomDocumentPicks; +import org.elasticsearch.test.ESTestCase; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.equalTo; + +public class XmlProcessorTests extends ESTestCase { + + public void testSimpleXmlDecodeWithTargetField() throws Exception { + String xml = """ + + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + """; + + XmlProcessor processor = new XmlProcessor("tag", null, "message", "xml", false, false, false, false); + Map document = new HashMap<>(); + document.put("message", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map xmlField = (Map) ingestDocument.getFieldValue("xml", Object.class); + @SuppressWarnings("unchecked") + Map catalog = (Map) xmlField.get("catalog"); + @SuppressWarnings("unchecked") + Map book = (Map) catalog.get("book"); + + assertThat(book.get("seq"), equalTo("1")); + assertThat(book.get("author"), equalTo("William H. 
Gaddis")); + assertThat(book.get("title"), equalTo("The Recognitions")); + assertThat(book.get("review"), equalTo("One of the great seminal American novels of the 20th century.")); + + // Original field should remain unchanged + assertThat(ingestDocument.getFieldValue("message", String.class), equalTo(xml)); + } + + public void testXmlDecodeToSameFieldWhenTargetIsField() throws Exception { + String xml = """ + + + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + """; + + XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, false); + Map document = new HashMap<>(); + document.put("message", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); + @SuppressWarnings("unchecked") + Map catalog = (Map) messageField.get("catalog"); + @SuppressWarnings("unchecked") + Map book = (Map) catalog.get("book"); + + assertThat(book.get("seq"), equalTo("1")); + assertThat(book.get("author"), equalTo("William H. Gaddis")); + assertThat(book.get("title"), equalTo("The Recognitions")); + assertThat(book.get("review"), equalTo("One of the great seminal American novels of the 20th century.")); + } + + public void testXmlDecodeWithArray() throws Exception { + String xml = """ + + + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + + Ralls, Kim + Midnight Rain + Some review. 
+ + """; + + XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, false); + Map document = new HashMap<>(); + document.put("message", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); + @SuppressWarnings("unchecked") + Map catalog = (Map) messageField.get("catalog"); + @SuppressWarnings("unchecked") + List> books = (List>) catalog.get("book"); + + assertThat(books.size(), equalTo(2)); + + Map firstBook = books.get(0); + assertThat(firstBook.get("author"), equalTo("William H. Gaddis")); + assertThat(firstBook.get("title"), equalTo("The Recognitions")); + + Map secondBook = books.get(1); + assertThat(secondBook.get("author"), equalTo("Ralls, Kim")); + assertThat(secondBook.get("title"), equalTo("Midnight Rain")); + } + + public void testXmlDecodeWithToLower() throws Exception { + String xml = """ + + + + N/A + + + N/A + + + """; + + XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, true, false); + Map document = new HashMap<>(); + document.put("message", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); + @SuppressWarnings("unchecked") + Map auditbase = (Map) messageField.get("auditbase"); + @SuppressWarnings("unchecked") + Map contextcomponents = (Map) auditbase.get("contextcomponents"); + @SuppressWarnings("unchecked") + List> components = (List>) contextcomponents.get("component"); + + assertThat(components.size(), equalTo(2)); + assertThat(components.get(0).get("relyingparty"), equalTo("N/A")); + assertThat(components.get(1).get("primaryauth"), equalTo("N/A")); + } + + 
public void testXmlDecodeWithMultipleElements() throws Exception { + String xml = """ + + + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + + Ralls, Kim + Midnight Rain + Some review. + + + + Ralls, Kim + A former architect battles corporate zombies, + an evil sorceress, and her own childhood to become queen of the world. + + + + """; + + XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, false); + Map document = new HashMap<>(); + document.put("message", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); + @SuppressWarnings("unchecked") + Map catalog = (Map) messageField.get("catalog"); + + // Check books array + @SuppressWarnings("unchecked") + List> books = (List>) catalog.get("book"); + assertThat(books.size(), equalTo(2)); + + // Check secondcategory + @SuppressWarnings("unchecked") + Map secondcategory = (Map) catalog.get("secondcategory"); + @SuppressWarnings("unchecked") + Map paper = (Map) secondcategory.get("paper"); + assertThat(paper.get("id"), equalTo("bk102")); + assertThat(paper.get("test2"), equalTo("Ralls, Kim")); + } + + public void testXmlDecodeWithUtf16Encoding() throws Exception { + String xml = """ + + + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. 
+ + """; + + XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, false); + Map document = new HashMap<>(); + document.put("message", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); + @SuppressWarnings("unchecked") + Map catalog = (Map) messageField.get("catalog"); + @SuppressWarnings("unchecked") + Map book = (Map) catalog.get("book"); + + assertThat(book.get("author"), equalTo("William H. Gaddis")); + assertThat(book.get("title"), equalTo("The Recognitions")); + } + + public void testBrokenXmlWithIgnoreFailureFalse() { + String brokenXml = """ + + + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + catalog>"""; + + XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, false); + Map document = new HashMap<>(); + document.put("message", brokenXml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + Exception exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); + assertThat(exception.getMessage(), containsString("contains invalid XML")); + } + + public void testBrokenXmlWithIgnoreFailureTrue() throws Exception { + String brokenXml = """ + + + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. 
+ + catalog>"""; + + XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, true, false, false); + Map document = new HashMap<>(); + document.put("message", brokenXml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + // Should not throw exception and leave document unchanged + processor.execute(ingestDocument); + assertThat(ingestDocument.getFieldValue("message", String.class), equalTo(brokenXml)); + } + + public void testFieldNotFound() { + XmlProcessor processor = new XmlProcessor("tag", null, "nonexistent", "target", false, false, false, false); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>()); + + Exception exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); + assertThat(exception.getMessage(), containsString("not present as part of path [nonexistent]")); + } + + public void testFieldNotFoundWithIgnoreMissing() throws Exception { + XmlProcessor processor = new XmlProcessor("tag", null, "nonexistent", "target", true, false, false, false); + IngestDocument originalDocument = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>()); + IngestDocument ingestDocument = new IngestDocument(originalDocument); + + processor.execute(ingestDocument); + + // Document should remain unchanged + assertThat(ingestDocument.getSourceAndMetadata(), equalTo(originalDocument.getSourceAndMetadata())); + } + + public void testNullValue() { + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); + Map document = new HashMap<>(); + document.put("field", null); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + Exception exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); + assertThat(exception.getMessage(), containsString("field [field] is null")); + } + + 
public void testNullValueWithIgnoreMissing() throws Exception { + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", true, false, false, false); + Map document = new HashMap<>(); + document.put("field", null); + IngestDocument originalDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + IngestDocument ingestDocument = new IngestDocument(originalDocument); + + processor.execute(ingestDocument); + + // Document should remain unchanged + assertThat(ingestDocument.getSourceAndMetadata(), equalTo(originalDocument.getSourceAndMetadata())); + } + + public void testNonStringValueWithIgnoreFailureFalse() { + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); + Map document = new HashMap<>(); + document.put("field", 123); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + Exception exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); + assertThat(exception.getMessage(), containsString("field [field] is not a string")); + } + + public void testNonStringValueWithIgnoreFailureTrue() throws Exception { + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, true, false, false); + Map document = new HashMap<>(); + document.put("field", 123); + IngestDocument originalDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + IngestDocument ingestDocument = new IngestDocument(originalDocument); + + processor.execute(ingestDocument); + + // Document should remain unchanged + assertThat(ingestDocument.getSourceAndMetadata(), equalTo(originalDocument.getSourceAndMetadata())); + } + + public void testEmptyXml() throws Exception { + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); + Map document = new HashMap<>(); + document.put("field", ""); + IngestDocument ingestDocument = 
RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + // Empty XML should result in null target + assertThat(ingestDocument.getFieldValue("target", Object.class), equalTo(null)); + } + + public void testWhitespaceOnlyXml() throws Exception { + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); + Map document = new HashMap<>(); + document.put("field", " \n\t "); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + // Whitespace-only XML should result in null target + assertThat(ingestDocument.getFieldValue("target", Object.class), equalTo(null)); + } + + public void testSelfClosingTag() throws Exception { + String xml = ""; + + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); + Map document = new HashMap<>(); + document.put("field", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map result = (Map) ingestDocument.getFieldValue("target", Object.class); + assertThat(result.get("empty"), equalTo(null)); + } + + public void testSelfClosingTagWithAttributes() throws Exception { + String xml = ""; + + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); + Map document = new HashMap<>(); + document.put("field", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map result = (Map) ingestDocument.getFieldValue("target", Object.class); + @SuppressWarnings("unchecked") + Map empty = (Map) result.get("empty"); + assertThat(empty.get("id"), equalTo("123")); + assertThat(empty.get("name"), equalTo("test")); + } + + public void testGetType() 
{ + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); + assertThat(processor.getType(), equalTo("xml")); + } + + public void testGetters() { + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", true, true, true, false); + assertThat(processor.getField(), equalTo("field")); + assertThat(processor.getTargetField(), equalTo("target")); + assertThat(processor.isIgnoreMissing(), equalTo(true)); + } + + public void testIgnoreEmptyValueEnabled() throws Exception { + String xml = """ + + + William H. Gaddis + + One of the great seminal American novels. + + + + Some content + + + + """; + + XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, true); + Map document = new HashMap<>(); + document.put("message", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); + @SuppressWarnings("unchecked") + Map catalog = (Map) messageField.get("catalog"); + @SuppressWarnings("unchecked") + Map book = (Map) catalog.get("book"); + + // Empty title should be filtered out + assertThat(book.containsKey("title"), equalTo(false)); + // Empty element should be filtered out + assertThat(book.containsKey("empty"), equalTo(false)); + // empty_book should be filtered out entirely + assertThat(catalog.containsKey("empty_book"), equalTo(false)); + + // Valid content should remain + assertThat(book.get("author"), equalTo("William H. 
Gaddis")); + assertThat(book.get("review"), equalTo("One of the great seminal American novels.")); + + // Nested structure handling + @SuppressWarnings("unchecked") + Map nested = (Map) book.get("nested"); + assertThat(nested.containsKey("empty_text"), equalTo(false)); // Whitespace-only should be filtered + assertThat(nested.get("valid_content"), equalTo("Some content")); + } + + public void testIgnoreEmptyValueWithArrays() throws Exception { + String xml = """ + + + Valid Book + + + + + + Another Valid Book + + """; + + XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, true); + Map document = new HashMap<>(); + document.put("message", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); + @SuppressWarnings("unchecked") + Map catalog = (Map) messageField.get("catalog"); + @SuppressWarnings("unchecked") + List> books = (List>) catalog.get("book"); + + // Should have 2 books after filtering out the one with empty title + assertThat(books.size(), equalTo(2)); + assertThat(books.get(0).get("title"), equalTo("Valid Book")); + assertThat(books.get(1).get("title"), equalTo("Another Valid Book")); + } + + public void testIgnoreEmptyValueDisabled() throws Exception { + String xml = """ + + + William H. 
Gaddis + + + + """; + + XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, false); + Map document = new HashMap<>(); + document.put("message", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); + @SuppressWarnings("unchecked") + Map catalog = (Map) messageField.get("catalog"); + @SuppressWarnings("unchecked") + Map book = (Map) catalog.get("book"); + + // Empty values should remain when ignore_empty_value is false + assertThat(book.containsKey("title"), equalTo(true)); + assertThat(book.get("title"), equalTo(null)); // Empty elements are parsed as null + assertThat(book.containsKey("empty"), equalTo(true)); + assertThat(book.get("empty"), equalTo(null)); + assertThat(book.get("author"), equalTo("William H. Gaddis")); + } + + public void testGettersWithIgnoreEmptyValue() { + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", true, true, true, true); + assertThat(processor.getField(), equalTo("field")); + assertThat(processor.getTargetField(), equalTo("target")); + assertThat(processor.isIgnoreMissing(), equalTo(true)); + assertThat(processor.isIgnoreEmptyValue(), equalTo(true)); + } + + public void testElementsWithAttributesAndTextContent() throws Exception { + String xml = """ + + + The Recognitions + William H. 
Gaddis + 29.99 + + """; + + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); + Map document = new HashMap<>(); + document.put("field", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map result = (Map) ingestDocument.getFieldValue("target", Object.class); + @SuppressWarnings("unchecked") + Map catalog = (Map) result.get("catalog"); + @SuppressWarnings("unchecked") + Map book = (Map) catalog.get("book"); + @SuppressWarnings("unchecked") + Map title = (Map) book.get("title"); + @SuppressWarnings("unchecked") + Map author = (Map) book.get("author"); + @SuppressWarnings("unchecked") + Map price = (Map) book.get("price"); + + // Test catalog with attributes only + assertThat(catalog.get("version"), equalTo("1.0")); + + // Test book with attributes only (no text content) + assertThat(book.get("id"), equalTo("123")); + assertThat(book.get("isbn"), equalTo("978-0-684-80335-9")); + + // Test elements with both attributes and text content (should use #text key) + assertThat(title.get("lang"), equalTo("en")); + assertThat(title.get("#text"), equalTo("The Recognitions")); + + assertThat(author.get("nationality"), equalTo("American")); + assertThat(author.get("#text"), equalTo("William H. Gaddis")); + + assertThat(price.get("currency"), equalTo("USD")); + assertThat(price.get("#text"), equalTo("29.99")); + } + + public void testMixedAttributesAndTextWithToLower() throws Exception { + String xml = """ + + + The Recognitions + William H. 
Gaddis + + """; + + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, true, false); + Map document = new HashMap<>(); + document.put("field", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map result = (Map) ingestDocument.getFieldValue("target", Object.class); + @SuppressWarnings("unchecked") + Map catalog = (Map) result.get("catalog"); + @SuppressWarnings("unchecked") + Map book = (Map) catalog.get("book"); + @SuppressWarnings("unchecked") + Map title = (Map) book.get("title"); + @SuppressWarnings("unchecked") + Map author = (Map) book.get("author"); + + // Test that element names are converted to lowercase + assertThat(catalog.get("version"), equalTo("1.0")); + assertThat(book.get("id"), equalTo("123")); + + // Test that attribute names are converted to lowercase but values remain unchanged + assertThat(title.get("lang"), equalTo("EN")); + assertThat(title.get("#text"), equalTo("The Recognitions")); + + assertThat(author.get("nationality"), equalTo("AMERICAN")); + assertThat(author.get("#text"), equalTo("William H. 
Gaddis")); + } +} From 16e129ec0f2e300476423fba43dab895b97ed0da Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Mon, 30 Jun 2025 14:37:21 +0000 Subject: [PATCH 02/54] [CI] Auto commit changes from spotless --- modules/ingest-common/src/main/java/module-info.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ingest-common/src/main/java/module-info.java b/modules/ingest-common/src/main/java/module-info.java index ee98b7515e733..84e30519d2d1b 100644 --- a/modules/ingest-common/src/main/java/module-info.java +++ b/modules/ingest-common/src/main/java/module-info.java @@ -19,7 +19,7 @@ requires org.apache.logging.log4j; requires org.apache.lucene.analysis.common; requires org.jruby.joni; - + requires java.xml; exports org.elasticsearch.ingest.common; // for painless From 12f45606c2f0a86b646434037b9c13386ca66a31 Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Mon, 30 Jun 2025 18:04:49 +0200 Subject: [PATCH 03/54] Make factory static --- .../ingest/common/XmlProcessor.java | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 897d36d7084b7..f3f1cc6d0a6a8 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -34,6 +34,8 @@ public final class XmlProcessor extends AbstractProcessor { public static final String TYPE = "xml"; + + private static final XMLInputFactory XML_INPUT_FACTORY = createXmlInputFactory(); private final String field; private final String targetField; @@ -157,6 +159,19 @@ private Object filterEmptyValues(Object obj) { return isEmptyValue(obj) ? null : obj; } + /** + * Creates and configures a secure XMLInputFactory for XML parsing. 
+ * This factory is configured to prevent XXE attacks and optimize parsing. + */ + private static XMLInputFactory createXmlInputFactory() { + XMLInputFactory factory = XMLInputFactory.newInstance(); + factory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false); + factory.setProperty(XMLInputFactory.IS_COALESCING, true); + factory.setProperty(XMLInputFactory.SUPPORT_DTD, false); + factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false); + return factory; + } + /** * Determines if a value should be considered empty for filtering purposes. */ @@ -181,14 +196,8 @@ private Object parseXml(String xmlString) throws XMLStreamException { return null; } - XMLInputFactory factory = XMLInputFactory.newInstance(); - factory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false); - factory.setProperty(XMLInputFactory.IS_COALESCING, true); - factory.setProperty(XMLInputFactory.SUPPORT_DTD, false); - factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false); - try (StringReader reader = new StringReader(xmlString)) { - XMLStreamReader xmlReader = factory.createXMLStreamReader(reader); + XMLStreamReader xmlReader = XML_INPUT_FACTORY.createXMLStreamReader(reader); // Skip to the first element while (xmlReader.hasNext() && xmlReader.getEventType() != XMLStreamConstants.START_ELEMENT) { From 0a7205960a8838fc2f0a02b30146f2d5b99e5ad0 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Mon, 30 Jun 2025 16:23:35 +0000 Subject: [PATCH 04/54] [CI] Auto commit changes from spotless --- .../main/java/org/elasticsearch/ingest/common/XmlProcessor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index f3f1cc6d0a6a8..8122e88f458aa 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ 
b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -34,7 +34,7 @@ public final class XmlProcessor extends AbstractProcessor { public static final String TYPE = "xml"; - + private static final XMLInputFactory XML_INPUT_FACTORY = createXmlInputFactory(); private final String field; From 3a5689e472523820f5b3af5deb31f135a799a759 Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Tue, 1 Jul 2025 10:34:15 +0200 Subject: [PATCH 05/54] Update docs/changelog/130337.yaml --- docs/changelog/130337.yaml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 docs/changelog/130337.yaml diff --git a/docs/changelog/130337.yaml b/docs/changelog/130337.yaml new file mode 100644 index 0000000000000..2ea20ebd1944e --- /dev/null +++ b/docs/changelog/130337.yaml @@ -0,0 +1,6 @@ +pr: 130337 +summary: Add `XmlProcessor` initial implementation +area: Ingest Node +type: enhancement +issues: + - 97364 From 0ec3e67e81d1bd0e5285eec835fee4b8041e23eb Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Fri, 4 Jul 2025 15:56:28 +0200 Subject: [PATCH 06/54] feat: rewrite XML processor for Logstash feature parity - Replace XMLStreamReader with SAX parser + DOM for XPath support - Add XPath extraction, namespaces, strict parsing, content filtering - New options: force_array, force_content, remove_namespaces, store_xml - Enhanced security with XXE protection and pre-compiled XPath expressions - Full test coverage and updated documentation --- .../enrich-processor/xml-processor.md | 352 +++++- .../ingest/common/XmlProcessor.java | 958 ++++++++++++--- .../common/XmlProcessorFactoryTests.java | 376 +++++- .../ingest/common/XmlProcessorTests.java | 1094 +++++++++-------- 4 files changed, 1990 insertions(+), 790 deletions(-) diff --git a/docs/reference/enrich-processor/xml-processor.md b/docs/reference/enrich-processor/xml-processor.md index 0621a66d8edc6..7452fea12ed65 100644 --- a/docs/reference/enrich-processor/xml-processor.md +++ 
b/docs/reference/enrich-processor/xml-processor.md @@ -6,8 +6,7 @@ mapped_pages: # XML processor [xml-processor] - -Parses XML documents and converts them to JSON objects using a streaming XML parser. This processor efficiently handles XML data by avoiding loading the entire document into memory. +Parses XML documents and converts them to JSON objects using a DOM parser. This processor efficiently handles XML data with a single-parse architecture that supports both structured output and XPath extraction for optimal performance. $$$xml-options$$$ @@ -15,10 +14,17 @@ $$$xml-options$$$ | --- | --- | --- | --- | | `field` | yes | - | The field containing the XML string to be parsed. | | `target_field` | no | `field` | The field that the converted structured object will be written into. Any existing content in this field will be overwritten. | +| `store_xml` | no | `true` | If `true`, stores the parsed XML structure in the target field. If `false`, only XPath extraction results are stored. | | `ignore_missing` | no | `false` | If `true` and `field` does not exist, the processor quietly exits without modifying the document. | -| `ignore_failure` | no | `false` | Ignore failures for the processor. See [Handling pipeline failures](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#handling-pipeline-failures). | -| `to_lower` | no | `false` | Convert XML element names to lowercase. | +| `ignore_failure` | no | `false` | Ignore failures for the processor. When `true` and XML parsing fails, adds `_xmlparsefailure` tag to the document. See [Handling pipeline failures](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#handling-pipeline-failures). | +| `to_lower` | no | `false` | Convert XML element names and attribute names to lowercase. 
| | `ignore_empty_value` | no | `false` | If `true`, the processor will filter out null and empty values from the parsed XML structure, including empty elements, elements with null values, and elements with whitespace-only content. | +| `remove_namespaces` | no | `false` | If `true`, removes namespace prefixes from element and attribute names. | +| `force_content` | no | `false` | If `true`, forces text content and attributes to always parse to a hash value with `#text` key for content. | +| `force_array` | no | `false` | If `true`, forces all parsed values to be arrays. Single elements are wrapped in arrays. | +| `parse_options` | no | - | Controls XML parsing behavior. Set to `"strict"` for strict XML validation that fails fast on invalid content. | +| `xpath` | no | - | Map of XPath expressions to target field names. Extracts values from the XML using XPath and stores them in the specified fields. | +| `namespaces` | no | - | Map of namespace prefixes to URIs for use with XPath expressions. Required when XPath expressions contain namespace prefixes. | | `description` | no | - | Description of the processor. Useful for describing the purpose of the processor or its configuration. | | `if` | no | - | Conditionally execute the processor. See [Conditionally run a processor](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#conditionally-run-processor). | | `on_failure` | no | - | Handle failures for the processor. See [Handling pipeline failures](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#handling-pipeline-failures). | @@ -69,9 +75,7 @@ Result: "docs": [ { "doc": { - "_index": "_index", - "_id": "_id", - "_version": "-3", + ... "_source": { "xml_content": "William H. GaddisThe RecognitionsOne of the great seminal American novels.", "catalog": { @@ -81,9 +85,6 @@ Result: "review": "One of the great seminal American novels." 
} } - }, - "_ingest": { - "timestamp": "2019-03-11T21:54:37.909224Z" } } } @@ -126,9 +127,7 @@ Result with empty elements filtered out: "docs": [ { "doc": { - "_index": "_index", - "_id": "_id", - "_version": "-3", + ... "_source": { "xml_content": "William H. GaddisOne of the great seminal American novels. Some content", "parsed_xml": { @@ -142,9 +141,6 @@ Result with empty elements filtered out: } } } - }, - "_ingest": { - "timestamp": "2019-03-11T21:54:37.909224Z" } } } @@ -184,9 +180,7 @@ Result: "docs": [ { "doc": { - "_index": "_index", - "_id": "_id", - "_version": "-3", + ... "_source": { "xml_content": "William H. GaddisThe Recognitions", "catalog": { @@ -195,9 +189,6 @@ Result: "title": "The Recognitions" } } - }, - "_ingest": { - "timestamp": "2019-03-11T21:54:37.909224Z" } } } @@ -238,9 +229,7 @@ Result: "docs": [ { "doc": { - "_index": "_index", - "_id": "_id", - "_version": "-3", + ... "_source": { "xml_content": "The RecognitionsWilliam H. Gaddis", "catalog": { @@ -258,9 +247,314 @@ Result: } } } - }, - "_ingest": { - "timestamp": "2019-03-11T21:54:37.909224Z" + } + } + } + ] +} +``` + +### XPath extraction + +The XML processor can extract specific values using XPath expressions and store them in designated fields: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "store_xml": false, + "xpath": { + "//book/title/text()": "book_title", + "//book/author/text()": "book_author", + "//book/@id": "book_id" + } + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "The RecognitionsWilliam H. Gaddis1984George Orwell" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "The RecognitionsWilliam H. Gaddis1984George Orwell", + "book_title": ["The Recognitions", "1984"], + "book_author": ["William H. 
Gaddis", "George Orwell"], + "book_id": ["123", "456"] + } + } + } + ] +} +``` + +### XPath with namespaces + +When working with XML that uses namespaces, you need to configure namespace mappings: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "namespaces": { + "book": "http://example.com/book", + "author": "http://example.com/author" + }, + "xpath": { + "//book:catalog/book:item/book:title/text()": "titles", + "//author:info/@name": "author_names" + } + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "The Recognitions" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "The Recognitions", + "titles": "The Recognitions", + "author_names": "William H. Gaddis", + "book:catalog": { + "book:item": { + "book:title": "The Recognitions", + "author:info": { + "name": "William H. Gaddis" + } + } + } + } + } + } + ] +} +``` + +### Force array behavior + +When `force_array` is true, all parsed values become arrays: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "force_array": true + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "The Recognitions" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + ... 
+ "_source": { + "xml_content": "The Recognitions", + "catalog": [ + { + "book": [ + { + "title": ["The Recognitions"] + } + ] + } + ] + } + } + } + ] +} +``` + +### Strict parsing mode + +Use `parse_options: "strict"` for strict XML validation: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "parse_options": "strict", + "ignore_failure": true + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "Invalid XML with control character" + } + } + ] +} +``` + +Result (with parsing failure): + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "Invalid XML with control character", + "tags": ["_xmlparsefailure"] + } + } + } + ] +} +``` + +### Mixed content handling + +When XML contains mixed content (text interspersed with elements), text fragments are combined and stored under the special `#text` key: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content" + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "This text is bold and this is italic!" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "This text is bold and this is italic!", + "foo": { + "b": "bold", + "i": "italic", + "#text": "This text is and this is !" + } + } + } + } + ] +} +``` + +### Force content mode + +When `force_content` is `true`, all element text content is stored under the special `#text` key: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "force_content": true + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "The Recognitions" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "The Recognitions", + "book": { + "author": "William H. 
Gaddis", + "#text": "The Recognitions" + } } } } @@ -278,4 +572,4 @@ The XML processor supports: - **Repeated elements**: Converted to arrays when multiple elements with the same name exist at the same level - **XML attributes**: Included as properties in the JSON object alongside element content. When an element has both attributes and text content, the text is stored under a special `#text` key - **Mixed content**: Elements with both text and child elements include text under a special `#text` key while attributes and child elements become object properties -- **Namespaces**: Local names are used, namespace prefixes are ignored +- **Namespaces**: Namespace prefixes are preserved by default and can be used in XPath expressions with the `namespaces` configuration. Use `remove_namespaces: true` to strip namespace prefixes from element names diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 8122e88f458aa..582b8d9f3e787 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -15,27 +15,51 @@ import org.elasticsearch.ingest.IngestDocument; import org.elasticsearch.ingest.Processor; -import java.io.StringReader; import java.util.ArrayList; import java.util.HashMap; +import java.util.Iterator; import java.util.List; import java.util.Locale; import java.util.Map; -import javax.xml.stream.XMLInputFactory; -import javax.xml.stream.XMLStreamConstants; -import javax.xml.stream.XMLStreamException; -import javax.xml.stream.XMLStreamReader; +import javax.xml.namespace.NamespaceContext; +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.xpath.XPath; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathExpression; +import 
javax.xml.xpath.XPathExpressionException; +import javax.xml.xpath.XPathFactory; + +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; /** - * Processor that parses XML documents and converts them to JSON objects using streaming XML parser. - * This implementation uses XMLStreamReader for efficient parsing and avoids loading the entire document in memory. + * Processor that parses XML documents and converts them to JSON objects using a single-pass streaming approach. + * + * Features: + * - XML to JSON conversion with configurable structure options + * - XPath extraction with namespace support + * - Configurable options: force_array, force_content, remove_namespaces, to_lower + * - Strict parsing mode for XML validation + * - Empty value filtering with ignore_empty_value option + * - Logstash-compatible error handling and behavior */ public final class XmlProcessor extends AbstractProcessor { public static final String TYPE = "xml"; - - private static final XMLInputFactory XML_INPUT_FACTORY = createXmlInputFactory(); + + private static final XPathFactory XPATH_FACTORY = XPathFactory.newInstance(); + + // Pre-configured SAX parser factories for secure XML parsing + private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY = createSecureSaxParserFactory(); + private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY_NS = createSecureSaxParserFactoryNamespaceAware(); + private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY_STRICT = createSecureSaxParserFactoryStrict(); + private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY_NS_STRICT = createSecureSaxParserFactoryNamespaceAwareStrict(); + + // Pre-configured document builder factory for DOM creation + private static final DocumentBuilderFactory DOM_FACTORY = createSecureDocumentBuilderFactory(); private final String field; private final String targetField; @@ -43,6 +67,14 @@ public final class XmlProcessor 
extends AbstractProcessor { private final boolean ignoreFailure; private final boolean toLower; private final boolean ignoreEmptyValue; + private final boolean storeXml; + private final boolean removeNamespaces; + private final boolean forceContent; + private final boolean forceArray; + private final Map xpathExpressions; + private final Map namespaces; + private final Map compiledXPathExpressions; + private final String parseOptions; XmlProcessor( String tag, @@ -52,7 +84,14 @@ public final class XmlProcessor extends AbstractProcessor { boolean ignoreMissing, boolean ignoreFailure, boolean toLower, - boolean ignoreEmptyValue + boolean ignoreEmptyValue, + boolean storeXml, + boolean removeNamespaces, + boolean forceContent, + boolean forceArray, + Map xpathExpressions, + Map namespaces, + String parseOptions ) { super(tag, description); this.field = field; @@ -61,6 +100,14 @@ public final class XmlProcessor extends AbstractProcessor { this.ignoreFailure = ignoreFailure; this.toLower = toLower; this.ignoreEmptyValue = ignoreEmptyValue; + this.storeXml = storeXml; + this.removeNamespaces = removeNamespaces; + this.forceContent = forceContent; + this.forceArray = forceArray; + this.xpathExpressions = xpathExpressions != null ? Map.copyOf(xpathExpressions) : Map.of(); + this.namespaces = namespaces != null ? Map.copyOf(namespaces) : Map.of(); + this.compiledXPathExpressions = compileXPathExpressions(this.xpathExpressions, this.namespaces); + this.parseOptions = parseOptions != null ? 
parseOptions : ""; } public String getField() { @@ -79,13 +126,46 @@ public boolean isIgnoreEmptyValue() { return ignoreEmptyValue; } + public boolean isStoreXml() { + return storeXml; + } + + public boolean isRemoveNamespaces() { + return removeNamespaces; + } + + public boolean isForceContent() { + return forceContent; + } + + public boolean isStrict() { + return "strict".equals(parseOptions); + } + + public boolean isForceArray() { + return forceArray; + } + + public boolean hasNamespaces() { + return namespaces.isEmpty() == false; + } + + public Map getNamespaces() { + return namespaces; + } + + public String getParseOptions() { + return parseOptions; + } + @Override public IngestDocument execute(IngestDocument document) { Object fieldValue = document.getFieldValue(field, Object.class, ignoreMissing); - if (fieldValue == null && ignoreMissing) { - return document; - } else if (fieldValue == null) { + if (fieldValue == null) { + if (ignoreMissing || ignoreFailure) { + return document; + } throw new IllegalArgumentException("field [" + field + "] is null, cannot parse XML"); } @@ -98,13 +178,14 @@ public IngestDocument execute(IngestDocument document) { String xmlString = (String) fieldValue; try { - Object parsedXml = parseXml(xmlString.trim()); - if (ignoreEmptyValue) { - parsedXml = filterEmptyValues(parsedXml); + // Always use streaming parser for optimal performance and memory usage + if (storeXml || xpathExpressions.isEmpty() == false) { + parseXmlAndXPath(document, xmlString.trim()); } - document.setFieldValue(targetField, parsedXml); } catch (Exception e) { if (ignoreFailure) { + // Add failure tag similar to Logstash behavior + document.appendFieldValue("tags", "_xmlparsefailure"); return document; } throw new IllegalArgumentException("field [" + field + "] contains invalid XML: " + e.getMessage(), e); @@ -118,62 +199,18 @@ public String getType() { return TYPE; } - /** - * Recursively removes null and empty values from the parsed XML structure - * when 
ignoreEmptyValue is enabled. - */ - @SuppressWarnings("unchecked") - private Object filterEmptyValues(Object obj) { - if (obj == null) { - return null; - } - - if (obj instanceof Map) { - Map map = (Map) obj; - Map filtered = new HashMap<>(); - - for (Map.Entry entry : map.entrySet()) { - Object filteredValue = filterEmptyValues(entry.getValue()); - if (filteredValue != null && isEmptyValue(filteredValue) == false) { - filtered.put(entry.getKey(), filteredValue); - } - } - - return filtered.isEmpty() ? null : filtered; - } - - if (obj instanceof List) { - List list = (List) obj; - List filtered = new ArrayList<>(); - - for (Object item : list) { - Object filteredItem = filterEmptyValues(item); - if (filteredItem != null && isEmptyValue(filteredItem) == false) { - filtered.add(filteredItem); - } - } - - return filtered.isEmpty() ? null : filtered; - } - - return isEmptyValue(obj) ? null : obj; - } - - /** - * Creates and configures a secure XMLInputFactory for XML parsing. - * This factory is configured to prevent XXE attacks and optimize parsing. - */ - private static XMLInputFactory createXmlInputFactory() { - XMLInputFactory factory = XMLInputFactory.newInstance(); - factory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false); - factory.setProperty(XMLInputFactory.IS_COALESCING, true); - factory.setProperty(XMLInputFactory.SUPPORT_DTD, false); - factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false); - return factory; - } - /** * Determines if a value should be considered empty for filtering purposes. + * Used by the ignore_empty_value feature to filter out empty content. 
+ * + * Considers empty: + * - null values + * - empty or whitespace-only strings + * - empty Maps + * - empty Lists + * + * @param value the value to check + * @return true if the value should be considered empty */ private boolean isEmptyValue(Object value) { if (value == null) { @@ -191,139 +228,187 @@ private boolean isEmptyValue(Object value) { return false; } - private Object parseXml(String xmlString) throws XMLStreamException { - if (xmlString == null || xmlString.trim().isEmpty()) { + /** + * Extract the text value from a DOM node for XPath result processing. + * Handles different node types appropriately: + * - TEXT_NODE and CDATA_SECTION_NODE: returns node value directly + * - ATTRIBUTE_NODE: returns attribute value + * - ELEMENT_NODE: returns text content (concatenated text of all descendants) + * - Other node types: returns text content as fallback + * + * @param node the DOM node to extract text from + * @return the text content of the node, or null if node is null + */ + private String getNodeValue(Node node) { + if (node == null) { return null; } - - try (StringReader reader = new StringReader(xmlString)) { - XMLStreamReader xmlReader = XML_INPUT_FACTORY.createXMLStreamReader(reader); - - // Skip to the first element - while (xmlReader.hasNext() && xmlReader.getEventType() != XMLStreamConstants.START_ELEMENT) { - xmlReader.next(); - } - - if (xmlReader.hasNext() == false) { - return null; - } - - Object result = parseElement(xmlReader); - xmlReader.close(); - return result; + + switch (node.getNodeType()) { + case Node.ATTRIBUTE_NODE: + case Node.CDATA_SECTION_NODE: + case Node.TEXT_NODE: + return node.getNodeValue(); + case Node.ELEMENT_NODE: + default: + return node.getTextContent(); } } - private Object parseElement(XMLStreamReader reader) throws XMLStreamException { - if (reader.getEventType() != XMLStreamConstants.START_ELEMENT) { - return null; - } - - String elementName = reader.getLocalName(); - if (toLower) { - elementName = 
elementName.toLowerCase(Locale.ROOT); - } - - Map element = new HashMap<>(); - Map> repeatedElements = new HashMap<>(); - - // Parse attributes - they are available in START_ELEMENT state - int attributeCount = reader.getAttributeCount(); - boolean hasAttributes = attributeCount > 0; - for (int i = 0; i < attributeCount; i++) { - String attrName = reader.getAttributeLocalName(i); - String attrValue = reader.getAttributeValue(i); - if (toLower) { - attrName = attrName.toLowerCase(Locale.ROOT); - } - element.put(attrName, attrValue); + /** + * Applies force_array logic to ensure all fields are arrays when enabled. + * + * Behavior: + * - If force_array is false: returns content unchanged + * - If force_array is true and content is already a List: returns content unchanged + * - If force_array is true and content is not a List: wraps content in a new ArrayList + * - Handles null content appropriately (wraps null in array if force_array is true) + * + * @param elementName the name of the element (for context, not used in current implementation) + * @param content the content to potentially wrap in an array + * @return the content, optionally wrapped in an array based on force_array setting + */ + private Object applyForceArray(String elementName, Object content) { + if (forceArray && !(content instanceof List)) { + List arrayContent = new ArrayList<>(); + arrayContent.add(content); // Add content even if it's null (for empty elements) + return arrayContent; } + return content; + } - StringBuilder textContent = new StringBuilder(); - - while (reader.hasNext()) { - int eventType = reader.next(); - - switch (eventType) { - case XMLStreamConstants.START_ELEMENT: - Object childElementResult = parseElement(reader); - String childName = reader.getLocalName(); - if (toLower) { - childName = childName.toLowerCase(Locale.ROOT); - } - - // Extract the actual content from the child element result - Object childContent = null; - if (childElementResult instanceof Map) { - 
@SuppressWarnings("unchecked") - Map childMap = (Map) childElementResult; - // The child element returns {elementName: content}, we want just the content - childContent = childMap.get(childName); - } else { - childContent = childElementResult; + /** + * Evaluates precompiled XPath expressions against a DOM document and adds results to the ingest document. + * + * Features: + * - Uses precompiled XPath expressions for optimal performance + * - Extracts text values from matched nodes (elements, attributes, text nodes) + * - Single matches stored as strings, multiple matches as string arrays + * - Respects ignoreFailure setting for XPath evaluation errors + * + * @param document the ingest document to add XPath results to + * @param doc the DOM document to evaluate XPath expressions against + * @throws Exception if XPath processing fails and ignoreFailure is false + */ + private void processXPathExpressionsFromDom(IngestDocument document, Document doc) throws Exception { + // Use precompiled XPath expressions for optimal performance + for (Map.Entry entry : compiledXPathExpressions.entrySet()) { + String targetFieldName = entry.getKey(); + XPathExpression compiledExpression = entry.getValue(); + + try { + Object result = compiledExpression.evaluate(doc, XPathConstants.NODESET); + + if (result instanceof NodeList) { + NodeList nodeList = (NodeList) result; + List values = new ArrayList<>(); + + for (int i = 0; i < nodeList.getLength(); i++) { + Node node = nodeList.item(i); + String value = getNodeValue(node); + if (value != null && value.trim().isEmpty() == false) { + values.add(value); + } } - - if (element.containsKey(childName) || repeatedElements.containsKey(childName)) { - // Handle repeated elements - if (repeatedElements.containsKey(childName) == false) { - List list = new ArrayList<>(); - list.add(element.get(childName)); - repeatedElements.put(childName, list); - element.remove(childName); + + if (values.isEmpty() == false) { + if (values.size() == 1) { + 
document.setFieldValue(targetFieldName, values.get(0)); + } else { + document.setFieldValue(targetFieldName, values); } - repeatedElements.get(childName).add(childContent); - } else { - element.put(childName, childContent); } - break; + } + } catch (XPathExpressionException e) { + if (ignoreFailure == false) { + throw new IllegalArgumentException("XPath evaluation failed for target field [" + targetFieldName + "]: " + e.getMessage(), e); + } + } + } + } - case XMLStreamConstants.CHARACTERS: - String text = reader.getText(); - if (text != null && text.trim().isEmpty() == false) { - textContent.append(text); + /** + * Compiles XPath expressions at processor creation time for optimal runtime performance. + * This method pre-compiles all configured XPath expressions with appropriate namespace context, + * eliminating the compilation overhead during document processing. + * + * @param xpathExpressions map of XPath expressions to target field names + * @param namespaces map of namespace prefixes to URIs + * @return map of compiled XPath expressions keyed by target field name + * @throws IllegalArgumentException if XPath compilation fails or namespace validation fails + */ + private static Map compileXPathExpressions( + Map xpathExpressions, + Map namespaces + ) { + if (xpathExpressions.isEmpty()) { + return Map.of(); + } + + Map compiled = new HashMap<>(); + XPath xpath = XPATH_FACTORY.newXPath(); + + // Set namespace context if namespaces are defined + boolean hasNamespaces = namespaces.isEmpty() == false; + if (hasNamespaces) { + xpath.setNamespaceContext(new NamespaceContext() { + @Override + public String getNamespaceURI(String prefix) { + if (prefix == null) { + throw new IllegalArgumentException("Prefix cannot be null"); } - break; + return namespaces.getOrDefault(prefix, ""); + } - case XMLStreamConstants.END_ELEMENT: - // Add repeated elements as arrays - for (Map.Entry> entry : repeatedElements.entrySet()) { - element.put(entry.getKey(), entry.getValue()); + 
@Override + public String getPrefix(String namespaceURI) { + for (Map.Entry entry : namespaces.entrySet()) { + if (entry.getValue().equals(namespaceURI)) { + return entry.getKey(); + } } + return null; + } - // Determine what to return - String trimmedText = textContent.toString().trim(); - boolean hasText = trimmedText.isEmpty() == false; - boolean hasChildren = element.size() > attributeCount; // Children beyond attributes - - Map result = new HashMap<>(); - if (hasText == false && hasChildren == false && hasAttributes == false) { - // Empty element - result.put(elementName, null); - return result; - } else if (hasText && hasChildren == false) { - // Only text content (and possibly attributes) - if (hasAttributes) { - element.put("#text", trimmedText); - result.put(elementName, element); - return result; - } else { - result.put(elementName, trimmedText); - return result; + @Override + public Iterator getPrefixes(String namespaceURI) { + List prefixes = new ArrayList<>(); + for (Map.Entry entry : namespaces.entrySet()) { + if (entry.getValue().equals(namespaceURI)) { + prefixes.add(entry.getKey()); } - } else if (hasText == false && hasChildren) { - // Only child elements (and possibly attributes) - result.put(elementName, element); - return result; - } else { - // Both text and children (and possibly attributes) - element.put("#text", trimmedText); - result.put(elementName, element); - return result; } + return prefixes.iterator(); + } + }); + } + + // Pre-compiled pattern to detect namespace prefixes + java.util.regex.Pattern namespacePattern = + java.util.regex.Pattern.compile(".*\\b[a-zA-Z][a-zA-Z0-9_-]*:[a-zA-Z][a-zA-Z0-9_-]*.*"); + + for (Map.Entry entry : xpathExpressions.entrySet()) { + String xpathExpression = entry.getKey(); + String targetFieldName = entry.getValue(); + + // Validate namespace prefixes if no namespaces are configured + if (!hasNamespaces && namespacePattern.matcher(xpathExpression).matches()) { + throw new IllegalArgumentException( + 
"Invalid XPath expression [" + xpathExpression + "]: contains namespace prefixes but no namespace configuration provided" + ); + } + + try { + XPathExpression compiledExpression = xpath.compile(xpathExpression); + compiled.put(targetFieldName, compiledExpression); + } catch (XPathExpressionException e) { + throw new IllegalArgumentException( + "Invalid XPath expression [" + xpathExpression + "]: " + e.getMessage(), e + ); } } - - return null; + + return Map.copyOf(compiled); } public static final class Factory implements Processor.Factory { @@ -342,8 +427,509 @@ public XmlProcessor create( boolean ignoreFailure = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_failure", false); boolean toLower = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "to_lower", false); boolean ignoreEmptyValue = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_empty_value", false); + boolean storeXml = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "store_xml", true); + boolean removeNamespaces = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "remove_namespaces", false); + boolean forceContent = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "force_content", false); + boolean forceArray = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "force_array", false); + + // Parse XPath expressions map + Map xpathExpressions = new HashMap<>(); + Object xpathConfig = config.get("xpath"); + if (xpathConfig != null) { + if (xpathConfig instanceof Map) { + @SuppressWarnings("unchecked") + Map xpathMap = (Map) xpathConfig; + for (Map.Entry entry : xpathMap.entrySet()) { + if (entry.getValue() instanceof String) { + xpathExpressions.put(entry.getKey(), (String) entry.getValue()); + } else { + throw new IllegalArgumentException( + "XPath target field [" + entry.getKey() + "] must be a string, got [" + entry.getValue().getClass().getSimpleName() + "]" + ); + 
} + } + } else { + throw new IllegalArgumentException("XPath configuration must be a map of expressions to target fields"); + } + } + + // Parse namespaces map + Map namespaces = new HashMap<>(); + Object namespaceConfig = config.get("namespaces"); + if (namespaceConfig != null) { + if (namespaceConfig instanceof Map) { + @SuppressWarnings("unchecked") + Map namespaceMap = (Map) namespaceConfig; + for (Map.Entry entry : namespaceMap.entrySet()) { + if (entry.getValue() instanceof String) { + namespaces.put(entry.getKey(), (String) entry.getValue()); + } else { + throw new IllegalArgumentException( + "Namespace prefix [" + entry.getKey() + "] must have a string URI, got [" + entry.getValue().getClass().getSimpleName() + "]" + ); + } + } + } else { + throw new IllegalArgumentException("Namespaces configuration must be a map of prefixes to URIs"); + } + } + + // Parse parse_options parameter + String parseOptions = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "parse_options", ""); + if (parseOptions != null && parseOptions != "" && !"strict".equals(parseOptions)) { + throw new IllegalArgumentException("Invalid parse_options [" + parseOptions + "]. Only 'strict' is supported."); + } - return new XmlProcessor(processorTag, description, field, targetField, ignoreMissing, ignoreFailure, toLower, ignoreEmptyValue); + return new XmlProcessor(processorTag, description, field, targetField, ignoreMissing, ignoreFailure, toLower, ignoreEmptyValue, storeXml, removeNamespaces, forceContent, forceArray, xpathExpressions, namespaces, parseOptions); + } + } + + /** + * Main XML parsing method that converts XML to JSON and optionally extracts XPath values. + * Uses streaming SAX parser with optional DOM building for XPath processing. 
+ * + * @param document the ingest document to modify with parsed results + * @param xmlString the XML string to parse (should be trimmed) + * @throws Exception if XML parsing fails + */ + private void parseXmlAndXPath(IngestDocument document, String xmlString) throws Exception { + if (xmlString == null || xmlString.trim().isEmpty()) { + return; + } + + // Determine if we need DOM for XPath processing + boolean needsDom = xpathExpressions.isEmpty() == false; + + // Use the appropriate pre-configured SAX parser factory + javax.xml.parsers.SAXParserFactory factory = selectSaxParserFactory(); + + javax.xml.parsers.SAXParser parser = factory.newSAXParser(); + + // Configure error handler for strict mode + if (isStrict()) { + parser.getXMLReader().setErrorHandler(new org.xml.sax.ErrorHandler() { + @Override + public void warning(org.xml.sax.SAXParseException exception) throws org.xml.sax.SAXException { + throw exception; + } + + @Override + public void error(org.xml.sax.SAXParseException exception) throws org.xml.sax.SAXException { + throw exception; + } + + @Override + public void fatalError(org.xml.sax.SAXParseException exception) throws org.xml.sax.SAXException { + throw exception; + } + }); + } + + // Use enhanced handler that can build DOM during streaming when needed + XmlStreamingWithDomHandler handler = new XmlStreamingWithDomHandler(needsDom); + + parser.parse(new java.io.ByteArrayInputStream(xmlString.getBytes("UTF-8")), handler); + + // Store structured result if needed + if (storeXml) { + Object streamingResult = handler.getStructuredResult(); + if (streamingResult != null) { + document.setFieldValue(targetField, streamingResult); + } + } + + // Process XPath expressions if DOM was built during streaming + if (needsDom) { + Document domDocument = handler.getDomDocument(); + if (domDocument != null) { + processXPathExpressionsFromDom(document, domDocument); + } + } + } + + /** + * SAX ContentHandler that builds structured JSON output and optionally constructs 
a DOM tree during parsing. + * Handles XML-to-JSON conversion with support for all processor configuration options. + */ + private class XmlStreamingWithDomHandler extends org.xml.sax.helpers.DefaultHandler { + // Streaming parser state (for structured output) + private final java.util.Deque> elementStack = new java.util.ArrayDeque<>(); + private final java.util.Deque elementNameStack = new java.util.ArrayDeque<>(); + private final java.util.Deque textStack = new java.util.ArrayDeque<>(); + private final java.util.Deque>> repeatedElementsStack = new java.util.ArrayDeque<>(); + private Object rootResult = null; + + // DOM building state (for XPath processing when needed) + private final boolean buildDom; + private Document domDocument = null; + private final java.util.Deque domElementStack = new java.util.ArrayDeque<>(); + + public XmlStreamingWithDomHandler(boolean buildDom) { + this.buildDom = buildDom; + } + + @Override + public void startDocument() throws org.xml.sax.SAXException { + // Initialize DOM document if needed + if (buildDom) { + try { + // Use pre-configured secure DOM factory + // Since we build DOM programmatically (createElementNS/createElement), + // the factory's namespace awareness doesn't affect our usage + DocumentBuilder builder = DOM_FACTORY.newDocumentBuilder(); + domDocument = builder.newDocument(); + } catch (Exception e) { + throw new org.xml.sax.SAXException("Failed to create DOM document", e); + } + } + } + + @Override + public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes attributes) throws org.xml.sax.SAXException { + String elementName = getElementName(uri, localName, qName); + + // Build structured representation (always) + Map element = new HashMap<>(); + Map> repeatedElements = new HashMap<>(); + + // Process attributes for structured output + for (int i = 0; i < attributes.getLength(); i++) { + String attrName = getAttributeName(attributes.getURI(i), attributes.getLocalName(i), 
attributes.getQName(i)); + String attrValue = attributes.getValue(i); + + // Apply ignoreEmptyValue filtering to attributes + if (ignoreEmptyValue == false || isEmptyValue(attrValue) == false) { + element.put(attrName, attrValue); + } + } + + elementStack.push(element); + elementNameStack.push(elementName); + textStack.push(new StringBuilder()); + repeatedElementsStack.push(repeatedElements); + + // Build DOM element simultaneously if needed + if (buildDom && domDocument != null) { + org.w3c.dom.Element domElement; + if (uri != null && !uri.isEmpty() && !removeNamespaces) { + domElement = domDocument.createElementNS(uri, qName); + } else { + domElement = domDocument.createElement(removeNamespaces ? localName : qName); + } + + // Add attributes to DOM element + for (int i = 0; i < attributes.getLength(); i++) { + String attrUri = attributes.getURI(i); + String attrLocalName = attributes.getLocalName(i); + String attrQName = attributes.getQName(i); + String attrValue = attributes.getValue(i); + + if (attrUri != null && !attrUri.isEmpty() && !removeNamespaces) { + domElement.setAttributeNS(attrUri, attrQName, attrValue); + } else { + domElement.setAttribute(removeNamespaces ? 
attrLocalName : attrQName, attrValue); + } + } + + // Add to parent or root + if (domElementStack.isEmpty()) { + domDocument.appendChild(domElement); + } else { + domElementStack.peek().appendChild(domElement); + } + + domElementStack.push(domElement); + } + } + + @Override + public void characters(char[] ch, int start, int length) throws org.xml.sax.SAXException { + // Add to structured output text accumulator + if (!textStack.isEmpty()) { + textStack.peek().append(ch, start, length); + } + + // Add to DOM text node if needed + if (buildDom && !domElementStack.isEmpty()) { + String text = new String(ch, start, length); + if (!text.trim().isEmpty() || !ignoreEmptyValue) { + org.w3c.dom.Text textNode = domDocument.createTextNode(text); + domElementStack.peek().appendChild(textNode); + } + } + } + + @Override + public void endElement(String uri, String localName, String qName) throws org.xml.sax.SAXException { + // Complete structured output element processing + if (elementStack.isEmpty()) { + return; + } + + Map element = elementStack.pop(); + String elementName = elementNameStack.pop(); + StringBuilder textContent = textStack.pop(); + Map> repeatedElements = repeatedElementsStack.pop(); + + // Add repeated elements as arrays + for (Map.Entry> entry : repeatedElements.entrySet()) { + List values = entry.getValue(); + if (ignoreEmptyValue == false || values.isEmpty() == false) { + element.put(entry.getKey(), values); + } + } + + // Process text content and determine final element structure + String trimmedText = textContent.toString().trim(); + boolean hasText = trimmedText.isEmpty() == false; + boolean hasChildren = element.size() > 0; + + Object elementValue; + if (hasText == false && hasChildren == false) { + // Empty element + if (ignoreEmptyValue == false) { + elementValue = applyForceArray(elementName, null); + } else { + elementValue = null; + } + } else if (hasText && hasChildren == false) { + // Only text content + if (forceContent) { + Map contentMap = new 
HashMap<>(); + if (ignoreEmptyValue == false || isEmptyValue(trimmedText) == false) { + contentMap.put("#text", trimmedText); + } + elementValue = contentMap; + } else { + if (ignoreEmptyValue && isEmptyValue(trimmedText)) { + elementValue = null; + } else { + elementValue = trimmedText; + } + } + elementValue = applyForceArray(elementName, elementValue); + } else if (hasText == false && hasChildren) { + // Only child elements/attributes + elementValue = (forceArray && forceContent) ? applyForceArray(elementName, element) : element; + } else { + // Both text and children/attributes + if (ignoreEmptyValue == false || isEmptyValue(trimmedText) == false) { + element.put("#text", trimmedText); + } + elementValue = (forceArray && forceContent) ? applyForceArray(elementName, element) : element; + } + + // If this is the root element, store the result + if (elementStack.isEmpty()) { + if (elementValue != null) { + Map result = new HashMap<>(); + result.put(elementName, elementValue); + rootResult = result; + } + } else { + // Add to parent element + if (elementValue != null) { + Map parentElement = elementStack.peek(); + Map> parentRepeatedElements = repeatedElementsStack.peek(); + + if (parentElement.containsKey(elementName) || parentRepeatedElements.containsKey(elementName)) { + // Handle repeated elements + if (parentRepeatedElements.containsKey(elementName) == false) { + List list = new ArrayList<>(); + list.add(parentElement.get(elementName)); + parentRepeatedElements.put(elementName, list); + parentElement.remove(elementName); + } + parentRepeatedElements.get(elementName).add(elementValue); + } else { + // Apply force_array logic for single elements + Object finalContent = applyForceArray(elementName, elementValue); + parentElement.put(elementName, finalContent); + } + } + } + + // Complete DOM element if building DOM + if (buildDom && !domElementStack.isEmpty()) { + domElementStack.pop(); + } + } + + @Override + public void endDocument() throws 
org.xml.sax.SAXException { + // Document parsing complete + } + + public Object getStructuredResult() { + return rootResult; + } + + public Document getDomDocument() { + return domDocument; + } + + private String getElementName(String uri, String localName, String qName) { + String elementName; + if (removeNamespaces) { + elementName = localName != null && !localName.isEmpty() ? localName : qName; + } else { + elementName = qName; + } + + // Apply toLower if enabled + if (toLower) { + elementName = elementName.toLowerCase(Locale.ROOT); + } + + return elementName; + } + + private String getAttributeName(String uri, String localName, String qName) { + String attrName; + if (removeNamespaces) { + attrName = localName != null && !localName.isEmpty() ? localName : qName; + } else { + attrName = qName; + } + + // Apply toLower if enabled + if (toLower) { + attrName = attrName.toLowerCase(Locale.ROOT); + } + + return attrName; + } + } + + /** + * Creates a secure, pre-configured SAX parser factory for XML parsing. + * This factory is configured to prevent XXE attacks with SAX-specific features. 
+ */ + private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactory() { + javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance(); + factory.setValidating(false); + + // Configure SAX-specific security features to prevent XXE attacks + try { + // SAX parser features - these are the correct features for SAXParserFactory + factory.setFeature("http://xml.org/sax/features/external-general-entities", false); + factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); + factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); + } catch (Exception e) { + // If features cannot be set, continue with default settings + } + + return factory; + } + + /** + * Creates a secure, pre-configured namespace-aware SAX parser factory for XML parsing. + * This factory is configured to prevent XXE attacks and has namespace awareness enabled. 
+ */ + private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryNamespaceAware() { + javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance(); + factory.setValidating(false); + factory.setNamespaceAware(true); + + // Configure SAX-specific security features to prevent XXE attacks + try { + // SAX parser features - these are the correct features for SAXParserFactory + factory.setFeature("http://xml.org/sax/features/external-general-entities", false); + factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); + factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); + } catch (Exception e) { + // If features cannot be set, continue with default settings + } + + return factory; + } + + /** + * Creates a secure, pre-configured SAX parser factory for strict XML parsing. + * This factory is configured to prevent XXE attacks and has strict validation enabled. 
+ */ + private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryStrict() { + javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance(); + factory.setValidating(false); + + // Configure SAX-specific security features to prevent XXE attacks + try { + // SAX parser features - these are the correct features for SAXParserFactory + factory.setFeature("http://xml.org/sax/features/external-general-entities", false); + factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); + factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); + + // Enable strict parsing features + factory.setFeature("http://apache.org/xml/features/validation/check-full-element-content", true); + } catch (Exception e) { + // If features cannot be set, continue with default settings + } + + return factory; + } + + /** + * Creates a secure, pre-configured namespace-aware SAX parser factory for strict XML parsing. + * This factory is configured to prevent XXE attacks, has namespace awareness enabled, and strict validation. 
+ */ + private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryNamespaceAwareStrict() { + javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance(); + factory.setValidating(false); + factory.setNamespaceAware(true); + + // Configure SAX-specific security features to prevent XXE attacks + try { + // SAX parser features - these are the correct features for SAXParserFactory + factory.setFeature("http://xml.org/sax/features/external-general-entities", false); + factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); + factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); + + // Enable strict parsing features + factory.setFeature("http://apache.org/xml/features/validation/check-full-element-content", true); + } catch (Exception e) { + // If features cannot be set, continue with default settings + } + + return factory; + } + + /** + * Creates a secure, pre-configured DocumentBuilderFactory for DOM creation. + * Since we only use this factory to create empty DOM documents programmatically + * (not to parse XML), XXE security features are not needed here. + * The SAX parser handles all XML parsing with appropriate security measures. + */ + private static DocumentBuilderFactory createSecureDocumentBuilderFactory() { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + factory.setNamespaceAware(true); // Enable for maximum compatibility + factory.setValidating(false); + + // No XXE security features needed - we only create empty documents, + // never parse XML with this factory + + return factory; + } + + /** + * Selects the appropriate pre-configured SAX parser factory based on processor configuration. 
+ * + * Factory selection matrix: + * - Regular parsing, no namespaces: SAX_PARSER_FACTORY + * - Regular parsing, with namespaces: SAX_PARSER_FACTORY_NS + * - Strict parsing, no namespaces: SAX_PARSER_FACTORY_STRICT + * - Strict parsing, with namespaces: SAX_PARSER_FACTORY_NS_STRICT + * + * @return the appropriate SAX parser factory for the current configuration + */ + private javax.xml.parsers.SAXParserFactory selectSaxParserFactory() { + if (isStrict()) { + return hasNamespaces() ? SAX_PARSER_FACTORY_NS_STRICT : SAX_PARSER_FACTORY_STRICT; + } else { + return hasNamespaces() ? SAX_PARSER_FACTORY_NS : SAX_PARSER_FACTORY; } } } diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java index ba1ba839a9edc..d0f4437074359 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java @@ -16,67 +16,377 @@ import java.util.Map; import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.containsString; public class XmlProcessorFactoryTests extends ESTestCase { - public void testCreate() throws Exception { - XmlProcessor.Factory factory = new XmlProcessor.Factory(); + private static final String DEFAULT_FIELD = "field1"; + private static final String DEFAULT_TARGET_FIELD = "target"; + + /** + * Creates a new XmlProcessor.Factory instance for testing. + */ + private XmlProcessor.Factory createFactory() { + return new XmlProcessor.Factory(); + } + + /** + * Creates a basic configuration map with the specified field name. 
+ */ + private Map createBaseConfig(String fieldName) { Map config = new HashMap<>(); - config.put("field", "field1"); - config.put("target_field", "target"); + config.put("field", fieldName); + return config; + } + + /** + * Creates a basic configuration map with the default field name. + */ + private Map createBaseConfig() { + return createBaseConfig(DEFAULT_FIELD); + } + + /** + * Creates a configuration map with XPath expressions. + */ + private Map createConfigWithXPath(String fieldName, Map xpathExpressions) { + Map config = createBaseConfig(fieldName); + config.put("xpath", xpathExpressions); + return config; + } + + /** + * Creates a configuration map with namespace definitions. + */ + private Map createConfigWithNamespaces(String fieldName, Map namespaces) { + Map config = createBaseConfig(fieldName); + config.put("namespaces", namespaces); + return config; + } + + /** + * Creates a configuration map with both XPath expressions and namespaces. + */ + private Map createConfigWithXPathAndNamespaces( + String fieldName, + Map xpathExpressions, + Map namespaces + ) { + Map config = createBaseConfig(fieldName); + config.put("xpath", xpathExpressions); + config.put("namespaces", namespaces); + return config; + } + + /** + * Creates a processor with the given factory and configuration. + */ + private XmlProcessor createProcessor(XmlProcessor.Factory factory, Map config) throws Exception { + String processorTag = randomAlphaOfLength(10); + return factory.create(null, processorTag, null, config, null); + } + + /** + * Creates a processor with the default factory and given configuration. + */ + private XmlProcessor createProcessor(Map config) throws Exception { + return createProcessor(createFactory(), config); + } + + /** + * Helper method to create XPath configuration map. + */ + private Map createXPathConfig(String... 
expressionsAndFields) { + if (expressionsAndFields.length % 2 != 0) { + throw new IllegalArgumentException("Must provide even number of arguments (expression, field, expression, field, ...)"); + } + + Map xpathConfig = new HashMap<>(); + for (int i = 0; i < expressionsAndFields.length; i += 2) { + xpathConfig.put(expressionsAndFields[i], expressionsAndFields[i + 1]); + } + return xpathConfig; + } + + /** + * Helper method to create namespace configuration map. + */ + private Map createNamespaceConfig(String... prefixesAndUris) { + if (prefixesAndUris.length % 2 != 0) { + throw new IllegalArgumentException("Must provide even number of arguments (prefix, uri, prefix, uri, ...)"); + } + + Map namespaceConfig = new HashMap<>(); + for (int i = 0; i < prefixesAndUris.length; i += 2) { + namespaceConfig.put(prefixesAndUris[i], prefixesAndUris[i + 1]); + } + return namespaceConfig; + } + + /** + * Helper method to create configuration with common boolean options. + */ + private Map createConfigWithOptions(String fieldName, String... 
options) { + Map config = createBaseConfig(fieldName); + + for (String option : options) { + switch (option) { + case "ignore_missing": + config.put("ignore_missing", true); + break; + case "ignore_failure": + config.put("ignore_failure", true); + break; + case "to_lower": + config.put("to_lower", true); + break; + case "ignore_empty_value": + config.put("ignore_empty_value", true); + break; + case "store_xml": + config.put("store_xml", false); // Test false case since default is true + break; + case "remove_namespaces": + config.put("remove_namespaces", true); + break; + case "force_content": + config.put("force_content", true); + break; + case "force_array": + config.put("force_array", true); + break; + case "strict": + config.put("parse_options", "strict"); + break; + default: + throw new IllegalArgumentException("Unknown option: " + option); + } + } + + return config; + } + + /** + * Helper to expect processor creation failure with specific message. + */ + private void expectCreationFailure(Map config, Class exceptionClass, String expectedMessage) { + XmlProcessor.Factory factory = createFactory(); + String processorTag = randomAlphaOfLength(10); + + Exception exception = expectThrows( + exceptionClass, + () -> factory.create(null, processorTag, null, config, null) + ); + assertThat(exception.getMessage(), equalTo(expectedMessage)); + } + + /** + * Tests processor creation with various configurations. 
+ */ + public void testCreate() throws Exception { + Map config = createBaseConfig(); + config.put("target_field", DEFAULT_TARGET_FIELD); config.put("ignore_missing", true); config.put("ignore_failure", true); config.put("to_lower", true); config.put("ignore_empty_value", true); - String processorTag = randomAlphaOfLength(10); - XmlProcessor processor = factory.create(null, processorTag, null, config, null); + XmlProcessor processor = createProcessor(config); - assertThat(processor.getTag(), equalTo(processorTag)); - assertThat(processor.getField(), equalTo("field1")); - assertThat(processor.getTargetField(), equalTo("target")); + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.getTargetField(), equalTo(DEFAULT_TARGET_FIELD)); assertThat(processor.isIgnoreMissing(), equalTo(true)); assertThat(processor.isIgnoreEmptyValue(), equalTo(true)); } public void testCreateWithDefaults() throws Exception { - XmlProcessor.Factory factory = new XmlProcessor.Factory(); - Map config = new HashMap<>(); - config.put("field", "field1"); - - String processorTag = randomAlphaOfLength(10); - XmlProcessor processor = factory.create(null, processorTag, null, config, null); + Map config = createBaseConfig(); + XmlProcessor processor = createProcessor(config); - assertThat(processor.getTag(), equalTo(processorTag)); - assertThat(processor.getField(), equalTo("field1")); - assertThat(processor.getTargetField(), equalTo("field1")); + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.getTargetField(), equalTo(DEFAULT_FIELD)); assertThat(processor.isIgnoreMissing(), equalTo(false)); assertThat(processor.isIgnoreEmptyValue(), equalTo(false)); } public void testCreateMissingField() throws Exception { - XmlProcessor.Factory factory = new XmlProcessor.Factory(); - Map config = new HashMap<>(); - - String processorTag = randomAlphaOfLength(10); - ElasticsearchParseException exception = expectThrows( - 
ElasticsearchParseException.class, - () -> factory.create(null, processorTag, null, config, null) - ); - assertThat(exception.getMessage(), equalTo("[field] required property is missing")); + Map config = new HashMap<>(); // Empty config - no field specified + expectCreationFailure(config, ElasticsearchParseException.class, "[field] required property is missing"); } public void testCreateWithIgnoreEmptyValueOnly() throws Exception { - XmlProcessor.Factory factory = new XmlProcessor.Factory(); - Map config = new HashMap<>(); - config.put("field", "field1"); + Map config = createBaseConfig(); config.put("ignore_empty_value", true); - String processorTag = randomAlphaOfLength(10); - XmlProcessor processor = factory.create(null, processorTag, null, config, null); + XmlProcessor processor = createProcessor(config); - assertThat(processor.getField(), equalTo("field1")); + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); assertThat(processor.isIgnoreEmptyValue(), equalTo(true)); assertThat(processor.isIgnoreMissing(), equalTo(false)); // other flags should remain default } + + public void testCreateWithXPath() throws Exception { + Map xpathConfig = createXPathConfig( + "//author/text()", "author_field", + "//title/@lang", "language_field" + ); + Map config = createConfigWithXPath(DEFAULT_FIELD, xpathConfig); + + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + } + + public void testCreateWithInvalidXPathConfig() throws Exception { + Map config = createBaseConfig(); + config.put("xpath", "invalid_string"); // Should be a map + + expectCreationFailure(config, IllegalArgumentException.class, "XPath configuration must be a map of expressions to target fields"); + } + + public void testCreateWithInvalidXPathTargetField() throws Exception { + Map config = createBaseConfig(); + + Map xpathConfig = new HashMap<>(); + xpathConfig.put("//author/text()", 123); // Should be string + config.put("xpath", 
xpathConfig); + + expectCreationFailure(config, IllegalArgumentException.class, "XPath target field [//author/text()] must be a string, got [Integer]"); + } + + public void testCreateWithNamespaces() throws Exception { + Map namespacesConfig = createNamespaceConfig( + "book", "http://example.com/book", + "author", "http://example.com/author" + ); + Map config = createConfigWithNamespaces(DEFAULT_FIELD, namespacesConfig); + + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.getNamespaces(), equalTo(namespacesConfig)); + } + + public void testCreateWithInvalidNamespacesConfig() throws Exception { + Map config = createBaseConfig(); + config.put("namespaces", "invalid_string"); // Should be a map + + expectCreationFailure(config, IllegalArgumentException.class, "Namespaces configuration must be a map of prefixes to URIs"); + } + + public void testCreateWithInvalidNamespaceURI() throws Exception { + Map config = createBaseConfig(); + + Map namespacesConfig = new HashMap<>(); + namespacesConfig.put("book", 123); // Should be string + config.put("namespaces", namespacesConfig); + + expectCreationFailure(config, IllegalArgumentException.class, "Namespace prefix [book] must have a string URI, got [Integer]"); + } + + public void testCreateWithXPathAndNamespaces() throws Exception { + Map xpathConfig = createXPathConfig( + "//book:author/text()", "author_field", + "//book:title/@lang", "language_field" + ); + Map namespacesConfig = createNamespaceConfig( + "book", "http://example.com/book" + ); + Map config = createConfigWithXPathAndNamespaces(DEFAULT_FIELD, xpathConfig, namespacesConfig); + + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.getNamespaces(), equalTo(namespacesConfig)); + } + + // Tests for individual boolean options + + public void testCreateWithStoreXmlFalse() throws Exception { + Map 
config = createConfigWithOptions(DEFAULT_FIELD, "store_xml"); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.isStoreXml(), equalTo(false)); + } + + public void testCreateWithRemoveNamespaces() throws Exception { + Map config = createConfigWithOptions(DEFAULT_FIELD, "remove_namespaces"); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.isRemoveNamespaces(), equalTo(true)); + } + + public void testCreateWithForceContent() throws Exception { + Map config = createConfigWithOptions(DEFAULT_FIELD, "force_content"); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.isForceContent(), equalTo(true)); + } + + public void testCreateWithForceArray() throws Exception { + Map config = createConfigWithOptions(DEFAULT_FIELD, "force_array"); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.isForceArray(), equalTo(true)); + } + + public void testCreateWithStrictParseOptions() throws Exception { + Map config = createConfigWithOptions(DEFAULT_FIELD, "strict"); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.getParseOptions(), equalTo("strict")); + assertThat(processor.isStrict(), equalTo(true)); + } + + public void testCreateWithMultipleOptions() throws Exception { + Map config = createConfigWithOptions(DEFAULT_FIELD, + "ignore_missing", "force_content", "force_array", "remove_namespaces"); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.isIgnoreMissing(), equalTo(true)); + assertThat(processor.isForceContent(), equalTo(true)); + 
assertThat(processor.isForceArray(), equalTo(true)); + assertThat(processor.isRemoveNamespaces(), equalTo(true)); + } + + // Tests for invalid parse options + + public void testCreateWithInvalidParseOptions() throws Exception { + Map config = createBaseConfig(); + config.put("parse_options", "invalid_option"); + + expectCreationFailure(config, IllegalArgumentException.class, "Invalid parse_options [invalid_option]. Only 'strict' is supported."); + } + + // Tests for XPath compilation errors (testing precompilation feature) + + public void testCreateWithInvalidXPathExpression() throws Exception { + Map xpathConfig = createXPathConfig("invalid xpath ][", "target_field"); + Map config = createConfigWithXPath(DEFAULT_FIELD, xpathConfig); + + XmlProcessor.Factory factory = createFactory(); + String processorTag = randomAlphaOfLength(10); + + IllegalArgumentException exception = expectThrows( + IllegalArgumentException.class, + () -> factory.create(null, processorTag, null, config, null) + ); + + // Check that the error message contains the XPath expression and indicates it's invalid + assertThat(exception.getMessage(), containsString("Invalid XPath expression [invalid xpath ][]:")); + assertThat(exception.getMessage(), containsString("javax.xml.transform.TransformerException")); + } + + public void testCreateWithXPathUsingNamespacesWithoutConfiguration() throws Exception { + Map xpathConfig = createXPathConfig("//book:title/text()", "title_field"); + Map config = createConfigWithXPath(DEFAULT_FIELD, xpathConfig); + + expectCreationFailure(config, IllegalArgumentException.class, "Invalid XPath expression [//book:title/text()]: contains namespace prefixes but no namespace configuration provided"); + } } diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java index 0b1ee4123f128..e48c6de593f84 100644 --- 
a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java @@ -10,617 +10,627 @@ package org.elasticsearch.ingest.common; import org.elasticsearch.ingest.IngestDocument; -import org.elasticsearch.ingest.RandomDocumentPicks; import org.elasticsearch.test.ESTestCase; +import static org.hamcrest.Matchers.equalTo; import java.util.HashMap; -import java.util.List; import java.util.Map; +import java.util.List; -import static org.hamcrest.Matchers.containsString; -import static org.hamcrest.Matchers.equalTo; - +/** + * Tests for {@link XmlProcessor}. These tests ensure feature parity and test coverage. + */ +@SuppressWarnings("unchecked") public class XmlProcessorTests extends ESTestCase { - public void testSimpleXmlDecodeWithTargetField() throws Exception { - String xml = """ - - - William H. Gaddis - The Recognitions - One of the great seminal American novels of the 20th century. - - """; - - XmlProcessor processor = new XmlProcessor("tag", null, "message", "xml", false, false, false, false); - Map document = new HashMap<>(); - document.put("message", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); - - processor.execute(ingestDocument); - - @SuppressWarnings("unchecked") - Map xmlField = (Map) ingestDocument.getFieldValue("xml", Object.class); - @SuppressWarnings("unchecked") - Map catalog = (Map) xmlField.get("catalog"); - @SuppressWarnings("unchecked") - Map book = (Map) catalog.get("book"); + private static final String XML_FIELD = "xmldata"; + private static final String TARGET_FIELD = "data"; - assertThat(book.get("seq"), equalTo("1")); - assertThat(book.get("author"), equalTo("William H. 
Gaddis")); - assertThat(book.get("title"), equalTo("The Recognitions")); - assertThat(book.get("review"), equalTo("One of the great seminal American novels of the 20th century.")); - - // Original field should remain unchanged - assertThat(ingestDocument.getFieldValue("message", String.class), equalTo(xml)); + private static IngestDocument createTestIngestDocument(String xml) { + return new IngestDocument("_index", "_id", 1, null, null, new HashMap<>(Map.of(XML_FIELD, xml))); } - - public void testXmlDecodeToSameFieldWhenTargetIsField() throws Exception { - String xml = """ - - - - William H. Gaddis - The Recognitions - One of the great seminal American novels of the 20th century. - - """; - - XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, false); - Map document = new HashMap<>(); - document.put("message", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); - - processor.execute(ingestDocument); - - @SuppressWarnings("unchecked") - Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); - @SuppressWarnings("unchecked") - Map catalog = (Map) messageField.get("catalog"); - @SuppressWarnings("unchecked") - Map book = (Map) catalog.get("book"); - - assertThat(book.get("seq"), equalTo("1")); - assertThat(book.get("author"), equalTo("William H. 
Gaddis")); - assertThat(book.get("title"), equalTo("The Recognitions")); - assertThat(book.get("review"), equalTo("One of the great seminal American novels of the 20th century.")); + + private static XmlProcessor createTestProcessor(Map config) { + config.putIfAbsent("field", XML_FIELD); + config.putIfAbsent("target_field", TARGET_FIELD); + + XmlProcessor.Factory factory = new XmlProcessor.Factory(); + try { + return factory.create(null, "_tag", null, config, null); + } catch (Exception e){ + fail("Failed to create XmlProcessor: " + e.getMessage()); + return null; // This line will never be reached, but is needed to satisfy the compiler + } } - public void testXmlDecodeWithArray() throws Exception { - String xml = """ - - - - William H. Gaddis - The Recognitions - One of the great seminal American novels of the 20th century. - - - Ralls, Kim - Midnight Rain - Some review. - - """; - - XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, false); - Map document = new HashMap<>(); - document.put("message", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); - - processor.execute(ingestDocument); - - @SuppressWarnings("unchecked") - Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); - @SuppressWarnings("unchecked") - Map catalog = (Map) messageField.get("catalog"); - @SuppressWarnings("unchecked") - List> books = (List>) catalog.get("book"); + /** + * Test parsing standard XML with attributes. + */ + public void testParseStandardXml() { + String xml = ""; - assertThat(books.size(), equalTo(2)); - - Map firstBook = books.get(0); - assertThat(firstBook.get("author"), equalTo("William H. 
Gaddis")); - assertThat(firstBook.get("title"), equalTo("The Recognitions")); - - Map secondBook = books.get(1); - assertThat(secondBook.get("author"), equalTo("Ralls, Kim")); - assertThat(secondBook.get("title"), equalTo("Midnight Rain")); - } - - public void testXmlDecodeWithToLower() throws Exception { - String xml = """ - - - - N/A - - - N/A - - - """; - - XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, true, false); - Map document = new HashMap<>(); - document.put("message", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - - @SuppressWarnings("unchecked") - Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); - @SuppressWarnings("unchecked") - Map auditbase = (Map) messageField.get("auditbase"); - @SuppressWarnings("unchecked") - Map contextcomponents = (Map) auditbase.get("contextcomponents"); - @SuppressWarnings("unchecked") - List> components = (List>) contextcomponents.get("component"); - - assertThat(components.size(), equalTo(2)); - assertThat(components.get(0).get("relyingparty"), equalTo("N/A")); - assertThat(components.get(1).get("primaryauth"), equalTo("N/A")); + + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Map foo = (Map) data.get("foo"); + assertThat(foo.get("key"), equalTo("value")); } - - public void testXmlDecodeWithMultipleElements() throws Exception { - String xml = """ - - - - William H. Gaddis - The Recognitions - One of the great seminal American novels of the 20th century. - - - Ralls, Kim - Midnight Rain - Some review. - - - - Ralls, Kim - A former architect battles corporate zombies, - an evil sorceress, and her own childhood to become queen of the world. 
- - - - """; - - XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, false); - Map document = new HashMap<>(); - document.put("message", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + /** + * Test parsing XML with array elements (multiple elements with same name). + */ + public void testParseXmlWithArrayValue() { + String xml = "value1value2"; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - - @SuppressWarnings("unchecked") - Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); - @SuppressWarnings("unchecked") - Map catalog = (Map) messageField.get("catalog"); - - // Check books array - @SuppressWarnings("unchecked") - List> books = (List>) catalog.get("book"); - assertThat(books.size(), equalTo(2)); - - // Check secondcategory - @SuppressWarnings("unchecked") - Map secondcategory = (Map) catalog.get("secondcategory"); - @SuppressWarnings("unchecked") - Map paper = (Map) secondcategory.get("paper"); - assertThat(paper.get("id"), equalTo("bk102")); - assertThat(paper.get("test2"), equalTo("Ralls, Kim")); + + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Map foo = (Map) data.get("foo"); + List keyValues = (List) foo.get("key"); + assertThat(keyValues.size(), equalTo(2)); + + // The values might be nested inside their own lists + Object firstValue = keyValues.get(0); + assertThat(firstValue, equalTo("value1")); + + Object secondValue = keyValues.get(1); + assertThat(secondValue, equalTo("value2")); } - - public void testXmlDecodeWithUtf16Encoding() throws Exception { - String xml = """ - - - - William H. Gaddis - The Recognitions - One of the great seminal American novels of the 20th century. 
- - """; - - XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, false); - Map document = new HashMap<>(); - document.put("message", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + /** + * Test parsing XML with nested elements. + */ + public void testParseXmlWithNestedElements() { + String xml = "value"; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - - @SuppressWarnings("unchecked") - Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); - @SuppressWarnings("unchecked") - Map catalog = (Map) messageField.get("catalog"); - @SuppressWarnings("unchecked") - Map book = (Map) catalog.get("book"); - - assertThat(book.get("author"), equalTo("William H. Gaddis")); - assertThat(book.get("title"), equalTo("The Recognitions")); + + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Map foo = (Map) data.get("foo"); + + Map key1Map = (Map) foo.get("key1"); + assertThat(key1Map.size(), equalTo(1)); + + String key2Value = (String) key1Map.get("key2"); + assertThat(key2Value, equalTo("value")); } - public void testBrokenXmlWithIgnoreFailureFalse() { - String brokenXml = """ - - - - William H. Gaddis - The Recognitions - One of the great seminal American novels of the 20th century. 
- - catalog>"""; - - XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, false); - Map document = new HashMap<>(); - document.put("message", brokenXml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); - - Exception exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); - assertThat(exception.getMessage(), containsString("contains invalid XML")); - } + /** + * Test parsing XML in a single item array. + */ + public void testParseXmlInSingleItemArray() { + String xml = ""; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); - public void testBrokenXmlWithIgnoreFailureTrue() throws Exception { - String brokenXml = """ - - - - William H. Gaddis - The Recognitions - One of the great seminal American novels of the 20th century. - - catalog>"""; - - XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, true, false, false); - Map document = new HashMap<>(); - document.put("message", brokenXml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); - - // Should not throw exception and leave document unchanged processor.execute(ingestDocument); - assertThat(ingestDocument.getFieldValue("message", String.class), equalTo(brokenXml)); - } - - public void testFieldNotFound() { - XmlProcessor processor = new XmlProcessor("tag", null, "nonexistent", "target", false, false, false, false); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>()); - - Exception exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); - assertThat(exception.getMessage(), containsString("not present as part of path [nonexistent]")); + + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); 
+ Map foo = (Map) data.get("foo"); + assertThat(foo.get("bar"), equalTo("baz")); } - public void testFieldNotFoundWithIgnoreMissing() throws Exception { - XmlProcessor processor = new XmlProcessor("tag", null, "nonexistent", "target", true, false, false, false); - IngestDocument originalDocument = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>()); - IngestDocument ingestDocument = new IngestDocument(originalDocument); + /** + * Test extracting a single element using XPath. + */ + public void testXPathSingleElementExtraction() { + String xml = "helloworld"; + + Map xpathMap = Map.of("/foo/bar/text()", "bar_content"); + + Map config = new HashMap<>(); + config.put("xpath", xpathMap); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - - // Document should remain unchanged - assertThat(ingestDocument.getSourceAndMetadata(), equalTo(originalDocument.getSourceAndMetadata())); + + // Get the XPath result + Object barContent = ingestDocument.getFieldValue("bar_content", Object.class); + assertNotNull(barContent); + assertEquals("hello", barContent); + + // Verify that the full parsed XML is also available + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Map foo = (Map) data.get("foo"); + assertNotNull(foo); + assertThat(foo.get("bar"), equalTo("hello")); + assertThat(foo.get("baz"), equalTo("world")); } - public void testNullValue() { - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); - Map document = new HashMap<>(); - document.put("field", null); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + /** + * Test extracting multiple elements using XPath. 
+ */ + public void testXPathMultipleElementsExtraction() { + String xml = "firstsecondthird"; + + Map xpathMap = Map.of("/foo/bar", "all_bars"); + + Map config = new HashMap<>(); + config.put("xpath", xpathMap); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); - Exception exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); - assertThat(exception.getMessage(), containsString("field [field] is null")); + processor.execute(ingestDocument); + + List allBars = ingestDocument.getFieldValue("all_bars", List.class); + + assertNotNull(allBars); + assertThat(allBars.size(), equalTo(3)); + assertThat(allBars.get(0), equalTo("first")); + assertThat(allBars.get(1), equalTo("second")); + assertThat(allBars.get(2), equalTo("third")); } - public void testNullValueWithIgnoreMissing() throws Exception { - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", true, false, false, false); - Map document = new HashMap<>(); - document.put("field", null); - IngestDocument originalDocument = RandomDocumentPicks.randomIngestDocument(random(), document); - IngestDocument ingestDocument = new IngestDocument(originalDocument); + /** + * Test extracting attributes using XPath. 
+ */ + public void testXPathAttributeExtraction() { + String xml = "content"; + + Map xpathMap = new HashMap<>(); + xpathMap.put("/foo/bar/@id", "bar_id"); + xpathMap.put("/foo/bar/@type", "bar_type"); + + Map config = new HashMap<>(); + config.put("xpath", xpathMap); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - - // Document should remain unchanged - assertThat(ingestDocument.getSourceAndMetadata(), equalTo(originalDocument.getSourceAndMetadata())); + + String barId = ingestDocument.getFieldValue("bar_id", String.class); + assertNotNull(barId); + assertThat(barId, equalTo("123")); + + String barType = ingestDocument.getFieldValue("bar_type", String.class); + assertNotNull(barType); + assertThat(barType, equalTo("test")); } - public void testNonStringValueWithIgnoreFailureFalse() { - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); - Map document = new HashMap<>(); - document.put("field", 123); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + /** + * Test extracting elements with namespaces using XPath. 
+ */ + public void testXPathNamespacedExtraction() { + String xml = "" + + "" + + " namespace-value" + + " regular-value" + + ""; + + Map namespaces = Map.of("myns", "http://example.org/ns1"); + Map xpathMap = Map.of("//myns:element/text()", "ns_value"); + + Map config = new HashMap<>(); + config.put("xpath", xpathMap); + config.put("namespaces", namespaces); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); - Exception exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); - assertThat(exception.getMessage(), containsString("field [field] is not a string")); + processor.execute(ingestDocument); + + String nsValue = ingestDocument.getFieldValue("ns_value", String.class); + assertNotNull(nsValue); + assertThat(nsValue, equalTo("namespace-value")); } - public void testNonStringValueWithIgnoreFailureTrue() throws Exception { - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, true, false, false); - Map document = new HashMap<>(); - document.put("field", 123); - IngestDocument originalDocument = RandomDocumentPicks.randomIngestDocument(random(), document); - IngestDocument ingestDocument = new IngestDocument(originalDocument); + /** + * Test parsing XML with mixed content (text and elements mixed together). 
+ */ + public void testParseXmlWithMixedContent() { + String xml = "This text is bold and this is italic!"; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - - // Document should remain unchanged - assertThat(ingestDocument.getSourceAndMetadata(), equalTo(originalDocument.getSourceAndMetadata())); + + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Map foo = (Map) data.get("foo"); + + assertNotNull(foo.get("b")); + assertThat((String)foo.get("b"), equalTo("bold")); + assertNotNull(foo.get("i")); + assertThat((String)foo.get("i"), equalTo("italic")); + assertNotNull(foo.get("#text")); + assertThat((String)foo.get("#text"), equalTo("This text is and this is !")); } - - public void testEmptyXml() throws Exception { - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); - Map document = new HashMap<>(); - document.put("field", ""); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + /** + * Test parsing XML with CDATA sections. 
+ */ + public void testParseXmlWithCDATA() { + String xml = " that shouldn't be parsed!]]>"; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - - // Empty XML should result in null target - assertThat(ingestDocument.getFieldValue("target", Object.class), equalTo(null)); + + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Object content = data.get("foo"); + + assertNotNull(content); + assertThat(content, equalTo("This is CDATA content with that shouldn't be parsed!")); } - - public void testWhitespaceOnlyXml() throws Exception { - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); - Map document = new HashMap<>(); - document.put("field", " \n\t "); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + /** + * Test parsing XML with numeric data. 
+ */ + public void testParseXmlWithNumericData() { + String xml = "12399.95true"; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - - // Whitespace-only XML should result in null target - assertThat(ingestDocument.getFieldValue("target", Object.class), equalTo(null)); + + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Map foo = (Map) data.get("foo"); + + assertThat((String)foo.get("count"), equalTo("123")); + assertThat((String)foo.get("price"), equalTo("99.95")); + assertThat((String)foo.get("active"), equalTo("true")); } - public void testSelfClosingTag() throws Exception { - String xml = ""; - - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); - Map document = new HashMap<>(); - document.put("field", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + /** + * Test parsing XML with force_array option enabled. 
+ */ + public void testParseXmlWithForceArray() { + String xml = "single_value"; + + Map config = new HashMap<>(); + config.put("force_array", true); // Enable force_array option + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - - @SuppressWarnings("unchecked") - Map result = (Map) ingestDocument.getFieldValue("target", Object.class); - assertThat(result.get("empty"), equalTo(null)); + + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Map foo = (Map) data.get("foo"); + + // With force_array=true, even single values should be in arrays + Object barValue = foo.get("bar"); + assertNotNull(barValue); + assertTrue("Expected bar value to be a List with force_array=true", barValue instanceof List); + + List barList = (List) barValue; + assertThat(barList.size(), equalTo(1)); + assertThat(barList.get(0), equalTo("single_value")); } - public void testSelfClosingTagWithAttributes() throws Exception { - String xml = ""; - - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); - Map document = new HashMap<>(); - document.put("field", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + /** + * Test extracting multiple elements using multiple XPath expressions. + * Tests that multiple XPath expressions can be used simultaneously. 
+ */ + public void testMultipleXPathExpressions() { + String xml = "" + + " John30" + + " Jane25" + + ""; + + // Configure multiple XPath expressions + Map xpathMap = new HashMap<>(); + xpathMap.put("/root/person[1]/n/text()", "first_person_name"); + xpathMap.put("/root/person[2]/n/text()", "second_person_name"); + xpathMap.put("/root/person/@id", "person_ids"); + + Map config = new HashMap<>(); + config.put("xpath", xpathMap); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - - @SuppressWarnings("unchecked") - Map result = (Map) ingestDocument.getFieldValue("target", Object.class); - @SuppressWarnings("unchecked") - Map empty = (Map) result.get("empty"); - assertThat(empty.get("id"), equalTo("123")); - assertThat(empty.get("name"), equalTo("test")); + + assertTrue("first_person_name field should exist", ingestDocument.hasField("first_person_name")); + assertTrue("second_person_name field should exist", ingestDocument.hasField("second_person_name")); + assertTrue("person_ids field should exist", ingestDocument.hasField("person_ids")); + + Object firstName = ingestDocument.getFieldValue("first_person_name", Object.class); + assertEquals("John", firstName); + + Object secondName = ingestDocument.getFieldValue("second_person_name", Object.class); + assertEquals("Jane", secondName); + + Object personIdsObj = ingestDocument.getFieldValue("person_ids", Object.class); + assertTrue("person_ids should be a List", personIdsObj instanceof List); + List personIds = (List) personIdsObj; + assertEquals("Should have 2 person IDs", 2, personIds.size()); + assertEquals("First person ID should be '1'", "1", personIds.get(0)); + assertEquals("Second person ID should be '2'", "2", personIds.get(1)); + + assertTrue("Target field should exist", ingestDocument.hasField(TARGET_FIELD)); } - - public void testGetType() { - XmlProcessor processor = new XmlProcessor("tag", null, 
"field", "target", false, false, false, false); - assertThat(processor.getType(), equalTo("xml")); + + /** + * Test handling of invalid XML with ignoreFailure=false. + */ + public void testInvalidXml() { + String xml = ""; // Invalid XML missing closing tag + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> { + processor.execute(ingestDocument); + }); + + assertTrue("Error message should indicate XML is invalid", + exception.getMessage().contains("invalid XML") || + exception.getCause().getMessage().contains("XML")); } - - public void testGetters() { - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", true, true, true, false); - assertThat(processor.getField(), equalTo("field")); - assertThat(processor.getTargetField(), equalTo("target")); - assertThat(processor.isIgnoreMissing(), equalTo(true)); + + /** + * Test handling of invalid XML with ignoreFailure=true. + */ + public void testInvalidXmlWithIgnoreFailure() { + String xml = ""; // Invalid XML missing closing tag + + Map config = new HashMap<>(); + config.put("ignore_failure", true); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + List tags = ingestDocument.getFieldValue("tags", List.class); + assertNotNull(tags); + assertTrue(tags.contains("_xmlparsefailure")); } - - public void testIgnoreEmptyValueEnabled() throws Exception { - String xml = """ - - - William H. Gaddis - - One of the great seminal American novels. 
- - - - Some content - - - - """; - - XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, true); - Map document = new HashMap<>(); - document.put("message", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); - + + /** + * Test the store_xml=false option to not store parsed XML in target field. + */ + public void testNoStoreXml() { + String xml = "value"; + + // Set up XPath to extract value but don't store XML + Map xpathMap = Map.of("/foo/bar/text()", "bar_content"); + + Map config = new HashMap<>(); + config.put("store_xml", false); // Do not store XML in target field + config.put("xpath", xpathMap); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + processor.execute(ingestDocument); - - @SuppressWarnings("unchecked") - Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); - @SuppressWarnings("unchecked") - Map catalog = (Map) messageField.get("catalog"); - @SuppressWarnings("unchecked") - Map book = (Map) catalog.get("book"); - - // Empty title should be filtered out - assertThat(book.containsKey("title"), equalTo(false)); - // Empty element should be filtered out - assertThat(book.containsKey("empty"), equalTo(false)); - // empty_book should be filtered out entirely - assertThat(catalog.containsKey("empty_book"), equalTo(false)); - - // Valid content should remain - assertThat(book.get("author"), equalTo("William H. 
Gaddis")); - assertThat(book.get("review"), equalTo("One of the great seminal American novels.")); - - // Nested structure handling - @SuppressWarnings("unchecked") - Map nested = (Map) book.get("nested"); - assertThat(nested.containsKey("empty_text"), equalTo(false)); // Whitespace-only should be filtered - assertThat(nested.get("valid_content"), equalTo("Some content")); + + // Verify XPath result is stored + String barContent = ingestDocument.getFieldValue("bar_content", String.class); + assertNotNull(barContent); + assertThat(barContent, equalTo("value")); + + // Verify the target field was not created + assertFalse(ingestDocument.hasField(TARGET_FIELD)); } - - public void testIgnoreEmptyValueWithArrays() throws Exception { - String xml = """ - - - Valid Book - - - - - - Another Valid Book - - """; - - XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, true); - Map document = new HashMap<>(); - document.put("message", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); - + + /** + * Test the to_lower option for converting field names to lowercase. 
+ */ + public void testToLower() { + String xml = "value"; + + Map config = new HashMap<>(); + config.put("to_lower", true); // Enable to_lower option + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + processor.execute(ingestDocument); - - @SuppressWarnings("unchecked") - Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); - @SuppressWarnings("unchecked") - Map catalog = (Map) messageField.get("catalog"); - @SuppressWarnings("unchecked") - List> books = (List>) catalog.get("book"); - - // Should have 2 books after filtering out the one with empty title - assertThat(books.size(), equalTo(2)); - assertThat(books.get(0).get("title"), equalTo("Valid Book")); - assertThat(books.get(1).get("title"), equalTo("Another Valid Book")); + + // Verify field names are lowercase + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + assertTrue(data.containsKey("foo")); + assertFalse(data.containsKey("FOO")); + + Map foo = (Map) data.get("foo"); + assertTrue(foo.containsKey("bar")); + assertFalse(foo.containsKey("BAR")); + assertThat(foo.get("bar"), equalTo("value")); } - - public void testIgnoreEmptyValueDisabled() throws Exception { - String xml = """ - - - William H. Gaddis - - - - """; - - XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, false); - Map document = new HashMap<>(); - document.put("message", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); - + + /** + * Test the ignore_missing option when field is missing. 
+ */ + public void testIgnoreMissing() { + String xmlField = "nonexistent_field"; + + Map config = new HashMap<>(); + config.put("field", xmlField); + config.put("ignore_missing", true); // Enable ignore_missing option + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = new IngestDocument("_index", "_id", 1, null, null, new HashMap<>(Map.of())); processor.execute(ingestDocument); - - @SuppressWarnings("unchecked") - Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); - @SuppressWarnings("unchecked") - Map catalog = (Map) messageField.get("catalog"); - @SuppressWarnings("unchecked") - Map book = (Map) catalog.get("book"); - - // Empty values should remain when ignore_empty_value is false - assertThat(book.containsKey("title"), equalTo(true)); - assertThat(book.get("title"), equalTo(null)); // Empty elements are parsed as null - assertThat(book.containsKey("empty"), equalTo(true)); - assertThat(book.get("empty"), equalTo(null)); - assertThat(book.get("author"), equalTo("William H. 
Gaddis")); + + assertFalse("Target field should not be created when source field is missing", + ingestDocument.hasField(TARGET_FIELD)); + + // With ignoreMissing=false + config.put("ignore_missing", false); + XmlProcessor failingProcessor = createTestProcessor(config); + + // This should throw an exception + IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> { + failingProcessor.execute(ingestDocument); + }); + + assertTrue(exception.getMessage().contains("not present as part of path")); } - - public void testGettersWithIgnoreEmptyValue() { - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", true, true, true, true); - assertThat(processor.getField(), equalTo("field")); - assertThat(processor.getTargetField(), equalTo("target")); - assertThat(processor.isIgnoreMissing(), equalTo(true)); - assertThat(processor.isIgnoreEmptyValue(), equalTo(true)); + + /** + * Test that ignore_empty_value correctly filters out empty values from arrays and mixed content. 
+ */ + public void testIgnoreEmptyValue() { + // XML with mixed empty and non-empty elements, including array elements with mixed empty/non-empty values + String xml = "" + + " " + + " " + + " content" + + " nested-content" + + " " + + " first" + + " " + + " third" + + " " + + " fifth" + + " " + + " Text with and content" + + ""; + + Map config = new HashMap<>(); + config.put("ignore_empty_value", true); + XmlProcessor processor = createTestProcessor(config); + + IngestDocument ingestDocument = createTestIngestDocument(xml); + processor.execute(ingestDocument); + + Map result = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + Map root = (Map) result.get("root"); + + // Check empty elements are filtered + assertFalse("Empty element should be filtered out", root.containsKey("empty")); + assertFalse("Blank element should be filtered out", root.containsKey("blank")); + + // Check valid elements are preserved + assertTrue("Valid element should be preserved", root.containsKey("valid")); + assertEquals("content", root.get("valid")); + + // Check nested structure filtering + Map nested = (Map) root.get("nested"); + assertNotNull("Nested element should be preserved", nested); + assertFalse("Empty nested element should be filtered", nested.containsKey("empty")); + assertEquals("nested-content", nested.get("valid")); + + // Check array with mixed empty/non-empty values + Map items = (Map) root.get("items"); + assertNotNull("Items object should be preserved", items); + List itemList = (List) items.get("item"); + assertNotNull("Item array should be preserved", itemList); + assertEquals("Array should contain only non-empty items", 3, itemList.size()); + assertEquals("first", itemList.get(0)); + assertEquals("third", itemList.get(1)); + assertEquals("fifth", itemList.get(2)); + + // Check mixed content handling + Map mixed = (Map) root.get("mixed"); + assertNotNull("Mixed content should be preserved", mixed); + assertFalse("Empty element in mixed content should be 
filtered", mixed.containsKey("empty")); + assertTrue("Valid element in mixed content should be preserved", mixed.containsKey("valid")); + assertEquals("content", mixed.get("valid")); + assertEquals("Text with and", mixed.get("#text")); } - - public void testElementsWithAttributesAndTextContent() throws Exception { - String xml = """ - - - The Recognitions - William H. Gaddis - 29.99 - - """; - - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); - Map document = new HashMap<>(); - document.put("field", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); - + + /** + * Test parsing with strict mode option. + */ + public void testStrictParsing() { + String xml = "valid"; + + Map config = new HashMap<>(); + config.put("parse_options", "strict"); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + processor.execute(ingestDocument); - - @SuppressWarnings("unchecked") - Map result = (Map) ingestDocument.getFieldValue("target", Object.class); - @SuppressWarnings("unchecked") - Map catalog = (Map) result.get("catalog"); - @SuppressWarnings("unchecked") - Map book = (Map) catalog.get("book"); - @SuppressWarnings("unchecked") - Map title = (Map) book.get("title"); - @SuppressWarnings("unchecked") - Map author = (Map) book.get("author"); - @SuppressWarnings("unchecked") - Map price = (Map) book.get("price"); - - // Test catalog with attributes only - assertThat(catalog.get("version"), equalTo("1.0")); - - // Test book with attributes only (no text content) - assertThat(book.get("id"), equalTo("123")); - assertThat(book.get("isbn"), equalTo("978-0-684-80335-9")); - - // Test elements with both attributes and text content (should use #text key) - assertThat(title.get("lang"), equalTo("en")); - assertThat(title.get("#text"), equalTo("The Recognitions")); - - assertThat(author.get("nationality"), 
equalTo("American")); - assertThat(author.get("#text"), equalTo("William H. Gaddis")); - - assertThat(price.get("currency"), equalTo("USD")); - assertThat(price.get("#text"), equalTo("29.99")); + + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + Map foo = (Map) data.get("foo"); + assertThat(foo.get("bar"), equalTo("valid")); + + // Test with invalid XML in strict mode + String invalidXml = ""; + IngestDocument invalidDocument = createTestIngestDocument(invalidXml); + + IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> { + processor.execute(invalidDocument); + }); + + assertTrue("Error message should indicate XML is invalid", + exception.getMessage().contains("invalid XML") || + exception.getCause().getMessage().contains("XML")); } - - public void testMixedAttributesAndTextWithToLower() throws Exception { - String xml = """ - - - The Recognitions - William H. Gaddis - - """; - - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, true, false); - Map document = new HashMap<>(); - document.put("field", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); - + + /** + * Test parsing XML with remove_namespaces option. 
+ */ + public void testRemoveNamespaces() { + String xml = "value"; + + Map config = new HashMap<>(); + config.put("remove_namespaces", true); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + processor.execute(ingestDocument); + + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + Map foo = (Map) data.get("foo"); + + assertTrue("Element with namespace should be present", foo.containsKey("ns:bar")); + assertThat(foo.get("ns:bar"), equalTo("value")); + + // Now test with removeNamespaces=false + IngestDocument ingestDocument2 = createTestIngestDocument(xml); + + config.put("remove_namespaces", false); + XmlProcessor processor2 = createTestProcessor(config); + processor2.execute(ingestDocument2); + + Map data2 = ingestDocument2.getFieldValue(TARGET_FIELD, Map.class); + Map foo2 = (Map) data2.get("foo"); + + // With removeNamespaces=false, the "ns:" prefix should be preserved + assertTrue("Element should be accessible with namespace prefix", foo2.containsKey("ns:bar")); + assertThat(foo2.get("ns:bar"), equalTo("value")); + } - @SuppressWarnings("unchecked") - Map result = (Map) ingestDocument.getFieldValue("target", Object.class); - @SuppressWarnings("unchecked") - Map catalog = (Map) result.get("catalog"); - @SuppressWarnings("unchecked") - Map book = (Map) catalog.get("book"); - @SuppressWarnings("unchecked") - Map title = (Map) book.get("title"); - @SuppressWarnings("unchecked") - Map author = (Map) book.get("author"); - - // Test that element names are converted to lowercase - assertThat(catalog.get("version"), equalTo("1.0")); - assertThat(book.get("id"), equalTo("123")); - - // Test that attribute names are converted to lowercase but values remain unchanged - assertThat(title.get("lang"), equalTo("EN")); - assertThat(title.get("#text"), equalTo("The Recognitions")); - - assertThat(author.get("nationality"), equalTo("AMERICAN")); - assertThat(author.get("#text"), 
equalTo("William H. Gaddis")); + /** + * Test the force_content option. + */ + public void testForceContent() { + String xml = "simple text"; + + Map config = new HashMap<>(); + config.put("force_content", true); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + Map foo = (Map) data.get("foo"); + + // With forceContent=true, the text should be in a #text field + assertTrue("Text content should be in #text field", foo.containsKey("#text")); + assertThat(foo.get("#text"), equalTo("simple text")); + + // Now test with forceContent=false + config.put("force_content", false); + XmlProcessor processor2 = createTestProcessor(config); + IngestDocument ingestDocument2 = createTestIngestDocument(xml); + + processor2.execute(ingestDocument2); + + Map data2 = ingestDocument2.getFieldValue(TARGET_FIELD, Map.class); + + // With forceContent=false, the text should be directly assigned to the element + assertThat(data2.get("foo"), equalTo("simple text")); } } From 002802d1af34999b8c6d84bdaacd0e5c38d16fec Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Fri, 4 Jul 2025 14:09:32 +0000 Subject: [PATCH 07/54] [CI] Auto commit changes from spotless --- .../ingest/common/XmlProcessor.java | 213 ++++++------ .../common/XmlProcessorFactoryTests.java | 84 ++--- .../ingest/common/XmlProcessorTests.java | 313 +++++++++--------- 3 files changed, 322 insertions(+), 288 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 582b8d9f3e787..2bcb8666dba6b 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -14,6 +14,9 @@ 
import org.elasticsearch.ingest.ConfigurationUtils; import org.elasticsearch.ingest.IngestDocument; import org.elasticsearch.ingest.Processor; +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; import java.util.ArrayList; import java.util.HashMap; @@ -31,10 +34,6 @@ import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; -import org.w3c.dom.Document; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; - /** * Processor that parses XML documents and converts them to JSON objects using a single-pass streaming approach. * @@ -49,15 +48,16 @@ public final class XmlProcessor extends AbstractProcessor { public static final String TYPE = "xml"; - + private static final XPathFactory XPATH_FACTORY = XPathFactory.newInstance(); - + // Pre-configured SAX parser factories for secure XML parsing private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY = createSecureSaxParserFactory(); private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY_NS = createSecureSaxParserFactoryNamespaceAware(); private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY_STRICT = createSecureSaxParserFactoryStrict(); - private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY_NS_STRICT = createSecureSaxParserFactoryNamespaceAwareStrict(); - + private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY_NS_STRICT = + createSecureSaxParserFactoryNamespaceAwareStrict(); + // Pre-configured document builder factory for DOM creation private static final DocumentBuilderFactory DOM_FACTORY = createSecureDocumentBuilderFactory(); @@ -202,13 +202,13 @@ public String getType() { /** * Determines if a value should be considered empty for filtering purposes. * Used by the ignore_empty_value feature to filter out empty content. 
- * + * * Considers empty: * - null values * - empty or whitespace-only strings * - empty Maps * - empty Lists - * + * * @param value the value to check * @return true if the value should be considered empty */ @@ -235,7 +235,7 @@ private boolean isEmptyValue(Object value) { * - ATTRIBUTE_NODE: returns attribute value * - ELEMENT_NODE: returns text content (concatenated text of all descendants) * - Other node types: returns text content as fallback - * + * * @param node the DOM node to extract text from * @return the text content of the node, or null if node is null */ @@ -243,7 +243,7 @@ private String getNodeValue(Node node) { if (node == null) { return null; } - + switch (node.getNodeType()) { case Node.ATTRIBUTE_NODE: case Node.CDATA_SECTION_NODE: @@ -263,7 +263,7 @@ private String getNodeValue(Node node) { * - If force_array is true and content is already a List: returns content unchanged * - If force_array is true and content is not a List: wraps content in a new ArrayList * - Handles null content appropriately (wraps null in array if force_array is true) - * + * * @param elementName the name of the element (for context, not used in current implementation) * @param content the content to potentially wrap in an array * @return the content, optionally wrapped in an array based on force_array setting @@ -279,13 +279,13 @@ private Object applyForceArray(String elementName, Object content) { /** * Evaluates precompiled XPath expressions against a DOM document and adds results to the ingest document. 
- * + * * Features: * - Uses precompiled XPath expressions for optimal performance * - Extracts text values from matched nodes (elements, attributes, text nodes) * - Single matches stored as strings, multiple matches as string arrays * - Respects ignoreFailure setting for XPath evaluation errors - * + * * @param document the ingest document to add XPath results to * @param doc the DOM document to evaluate XPath expressions against * @throws Exception if XPath processing fails and ignoreFailure is false @@ -295,14 +295,14 @@ private void processXPathExpressionsFromDom(IngestDocument document, Document do for (Map.Entry entry : compiledXPathExpressions.entrySet()) { String targetFieldName = entry.getKey(); XPathExpression compiledExpression = entry.getValue(); - + try { Object result = compiledExpression.evaluate(doc, XPathConstants.NODESET); - + if (result instanceof NodeList) { NodeList nodeList = (NodeList) result; List values = new ArrayList<>(); - + for (int i = 0; i < nodeList.getLength(); i++) { Node node = nodeList.item(i); String value = getNodeValue(node); @@ -310,7 +310,7 @@ private void processXPathExpressionsFromDom(IngestDocument document, Document do values.add(value); } } - + if (values.isEmpty() == false) { if (values.size() == 1) { document.setFieldValue(targetFieldName, values.get(0)); @@ -321,7 +321,10 @@ private void processXPathExpressionsFromDom(IngestDocument document, Document do } } catch (XPathExpressionException e) { if (ignoreFailure == false) { - throw new IllegalArgumentException("XPath evaluation failed for target field [" + targetFieldName + "]: " + e.getMessage(), e); + throw new IllegalArgumentException( + "XPath evaluation failed for target field [" + targetFieldName + "]: " + e.getMessage(), + e + ); } } } @@ -331,23 +334,23 @@ private void processXPathExpressionsFromDom(IngestDocument document, Document do * Compiles XPath expressions at processor creation time for optimal runtime performance. 
* This method pre-compiles all configured XPath expressions with appropriate namespace context, * eliminating the compilation overhead during document processing. - * + * * @param xpathExpressions map of XPath expressions to target field names * @param namespaces map of namespace prefixes to URIs * @return map of compiled XPath expressions keyed by target field name * @throws IllegalArgumentException if XPath compilation fails or namespace validation fails */ private static Map compileXPathExpressions( - Map xpathExpressions, + Map xpathExpressions, Map namespaces ) { if (xpathExpressions.isEmpty()) { return Map.of(); } - + Map compiled = new HashMap<>(); XPath xpath = XPATH_FACTORY.newXPath(); - + // Set namespace context if namespaces are defined boolean hasNamespaces = namespaces.isEmpty() == false; if (hasNamespaces) { @@ -382,32 +385,31 @@ public Iterator getPrefixes(String namespaceURI) { } }); } - + // Pre-compiled pattern to detect namespace prefixes - java.util.regex.Pattern namespacePattern = - java.util.regex.Pattern.compile(".*\\b[a-zA-Z][a-zA-Z0-9_-]*:[a-zA-Z][a-zA-Z0-9_-]*.*"); - + java.util.regex.Pattern namespacePattern = java.util.regex.Pattern.compile(".*\\b[a-zA-Z][a-zA-Z0-9_-]*:[a-zA-Z][a-zA-Z0-9_-]*.*"); + for (Map.Entry entry : xpathExpressions.entrySet()) { String xpathExpression = entry.getKey(); String targetFieldName = entry.getValue(); - + // Validate namespace prefixes if no namespaces are configured if (!hasNamespaces && namespacePattern.matcher(xpathExpression).matches()) { throw new IllegalArgumentException( - "Invalid XPath expression [" + xpathExpression + "]: contains namespace prefixes but no namespace configuration provided" + "Invalid XPath expression [" + + xpathExpression + + "]: contains namespace prefixes but no namespace configuration provided" ); } - + try { XPathExpression compiledExpression = xpath.compile(xpathExpression); compiled.put(targetFieldName, compiledExpression); } catch (XPathExpressionException e) { - throw 
new IllegalArgumentException( - "Invalid XPath expression [" + xpathExpression + "]: " + e.getMessage(), e - ); + throw new IllegalArgumentException("Invalid XPath expression [" + xpathExpression + "]: " + e.getMessage(), e); } } - + return Map.copyOf(compiled); } @@ -431,7 +433,7 @@ public XmlProcessor create( boolean removeNamespaces = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "remove_namespaces", false); boolean forceContent = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "force_content", false); boolean forceArray = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "force_array", false); - + // Parse XPath expressions map Map xpathExpressions = new HashMap<>(); Object xpathConfig = config.get("xpath"); @@ -444,7 +446,11 @@ public XmlProcessor create( xpathExpressions.put(entry.getKey(), (String) entry.getValue()); } else { throw new IllegalArgumentException( - "XPath target field [" + entry.getKey() + "] must be a string, got [" + entry.getValue().getClass().getSimpleName() + "]" + "XPath target field [" + + entry.getKey() + + "] must be a string, got [" + + entry.getValue().getClass().getSimpleName() + + "]" ); } } @@ -465,7 +471,11 @@ public XmlProcessor create( namespaces.put(entry.getKey(), (String) entry.getValue()); } else { throw new IllegalArgumentException( - "Namespace prefix [" + entry.getKey() + "] must have a string URI, got [" + entry.getValue().getClass().getSimpleName() + "]" + "Namespace prefix [" + + entry.getKey() + + "] must have a string URI, got [" + + entry.getValue().getClass().getSimpleName() + + "]" ); } } @@ -480,14 +490,30 @@ public XmlProcessor create( throw new IllegalArgumentException("Invalid parse_options [" + parseOptions + "]. 
Only 'strict' is supported."); } - return new XmlProcessor(processorTag, description, field, targetField, ignoreMissing, ignoreFailure, toLower, ignoreEmptyValue, storeXml, removeNamespaces, forceContent, forceArray, xpathExpressions, namespaces, parseOptions); + return new XmlProcessor( + processorTag, + description, + field, + targetField, + ignoreMissing, + ignoreFailure, + toLower, + ignoreEmptyValue, + storeXml, + removeNamespaces, + forceContent, + forceArray, + xpathExpressions, + namespaces, + parseOptions + ); } } /** * Main XML parsing method that converts XML to JSON and optionally extracts XPath values. * Uses streaming SAX parser with optional DOM building for XPath processing. - * + * * @param document the ingest document to modify with parsed results * @param xmlString the XML string to parse (should be trimmed) * @throws Exception if XML parsing fails @@ -499,12 +525,12 @@ private void parseXmlAndXPath(IngestDocument document, String xmlString) throws // Determine if we need DOM for XPath processing boolean needsDom = xpathExpressions.isEmpty() == false; - + // Use the appropriate pre-configured SAX parser factory javax.xml.parsers.SAXParserFactory factory = selectSaxParserFactory(); - + javax.xml.parsers.SAXParser parser = factory.newSAXParser(); - + // Configure error handler for strict mode if (isStrict()) { parser.getXMLReader().setErrorHandler(new org.xml.sax.ErrorHandler() { @@ -512,24 +538,24 @@ private void parseXmlAndXPath(IngestDocument document, String xmlString) throws public void warning(org.xml.sax.SAXParseException exception) throws org.xml.sax.SAXException { throw exception; } - + @Override public void error(org.xml.sax.SAXParseException exception) throws org.xml.sax.SAXException { throw exception; } - + @Override public void fatalError(org.xml.sax.SAXParseException exception) throws org.xml.sax.SAXException { throw exception; } }); } - + // Use enhanced handler that can build DOM during streaming when needed 
XmlStreamingWithDomHandler handler = new XmlStreamingWithDomHandler(needsDom); - + parser.parse(new java.io.ByteArrayInputStream(xmlString.getBytes("UTF-8")), handler); - + // Store structured result if needed if (storeXml) { Object streamingResult = handler.getStructuredResult(); @@ -537,7 +563,7 @@ public void fatalError(org.xml.sax.SAXParseException exception) throws org.xml.s document.setFieldValue(targetField, streamingResult); } } - + // Process XPath expressions if DOM was built during streaming if (needsDom) { Document domDocument = handler.getDomDocument(); @@ -558,12 +584,12 @@ private class XmlStreamingWithDomHandler extends org.xml.sax.helpers.DefaultHand private final java.util.Deque textStack = new java.util.ArrayDeque<>(); private final java.util.Deque>> repeatedElementsStack = new java.util.ArrayDeque<>(); private Object rootResult = null; - + // DOM building state (for XPath processing when needed) private final boolean buildDom; private Document domDocument = null; private final java.util.Deque domElementStack = new java.util.ArrayDeque<>(); - + public XmlStreamingWithDomHandler(boolean buildDom) { this.buildDom = buildDom; } @@ -585,29 +611,30 @@ public void startDocument() throws org.xml.sax.SAXException { } @Override - public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes attributes) throws org.xml.sax.SAXException { + public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes attributes) + throws org.xml.sax.SAXException { String elementName = getElementName(uri, localName, qName); - + // Build structured representation (always) Map element = new HashMap<>(); Map> repeatedElements = new HashMap<>(); - + // Process attributes for structured output for (int i = 0; i < attributes.getLength(); i++) { String attrName = getAttributeName(attributes.getURI(i), attributes.getLocalName(i), attributes.getQName(i)); String attrValue = attributes.getValue(i); - + // Apply 
ignoreEmptyValue filtering to attributes if (ignoreEmptyValue == false || isEmptyValue(attrValue) == false) { element.put(attrName, attrValue); } } - + elementStack.push(element); elementNameStack.push(elementName); textStack.push(new StringBuilder()); repeatedElementsStack.push(repeatedElements); - + // Build DOM element simultaneously if needed if (buildDom && domDocument != null) { org.w3c.dom.Element domElement; @@ -616,28 +643,28 @@ public void startElement(String uri, String localName, String qName, org.xml.sax } else { domElement = domDocument.createElement(removeNamespaces ? localName : qName); } - + // Add attributes to DOM element for (int i = 0; i < attributes.getLength(); i++) { String attrUri = attributes.getURI(i); String attrLocalName = attributes.getLocalName(i); String attrQName = attributes.getQName(i); String attrValue = attributes.getValue(i); - + if (attrUri != null && !attrUri.isEmpty() && !removeNamespaces) { domElement.setAttributeNS(attrUri, attrQName, attrValue); } else { domElement.setAttribute(removeNamespaces ? 
attrLocalName : attrQName, attrValue); } } - + // Add to parent or root if (domElementStack.isEmpty()) { domDocument.appendChild(domElement); } else { domElementStack.peek().appendChild(domElement); } - + domElementStack.push(domElement); } } @@ -648,7 +675,7 @@ public void characters(char[] ch, int start, int length) throws org.xml.sax.SAXE if (!textStack.isEmpty()) { textStack.peek().append(ch, start, length); } - + // Add to DOM text node if needed if (buildDom && !domElementStack.isEmpty()) { String text = new String(ch, start, length); @@ -665,12 +692,12 @@ public void endElement(String uri, String localName, String qName) throws org.xm if (elementStack.isEmpty()) { return; } - + Map element = elementStack.pop(); String elementName = elementNameStack.pop(); StringBuilder textContent = textStack.pop(); Map> repeatedElements = repeatedElementsStack.pop(); - + // Add repeated elements as arrays for (Map.Entry> entry : repeatedElements.entrySet()) { List values = entry.getValue(); @@ -678,12 +705,12 @@ public void endElement(String uri, String localName, String qName) throws org.xm element.put(entry.getKey(), values); } } - + // Process text content and determine final element structure String trimmedText = textContent.toString().trim(); boolean hasText = trimmedText.isEmpty() == false; boolean hasChildren = element.size() > 0; - + Object elementValue; if (hasText == false && hasChildren == false) { // Empty element @@ -718,7 +745,7 @@ public void endElement(String uri, String localName, String qName) throws org.xm } elementValue = (forceArray && forceContent) ? 
applyForceArray(elementName, element) : element; } - + // If this is the root element, store the result if (elementStack.isEmpty()) { if (elementValue != null) { @@ -731,7 +758,7 @@ public void endElement(String uri, String localName, String qName) throws org.xm if (elementValue != null) { Map parentElement = elementStack.peek(); Map> parentRepeatedElements = repeatedElementsStack.peek(); - + if (parentElement.containsKey(elementName) || parentRepeatedElements.containsKey(elementName)) { // Handle repeated elements if (parentRepeatedElements.containsKey(elementName) == false) { @@ -748,7 +775,7 @@ public void endElement(String uri, String localName, String qName) throws org.xm } } } - + // Complete DOM element if building DOM if (buildDom && !domElementStack.isEmpty()) { domElementStack.pop(); @@ -763,7 +790,7 @@ public void endDocument() throws org.xml.sax.SAXException { public Object getStructuredResult() { return rootResult; } - + public Document getDomDocument() { return domDocument; } @@ -775,12 +802,12 @@ private String getElementName(String uri, String localName, String qName) { } else { elementName = qName; } - + // Apply toLower if enabled if (toLower) { elementName = elementName.toLowerCase(Locale.ROOT); } - + return elementName; } @@ -791,16 +818,16 @@ private String getAttributeName(String uri, String localName, String qName) { } else { attrName = qName; } - + // Apply toLower if enabled if (toLower) { attrName = attrName.toLowerCase(Locale.ROOT); } - + return attrName; } } - + /** * Creates a secure, pre-configured SAX parser factory for XML parsing. * This factory is configured to prevent XXE attacks with SAX-specific features. 
@@ -808,7 +835,7 @@ private String getAttributeName(String uri, String localName, String qName) { private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactory() { javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance(); factory.setValidating(false); - + // Configure SAX-specific security features to prevent XXE attacks try { // SAX parser features - these are the correct features for SAXParserFactory @@ -819,10 +846,10 @@ private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactory() } catch (Exception e) { // If features cannot be set, continue with default settings } - + return factory; } - + /** * Creates a secure, pre-configured namespace-aware SAX parser factory for XML parsing. * This factory is configured to prevent XXE attacks and has namespace awareness enabled. @@ -831,7 +858,7 @@ private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryNa javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance(); factory.setValidating(false); factory.setNamespaceAware(true); - + // Configure SAX-specific security features to prevent XXE attacks try { // SAX parser features - these are the correct features for SAXParserFactory @@ -842,10 +869,10 @@ private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryNa } catch (Exception e) { // If features cannot be set, continue with default settings } - + return factory; } - + /** * Creates a secure, pre-configured SAX parser factory for strict XML parsing. * This factory is configured to prevent XXE attacks and has strict validation enabled. 
@@ -853,7 +880,7 @@ private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryNa private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryStrict() { javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance(); factory.setValidating(false); - + // Configure SAX-specific security features to prevent XXE attacks try { // SAX parser features - these are the correct features for SAXParserFactory @@ -861,16 +888,16 @@ private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactorySt factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); - + // Enable strict parsing features factory.setFeature("http://apache.org/xml/features/validation/check-full-element-content", true); } catch (Exception e) { // If features cannot be set, continue with default settings } - + return factory; } - + /** * Creates a secure, pre-configured namespace-aware SAX parser factory for strict XML parsing. * This factory is configured to prevent XXE attacks, has namespace awareness enabled, and strict validation. 
@@ -879,7 +906,7 @@ private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryNa javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance(); factory.setValidating(false); factory.setNamespaceAware(true); - + // Configure SAX-specific security features to prevent XXE attacks try { // SAX parser features - these are the correct features for SAXParserFactory @@ -887,16 +914,16 @@ private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryNa factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); - + // Enable strict parsing features factory.setFeature("http://apache.org/xml/features/validation/check-full-element-content", true); } catch (Exception e) { // If features cannot be set, continue with default settings } - + return factory; } - + /** * Creates a secure, pre-configured DocumentBuilderFactory for DOM creation. * Since we only use this factory to create empty DOM documents programmatically @@ -907,13 +934,13 @@ private static DocumentBuilderFactory createSecureDocumentBuilderFactory() { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setNamespaceAware(true); // Enable for maximum compatibility factory.setValidating(false); - + // No XXE security features needed - we only create empty documents, // never parse XML with this factory - + return factory; } - + /** * Selects the appropriate pre-configured SAX parser factory based on processor configuration. 
* @@ -922,7 +949,7 @@ private static DocumentBuilderFactory createSecureDocumentBuilderFactory() { * - Regular parsing, with namespaces: SAX_PARSER_FACTORY_NS * - Strict parsing, no namespaces: SAX_PARSER_FACTORY_STRICT * - Strict parsing, with namespaces: SAX_PARSER_FACTORY_NS_STRICT - * + * * @return the appropriate SAX parser factory for the current configuration */ private javax.xml.parsers.SAXParserFactory selectSaxParserFactory() { diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java index d0f4437074359..e693c39a5cab3 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java @@ -15,8 +15,8 @@ import java.util.HashMap; import java.util.Map; -import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.equalTo; public class XmlProcessorFactoryTests extends ESTestCase { @@ -68,8 +68,8 @@ private Map createConfigWithNamespaces(String fieldName, Map createConfigWithXPathAndNamespaces( - String fieldName, - Map xpathExpressions, + String fieldName, + Map xpathExpressions, Map namespaces ) { Map config = createBaseConfig(fieldName); @@ -100,7 +100,7 @@ private Map createXPathConfig(String... expressionsAndFields) { if (expressionsAndFields.length % 2 != 0) { throw new IllegalArgumentException("Must provide even number of arguments (expression, field, expression, field, ...)"); } - + Map xpathConfig = new HashMap<>(); for (int i = 0; i < expressionsAndFields.length; i += 2) { xpathConfig.put(expressionsAndFields[i], expressionsAndFields[i + 1]); @@ -115,7 +115,7 @@ private Map createNamespaceConfig(String... 
prefixesAndUris) { if (prefixesAndUris.length % 2 != 0) { throw new IllegalArgumentException("Must provide even number of arguments (prefix, uri, prefix, uri, ...)"); } - + Map namespaceConfig = new HashMap<>(); for (int i = 0; i < prefixesAndUris.length; i += 2) { namespaceConfig.put(prefixesAndUris[i], prefixesAndUris[i + 1]); @@ -128,7 +128,7 @@ private Map createNamespaceConfig(String... prefixesAndUris) { */ private Map createConfigWithOptions(String fieldName, String... options) { Map config = createBaseConfig(fieldName); - + for (String option : options) { switch (option) { case "ignore_missing": @@ -162,7 +162,7 @@ private Map createConfigWithOptions(String fieldName, String... throw new IllegalArgumentException("Unknown option: " + option); } } - + return config; } @@ -172,11 +172,8 @@ private Map createConfigWithOptions(String fieldName, String... private void expectCreationFailure(Map config, Class exceptionClass, String expectedMessage) { XmlProcessor.Factory factory = createFactory(); String processorTag = randomAlphaOfLength(10); - - Exception exception = expectThrows( - exceptionClass, - () -> factory.create(null, processorTag, null, config, null) - ); + + Exception exception = expectThrows(exceptionClass, () -> factory.create(null, processorTag, null, config, null)); assertThat(exception.getMessage(), equalTo(expectedMessage)); } @@ -226,10 +223,7 @@ public void testCreateWithIgnoreEmptyValueOnly() throws Exception { } public void testCreateWithXPath() throws Exception { - Map xpathConfig = createXPathConfig( - "//author/text()", "author_field", - "//title/@lang", "language_field" - ); + Map xpathConfig = createXPathConfig("//author/text()", "author_field", "//title/@lang", "language_field"); Map config = createConfigWithXPath(DEFAULT_FIELD, xpathConfig); XmlProcessor processor = createProcessor(config); @@ -240,24 +234,30 @@ public void testCreateWithXPath() throws Exception { public void testCreateWithInvalidXPathConfig() throws Exception { Map 
config = createBaseConfig(); config.put("xpath", "invalid_string"); // Should be a map - + expectCreationFailure(config, IllegalArgumentException.class, "XPath configuration must be a map of expressions to target fields"); } public void testCreateWithInvalidXPathTargetField() throws Exception { Map config = createBaseConfig(); - + Map xpathConfig = new HashMap<>(); xpathConfig.put("//author/text()", 123); // Should be string config.put("xpath", xpathConfig); - expectCreationFailure(config, IllegalArgumentException.class, "XPath target field [//author/text()] must be a string, got [Integer]"); + expectCreationFailure( + config, + IllegalArgumentException.class, + "XPath target field [//author/text()] must be a string, got [Integer]" + ); } public void testCreateWithNamespaces() throws Exception { Map namespacesConfig = createNamespaceConfig( - "book", "http://example.com/book", - "author", "http://example.com/author" + "book", + "http://example.com/book", + "author", + "http://example.com/author" ); Map config = createConfigWithNamespaces(DEFAULT_FIELD, namespacesConfig); @@ -276,7 +276,7 @@ public void testCreateWithInvalidNamespacesConfig() throws Exception { public void testCreateWithInvalidNamespaceURI() throws Exception { Map config = createBaseConfig(); - + Map namespacesConfig = new HashMap<>(); namespacesConfig.put("book", 123); // Should be string config.put("namespaces", namespacesConfig); @@ -285,13 +285,8 @@ public void testCreateWithInvalidNamespaceURI() throws Exception { } public void testCreateWithXPathAndNamespaces() throws Exception { - Map xpathConfig = createXPathConfig( - "//book:author/text()", "author_field", - "//book:title/@lang", "language_field" - ); - Map namespacesConfig = createNamespaceConfig( - "book", "http://example.com/book" - ); + Map xpathConfig = createXPathConfig("//book:author/text()", "author_field", "//book:title/@lang", "language_field"); + Map namespacesConfig = createNamespaceConfig("book", "http://example.com/book"); Map 
config = createConfigWithXPathAndNamespaces(DEFAULT_FIELD, xpathConfig, namespacesConfig); XmlProcessor processor = createProcessor(config); @@ -301,7 +296,7 @@ public void testCreateWithXPathAndNamespaces() throws Exception { } // Tests for individual boolean options - + public void testCreateWithStoreXmlFalse() throws Exception { Map config = createConfigWithOptions(DEFAULT_FIELD, "store_xml"); XmlProcessor processor = createProcessor(config); @@ -344,8 +339,13 @@ public void testCreateWithStrictParseOptions() throws Exception { } public void testCreateWithMultipleOptions() throws Exception { - Map config = createConfigWithOptions(DEFAULT_FIELD, - "ignore_missing", "force_content", "force_array", "remove_namespaces"); + Map config = createConfigWithOptions( + DEFAULT_FIELD, + "ignore_missing", + "force_content", + "force_array", + "remove_namespaces" + ); XmlProcessor processor = createProcessor(config); assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); @@ -356,28 +356,32 @@ public void testCreateWithMultipleOptions() throws Exception { } // Tests for invalid parse options - + public void testCreateWithInvalidParseOptions() throws Exception { Map config = createBaseConfig(); config.put("parse_options", "invalid_option"); - expectCreationFailure(config, IllegalArgumentException.class, "Invalid parse_options [invalid_option]. Only 'strict' is supported."); + expectCreationFailure( + config, + IllegalArgumentException.class, + "Invalid parse_options [invalid_option]. Only 'strict' is supported." 
+ ); } // Tests for XPath compilation errors (testing precompilation feature) - + public void testCreateWithInvalidXPathExpression() throws Exception { Map xpathConfig = createXPathConfig("invalid xpath ][", "target_field"); Map config = createConfigWithXPath(DEFAULT_FIELD, xpathConfig); XmlProcessor.Factory factory = createFactory(); String processorTag = randomAlphaOfLength(10); - + IllegalArgumentException exception = expectThrows( IllegalArgumentException.class, () -> factory.create(null, processorTag, null, config, null) ); - + // Check that the error message contains the XPath expression and indicates it's invalid assertThat(exception.getMessage(), containsString("Invalid XPath expression [invalid xpath ][]:")); assertThat(exception.getMessage(), containsString("javax.xml.transform.TransformerException")); @@ -387,6 +391,10 @@ public void testCreateWithXPathUsingNamespacesWithoutConfiguration() throws Exce Map xpathConfig = createXPathConfig("//book:title/text()", "title_field"); Map config = createConfigWithXPath(DEFAULT_FIELD, xpathConfig); - expectCreationFailure(config, IllegalArgumentException.class, "Invalid XPath expression [//book:title/text()]: contains namespace prefixes but no namespace configuration provided"); + expectCreationFailure( + config, + IllegalArgumentException.class, + "Invalid XPath expression [//book:title/text()]: contains namespace prefixes but no namespace configuration provided" + ); } } diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java index e48c6de593f84..f2c1aec1031d7 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java @@ -11,11 +11,12 @@ import org.elasticsearch.ingest.IngestDocument; import org.elasticsearch.test.ESTestCase; -import 
static org.hamcrest.Matchers.equalTo; import java.util.HashMap; -import java.util.Map; import java.util.List; +import java.util.Map; + +import static org.hamcrest.Matchers.equalTo; /** * Tests for {@link XmlProcessor}. These tests ensure feature parity and test coverage. @@ -29,7 +30,7 @@ public class XmlProcessorTests extends ESTestCase { private static IngestDocument createTestIngestDocument(String xml) { return new IngestDocument("_index", "_id", 1, null, null, new HashMap<>(Map.of(XML_FIELD, xml))); } - + private static XmlProcessor createTestProcessor(Map config) { config.putIfAbsent("field", XML_FIELD); config.putIfAbsent("target_field", TARGET_FIELD); @@ -37,7 +38,7 @@ private static XmlProcessor createTestProcessor(Map config) { XmlProcessor.Factory factory = new XmlProcessor.Factory(); try { return factory.create(null, "_tag", null, config, null); - } catch (Exception e){ + } catch (Exception e) { fail("Failed to create XmlProcessor: " + e.getMessage()); return null; // This line will never be reached, but is needed to satisfy the compiler } @@ -54,55 +55,55 @@ public void testParseStandardXml() { IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); Map foo = (Map) data.get("foo"); assertThat(foo.get("key"), equalTo("value")); } - + /** * Test parsing XML with array elements (multiple elements with same name). 
*/ public void testParseXmlWithArrayValue() { String xml = "value1value2"; - + Map config = new HashMap<>(); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); Map foo = (Map) data.get("foo"); List keyValues = (List) foo.get("key"); assertThat(keyValues.size(), equalTo(2)); - + // The values might be nested inside their own lists Object firstValue = keyValues.get(0); assertThat(firstValue, equalTo("value1")); - + Object secondValue = keyValues.get(1); assertThat(secondValue, equalTo("value2")); } - + /** * Test parsing XML with nested elements. */ public void testParseXmlWithNestedElements() { String xml = "value"; - + Map config = new HashMap<>(); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); Map foo = (Map) data.get("foo"); Map key1Map = (Map) foo.get("key1"); assertThat(key1Map.size(), equalTo(1)); - + String key2Value = (String) key1Map.get("key2"); assertThat(key2Value, equalTo("value")); } @@ -112,13 +113,13 @@ public void testParseXmlWithNestedElements() { */ public void testParseXmlInSingleItemArray() { String xml = ""; - + Map config = new HashMap<>(); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); Map foo = (Map) data.get("foo"); assertThat(foo.get("bar"), equalTo("baz")); @@ -129,21 +130,21 @@ public void testParseXmlInSingleItemArray() { */ public void testXPathSingleElementExtraction() { String xml = "helloworld"; - + Map xpathMap = Map.of("/foo/bar/text()", "bar_content"); - + Map config = new HashMap<>(); 
config.put("xpath", xpathMap); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + // Get the XPath result Object barContent = ingestDocument.getFieldValue("bar_content", Object.class); assertNotNull(barContent); assertEquals("hello", barContent); - + // Verify that the full parsed XML is also available Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); Map foo = (Map) data.get("foo"); @@ -157,18 +158,18 @@ public void testXPathSingleElementExtraction() { */ public void testXPathMultipleElementsExtraction() { String xml = "firstsecondthird"; - + Map xpathMap = Map.of("/foo/bar", "all_bars"); - + Map config = new HashMap<>(); config.put("xpath", xpathMap); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + List allBars = ingestDocument.getFieldValue("all_bars", List.class); - + assertNotNull(allBars); assertThat(allBars.size(), equalTo(3)); assertThat(allBars.get(0), equalTo("first")); @@ -181,22 +182,22 @@ public void testXPathMultipleElementsExtraction() { */ public void testXPathAttributeExtraction() { String xml = "content"; - + Map xpathMap = new HashMap<>(); xpathMap.put("/foo/bar/@id", "bar_id"); xpathMap.put("/foo/bar/@type", "bar_type"); - + Map config = new HashMap<>(); config.put("xpath", xpathMap); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + String barId = ingestDocument.getFieldValue("bar_id", String.class); assertNotNull(barId); assertThat(barId, equalTo("123")); - + String barType = ingestDocument.getFieldValue("bar_type", String.class); assertNotNull(barType); assertThat(barType, equalTo("test")); @@ -206,15 +207,15 @@ public void testXPathAttributeExtraction() { * Test extracting elements with namespaces using 
XPath. */ public void testXPathNamespacedExtraction() { - String xml = "" + - "" + - " namespace-value" + - " regular-value" + - ""; - + String xml = "" + + "" + + " namespace-value" + + " regular-value" + + ""; + Map namespaces = Map.of("myns", "http://example.org/ns1"); Map xpathMap = Map.of("//myns:element/text()", "ns_value"); - + Map config = new HashMap<>(); config.put("xpath", xpathMap); config.put("namespaces", namespaces); @@ -222,7 +223,7 @@ public void testXPathNamespacedExtraction() { IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + String nsValue = ingestDocument.getFieldValue("ns_value", String.class); assertNotNull(nsValue); assertThat(nsValue, equalTo("namespace-value")); @@ -233,61 +234,61 @@ public void testXPathNamespacedExtraction() { */ public void testParseXmlWithMixedContent() { String xml = "This text is bold and this is italic!"; - + Map config = new HashMap<>(); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); Map foo = (Map) data.get("foo"); - + assertNotNull(foo.get("b")); - assertThat((String)foo.get("b"), equalTo("bold")); + assertThat((String) foo.get("b"), equalTo("bold")); assertNotNull(foo.get("i")); - assertThat((String)foo.get("i"), equalTo("italic")); + assertThat((String) foo.get("i"), equalTo("italic")); assertNotNull(foo.get("#text")); - assertThat((String)foo.get("#text"), equalTo("This text is and this is !")); + assertThat((String) foo.get("#text"), equalTo("This text is and this is !")); } - + /** * Test parsing XML with CDATA sections. 
*/ public void testParseXmlWithCDATA() { String xml = " that shouldn't be parsed!]]>"; - + Map config = new HashMap<>(); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); Object content = data.get("foo"); - + assertNotNull(content); assertThat(content, equalTo("This is CDATA content with that shouldn't be parsed!")); } - + /** * Test parsing XML with numeric data. */ public void testParseXmlWithNumericData() { String xml = "12399.95true"; - + Map config = new HashMap<>(); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); Map foo = (Map) data.get("foo"); - - assertThat((String)foo.get("count"), equalTo("123")); - assertThat((String)foo.get("price"), equalTo("99.95")); - assertThat((String)foo.get("active"), equalTo("true")); + + assertThat((String) foo.get("count"), equalTo("123")); + assertThat((String) foo.get("price"), equalTo("99.95")); + assertThat((String) foo.get("active"), equalTo("true")); } /** @@ -295,17 +296,17 @@ public void testParseXmlWithNumericData() { */ public void testParseXmlWithForceArray() { String xml = "single_value"; - + Map config = new HashMap<>(); config.put("force_array", true); // Enable force_array option XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); Map foo = (Map) data.get("foo"); - + // With force_array=true, even single values should be in arrays Object barValue = foo.get("bar"); assertNotNull(barValue); @@ -321,205 +322,204 @@ public void testParseXmlWithForceArray() { * Tests that multiple 
XPath expressions can be used simultaneously. */ public void testMultipleXPathExpressions() { - String xml = "" + - " John30" + - " Jane25" + - ""; - + String xml = "" + + " John30" + + " Jane25" + + ""; + // Configure multiple XPath expressions Map xpathMap = new HashMap<>(); xpathMap.put("/root/person[1]/n/text()", "first_person_name"); xpathMap.put("/root/person[2]/n/text()", "second_person_name"); xpathMap.put("/root/person/@id", "person_ids"); - + Map config = new HashMap<>(); config.put("xpath", xpathMap); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + assertTrue("first_person_name field should exist", ingestDocument.hasField("first_person_name")); assertTrue("second_person_name field should exist", ingestDocument.hasField("second_person_name")); assertTrue("person_ids field should exist", ingestDocument.hasField("person_ids")); - + Object firstName = ingestDocument.getFieldValue("first_person_name", Object.class); assertEquals("John", firstName); - + Object secondName = ingestDocument.getFieldValue("second_person_name", Object.class); assertEquals("Jane", secondName); - + Object personIdsObj = ingestDocument.getFieldValue("person_ids", Object.class); assertTrue("person_ids should be a List", personIdsObj instanceof List); List personIds = (List) personIdsObj; assertEquals("Should have 2 person IDs", 2, personIds.size()); assertEquals("First person ID should be '1'", "1", personIds.get(0)); assertEquals("Second person ID should be '2'", "2", personIds.get(1)); - + assertTrue("Target field should exist", ingestDocument.hasField(TARGET_FIELD)); } - + /** * Test handling of invalid XML with ignoreFailure=false. 
*/ public void testInvalidXml() { String xml = ""; // Invalid XML missing closing tag - + Map config = new HashMap<>(); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); - - IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> { - processor.execute(ingestDocument); - }); - - assertTrue("Error message should indicate XML is invalid", - exception.getMessage().contains("invalid XML") || - exception.getCause().getMessage().contains("XML")); + + IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> { processor.execute(ingestDocument); }); + + assertTrue( + "Error message should indicate XML is invalid", + exception.getMessage().contains("invalid XML") || exception.getCause().getMessage().contains("XML") + ); } - + /** * Test handling of invalid XML with ignoreFailure=true. */ public void testInvalidXmlWithIgnoreFailure() { String xml = ""; // Invalid XML missing closing tag - + Map config = new HashMap<>(); config.put("ignore_failure", true); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); - + processor.execute(ingestDocument); - + List tags = ingestDocument.getFieldValue("tags", List.class); assertNotNull(tags); assertTrue(tags.contains("_xmlparsefailure")); } - + /** * Test the store_xml=false option to not store parsed XML in target field. 
*/ public void testNoStoreXml() { String xml = "value"; - + // Set up XPath to extract value but don't store XML Map xpathMap = Map.of("/foo/bar/text()", "bar_content"); - + Map config = new HashMap<>(); config.put("store_xml", false); // Do not store XML in target field - config.put("xpath", xpathMap); + config.put("xpath", xpathMap); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); - + processor.execute(ingestDocument); - + // Verify XPath result is stored String barContent = ingestDocument.getFieldValue("bar_content", String.class); assertNotNull(barContent); assertThat(barContent, equalTo("value")); - + // Verify the target field was not created assertFalse(ingestDocument.hasField(TARGET_FIELD)); } - + /** * Test the to_lower option for converting field names to lowercase. */ public void testToLower() { String xml = "value"; - + Map config = new HashMap<>(); config.put("to_lower", true); // Enable to_lower option XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); - + processor.execute(ingestDocument); - + // Verify field names are lowercase Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); assertTrue(data.containsKey("foo")); assertFalse(data.containsKey("FOO")); - + Map foo = (Map) data.get("foo"); assertTrue(foo.containsKey("bar")); assertFalse(foo.containsKey("BAR")); assertThat(foo.get("bar"), equalTo("value")); } - + /** * Test the ignore_missing option when field is missing. 
*/ public void testIgnoreMissing() { String xmlField = "nonexistent_field"; - + Map config = new HashMap<>(); config.put("field", xmlField); config.put("ignore_missing", true); // Enable ignore_missing option XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = new IngestDocument("_index", "_id", 1, null, null, new HashMap<>(Map.of())); processor.execute(ingestDocument); - - assertFalse("Target field should not be created when source field is missing", - ingestDocument.hasField(TARGET_FIELD)); - + + assertFalse("Target field should not be created when source field is missing", ingestDocument.hasField(TARGET_FIELD)); + // With ignoreMissing=false config.put("ignore_missing", false); XmlProcessor failingProcessor = createTestProcessor(config); - + // This should throw an exception - IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> { - failingProcessor.execute(ingestDocument); - }); - + IllegalArgumentException exception = expectThrows( + IllegalArgumentException.class, + () -> { failingProcessor.execute(ingestDocument); } + ); + assertTrue(exception.getMessage().contains("not present as part of path")); } - + /** * Test that ignore_empty_value correctly filters out empty values from arrays and mixed content. 
*/ public void testIgnoreEmptyValue() { // XML with mixed empty and non-empty elements, including array elements with mixed empty/non-empty values - String xml = "" + - " " + - " " + - " content" + - " nested-content" + - " " + - " first" + - " " + - " third" + - " " + - " fifth" + - " " + - " Text with and content" + - ""; + String xml = "" + + " " + + " " + + " content" + + " nested-content" + + " " + + " first" + + " " + + " third" + + " " + + " fifth" + + " " + + " Text with and content" + + ""; Map config = new HashMap<>(); config.put("ignore_empty_value", true); XmlProcessor processor = createTestProcessor(config); - + IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + Map result = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); Map root = (Map) result.get("root"); - + // Check empty elements are filtered assertFalse("Empty element should be filtered out", root.containsKey("empty")); assertFalse("Blank element should be filtered out", root.containsKey("blank")); - + // Check valid elements are preserved assertTrue("Valid element should be preserved", root.containsKey("valid")); assertEquals("content", root.get("valid")); - + // Check nested structure filtering Map nested = (Map) root.get("nested"); assertNotNull("Nested element should be preserved", nested); assertFalse("Empty nested element should be filtered", nested.containsKey("empty")); assertEquals("nested-content", nested.get("valid")); - + // Check array with mixed empty/non-empty values - Map items = (Map) root.get("items"); + Map items = (Map) root.get("items"); assertNotNull("Items object should be preserved", items); List itemList = (List) items.get("item"); assertNotNull("Item array should be preserved", itemList); @@ -527,7 +527,7 @@ public void testIgnoreEmptyValue() { assertEquals("first", itemList.get(0)); assertEquals("third", itemList.get(1)); assertEquals("fifth", itemList.get(2)); - + // Check mixed content handling Map mixed = 
(Map) root.get("mixed"); assertNotNull("Mixed content should be preserved", mixed); @@ -536,66 +536,65 @@ public void testIgnoreEmptyValue() { assertEquals("content", mixed.get("valid")); assertEquals("Text with and", mixed.get("#text")); } - + /** * Test parsing with strict mode option. */ public void testStrictParsing() { String xml = "valid"; - + Map config = new HashMap<>(); config.put("parse_options", "strict"); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); - + processor.execute(ingestDocument); - + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); Map foo = (Map) data.get("foo"); assertThat(foo.get("bar"), equalTo("valid")); - + // Test with invalid XML in strict mode String invalidXml = ""; IngestDocument invalidDocument = createTestIngestDocument(invalidXml); - - IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> { - processor.execute(invalidDocument); - }); - - assertTrue("Error message should indicate XML is invalid", - exception.getMessage().contains("invalid XML") || - exception.getCause().getMessage().contains("XML")); + + IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> { processor.execute(invalidDocument); }); + + assertTrue( + "Error message should indicate XML is invalid", + exception.getMessage().contains("invalid XML") || exception.getCause().getMessage().contains("XML") + ); } - + /** * Test parsing XML with remove_namespaces option. 
*/ public void testRemoveNamespaces() { String xml = "value"; - + Map config = new HashMap<>(); config.put("remove_namespaces", true); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); - + processor.execute(ingestDocument); - + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); Map foo = (Map) data.get("foo"); - + assertTrue("Element with namespace should be present", foo.containsKey("ns:bar")); assertThat(foo.get("ns:bar"), equalTo("value")); - + // Now test with removeNamespaces=false IngestDocument ingestDocument2 = createTestIngestDocument(xml); - + config.put("remove_namespaces", false); XmlProcessor processor2 = createTestProcessor(config); processor2.execute(ingestDocument2); - + Map data2 = ingestDocument2.getFieldValue(TARGET_FIELD, Map.class); Map foo2 = (Map) data2.get("foo"); - + // With removeNamespaces=false, the "ns:" prefix should be preserved assertTrue("Element should be accessible with namespace prefix", foo2.containsKey("ns:bar")); assertThat(foo2.get("ns:bar"), equalTo("value")); @@ -606,30 +605,30 @@ public void testRemoveNamespaces() { */ public void testForceContent() { String xml = "simple text"; - + Map config = new HashMap<>(); config.put("force_content", true); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); - + processor.execute(ingestDocument); - + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); Map foo = (Map) data.get("foo"); - + // With forceContent=true, the text should be in a #text field assertTrue("Text content should be in #text field", foo.containsKey("#text")); assertThat(foo.get("#text"), equalTo("simple text")); - + // Now test with forceContent=false config.put("force_content", false); XmlProcessor processor2 = createTestProcessor(config); IngestDocument ingestDocument2 = createTestIngestDocument(xml); - + processor2.execute(ingestDocument2); - + Map data2 
= ingestDocument2.getFieldValue(TARGET_FIELD, Map.class); - + // With forceContent=false, the text should be directly assigned to the element assertThat(data2.get("foo"), equalTo("simple text")); } From 55ebcf96b07ebcdecb90e9ebe3552e72e6fdbafc Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Fri, 4 Jul 2025 16:22:58 +0200 Subject: [PATCH 08/54] fix: implement Copilot PR review suggestions - Fix test assertion for remove_namespaces feature - Use StandardCharsets.UTF_8 instead of string literal - Replace string reference comparison with isEmpty() - Move regex pattern to static final field for performance --- .../ingest/common/XmlProcessor.java | 20 +++++++++++-------- .../ingest/common/XmlProcessorTests.java | 4 ++-- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 582b8d9f3e787..bfbbe05334714 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -15,12 +15,14 @@ import org.elasticsearch.ingest.IngestDocument; import org.elasticsearch.ingest.Processor; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.regex.Pattern; import javax.xml.namespace.NamespaceContext; import javax.xml.parsers.DocumentBuilder; @@ -52,6 +54,9 @@ public final class XmlProcessor extends AbstractProcessor { private static final XPathFactory XPATH_FACTORY = XPathFactory.newInstance(); + // Pre-compiled pattern to detect namespace prefixes + private static final Pattern NAMESPACE_PATTERN = Pattern.compile(".*\\b[a-zA-Z][a-zA-Z0-9_-]*:[a-zA-Z][a-zA-Z0-9_-]*.*"); + // Pre-configured SAX parser factories for 
secure XML parsing private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY = createSecureSaxParserFactory(); private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY_NS = createSecureSaxParserFactoryNamespaceAware(); @@ -383,16 +388,14 @@ public Iterator getPrefixes(String namespaceURI) { }); } - // Pre-compiled pattern to detect namespace prefixes - java.util.regex.Pattern namespacePattern = - java.util.regex.Pattern.compile(".*\\b[a-zA-Z][a-zA-Z0-9_-]*:[a-zA-Z][a-zA-Z0-9_-]*.*"); + // Use pre-compiled pattern to detect namespace prefixes for (Map.Entry entry : xpathExpressions.entrySet()) { String xpathExpression = entry.getKey(); String targetFieldName = entry.getValue(); // Validate namespace prefixes if no namespaces are configured - if (!hasNamespaces && namespacePattern.matcher(xpathExpression).matches()) { + if (!hasNamespaces && NAMESPACE_PATTERN.matcher(xpathExpression).matches()) { throw new IllegalArgumentException( "Invalid XPath expression [" + xpathExpression + "]: contains namespace prefixes but no namespace configuration provided" ); @@ -476,7 +479,7 @@ public XmlProcessor create( // Parse parse_options parameter String parseOptions = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "parse_options", ""); - if (parseOptions != null && parseOptions != "" && !"strict".equals(parseOptions)) { + if (parseOptions != null && !parseOptions.isEmpty() && !"strict".equals(parseOptions)) { throw new IllegalArgumentException("Invalid parse_options [" + parseOptions + "]. 
Only 'strict' is supported."); } @@ -528,7 +531,7 @@ public void fatalError(org.xml.sax.SAXParseException exception) throws org.xml.s // Use enhanced handler that can build DOM during streaming when needed XmlStreamingWithDomHandler handler = new XmlStreamingWithDomHandler(needsDom); - parser.parse(new java.io.ByteArrayInputStream(xmlString.getBytes("UTF-8")), handler); + parser.parse(new java.io.ByteArrayInputStream(xmlString.getBytes(StandardCharsets.UTF_8)), handler); // Store structured result if needed if (storeXml) { @@ -926,10 +929,11 @@ private static DocumentBuilderFactory createSecureDocumentBuilderFactory() { * @return the appropriate SAX parser factory for the current configuration */ private javax.xml.parsers.SAXParserFactory selectSaxParserFactory() { + boolean needsNamespaceAware = hasNamespaces() || removeNamespaces; if (isStrict()) { - return hasNamespaces() ? SAX_PARSER_FACTORY_NS_STRICT : SAX_PARSER_FACTORY_STRICT; + return needsNamespaceAware ? SAX_PARSER_FACTORY_NS_STRICT : SAX_PARSER_FACTORY_STRICT; } else { - return hasNamespaces() ? SAX_PARSER_FACTORY_NS : SAX_PARSER_FACTORY; + return needsNamespaceAware ? 
SAX_PARSER_FACTORY_NS : SAX_PARSER_FACTORY; } } } diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java index e48c6de593f84..3df19ddf06269 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java @@ -583,8 +583,8 @@ public void testRemoveNamespaces() { Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); Map foo = (Map) data.get("foo"); - assertTrue("Element with namespace should be present", foo.containsKey("ns:bar")); - assertThat(foo.get("ns:bar"), equalTo("value")); + assertTrue("Element without namespace should be present", foo.containsKey("bar")); + assertThat(foo.get("bar"), equalTo("value")); // Now test with removeNamespaces=false IngestDocument ingestDocument2 = createTestIngestDocument(xml); From 6e531b2a1e13f8561b3463031bd3e71d105d7f0e Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Fri, 4 Jul 2025 14:35:48 +0000 Subject: [PATCH 09/54] [CI] Auto commit changes from spotless --- .../elasticsearch/ingest/common/XmlProcessor.java | 12 ++++++------ .../ingest/common/XmlProcessorTests.java | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index c8b9e9a723412..3c2fac023fa20 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -52,10 +52,10 @@ public final class XmlProcessor extends AbstractProcessor { public static final String TYPE = "xml"; private static final XPathFactory XPATH_FACTORY = XPathFactory.newInstance(); - + // 
Pre-compiled pattern to detect namespace prefixes private static final Pattern NAMESPACE_PATTERN = Pattern.compile(".*\\b[a-zA-Z][a-zA-Z0-9_-]*:[a-zA-Z][a-zA-Z0-9_-]*.*"); - + // Pre-configured SAX parser factories for secure XML parsing private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY = createSecureSaxParserFactory(); private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY_NS = createSecureSaxParserFactoryNamespaceAware(); @@ -390,9 +390,9 @@ public Iterator getPrefixes(String namespaceURI) { } }); } - + // Use pre-compiled pattern to detect namespace prefixes - + for (Map.Entry entry : xpathExpressions.entrySet()) { String xpathExpression = entry.getKey(); String targetFieldName = entry.getValue(); @@ -557,9 +557,9 @@ public void fatalError(org.xml.sax.SAXParseException exception) throws org.xml.s // Use enhanced handler that can build DOM during streaming when needed XmlStreamingWithDomHandler handler = new XmlStreamingWithDomHandler(needsDom); - + parser.parse(new java.io.ByteArrayInputStream(xmlString.getBytes(StandardCharsets.UTF_8)), handler); - + // Store structured result if needed if (storeXml) { Object streamingResult = handler.getStructuredResult(); diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java index a02db6c43e59d..c01c086be1eb5 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java @@ -581,10 +581,10 @@ public void testRemoveNamespaces() { Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); Map foo = (Map) data.get("foo"); - + assertTrue("Element without namespace should be present", foo.containsKey("bar")); assertThat(foo.get("bar"), equalTo("value")); - + // Now test with removeNamespaces=false 
IngestDocument ingestDocument2 = createTestIngestDocument(xml); From 19c01cb2c4c8175d308f435af959767ef6c40e4f Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Mon, 18 Aug 2025 12:22:05 +0200 Subject: [PATCH 10/54] Sort processor reference --- docs/reference/enrich-processor/index.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/reference/enrich-processor/index.md b/docs/reference/enrich-processor/index.md index fb2cac99ee355..5779fa6132f66 100644 --- a/docs/reference/enrich-processor/index.md +++ b/docs/reference/enrich-processor/index.md @@ -159,12 +159,12 @@ Refer to [Enrich your data](docs-content://manage-data/ingest/transform-enrich/d [`split` processor](/reference/enrich-processor/split-processor.md) : Splits a field into an array of values. -[`xml` processor](/reference/enrich-processor/xml-processor.md) -: Parses XML documents and converts them to JSON objects. - [`trim` processor](/reference/enrich-processor/trim-processor.md) : Trims whitespace from field. +[`xml` processor](/reference/enrich-processor/xml-processor.md) +: Parses XML documents and converts them to JSON objects. 
+ ## Data filtering processors [ingest-process-category-data-filtering] From 04dc049676032e8359e4ce6b947116a52d8a00eb Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Mon, 18 Aug 2025 12:23:42 +0200 Subject: [PATCH 11/54] Better doc for force_content --- .../enrich-processor/xml-processor.md | 68 +++++++++++++++++-- 1 file changed, 62 insertions(+), 6 deletions(-) diff --git a/docs/reference/enrich-processor/xml-processor.md b/docs/reference/enrich-processor/xml-processor.md index 7452fea12ed65..c4c8367eded44 100644 --- a/docs/reference/enrich-processor/xml-processor.md +++ b/docs/reference/enrich-processor/xml-processor.md @@ -516,7 +516,58 @@ Result: ### Force content mode -When `force_content` is `true`, all element text content is stored under the special `#text` key: +When `force_content` is `true`, all element text content is stored under the special `#text` key, even for simple elements without attributes. This provides a consistent structure when elements may have varying complexity. + +**Without force_content (default behavior):** + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "force_content": false + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "The RecognitionsWilliam H. Gaddis" + } + } + ] +} +``` + +Result (simple elements as string values, complex elements with #text): + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "The RecognitionsWilliam H. Gaddis", + "book": { + "title": "The Recognitions", + "author": { + "nationality": "American", + "#text": "William H. Gaddis" + } + } + } + } + } + ] +} +``` + +**With force_content enabled:** ```console POST _ingest/pipeline/_simulate @@ -534,14 +585,14 @@ POST _ingest/pipeline/_simulate "docs": [ { "_source": { - "xml_content": "The Recognitions" + "xml_content": "The RecognitionsWilliam H. 
Gaddis" } } ] } ``` -Result: +Result (all text content under #text key): ```console-result { @@ -550,10 +601,15 @@ Result: "doc": { ... "_source": { - "xml_content": "The Recognitions", + "xml_content": "The RecognitionsWilliam H. Gaddis", "book": { - "author": "William H. Gaddis", - "#text": "The Recognitions" + "title": { + "#text": "The Recognitions" + }, + "author": { + "nationality": "American", + "#text": "William H. Gaddis" + } } } } From 21006dd5eb4f02e90bc333e5fb7d0474ac09fb5d Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Mon, 18 Aug 2025 12:28:43 +0200 Subject: [PATCH 12/54] Better changelog entry --- docs/changelog/130337.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/changelog/130337.yaml b/docs/changelog/130337.yaml index 2ea20ebd1944e..710c48f6367c7 100644 --- a/docs/changelog/130337.yaml +++ b/docs/changelog/130337.yaml @@ -1,5 +1,5 @@ pr: 130337 -summary: Add `XmlProcessor` initial implementation +summary: Add `xml` ingest processor for parsing XML area: Ingest Node type: enhancement issues: From 3de7971e764c43eb04a86c5b922987186269625b Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Mon, 18 Aug 2025 12:39:29 +0200 Subject: [PATCH 13/54] Rename ignore_empty_value to remove_empty_values --- .../enrich-processor/xml-processor.md | 10 +++--- .../ingest/common/XmlProcessor.java | 34 +++++++++---------- .../common/XmlProcessorFactoryTests.java | 16 ++++----- .../ingest/common/XmlProcessorTests.java | 6 ++-- 4 files changed, 33 insertions(+), 33 deletions(-) diff --git a/docs/reference/enrich-processor/xml-processor.md b/docs/reference/enrich-processor/xml-processor.md index c4c8367eded44..73e9d506993db 100644 --- a/docs/reference/enrich-processor/xml-processor.md +++ b/docs/reference/enrich-processor/xml-processor.md @@ -18,7 +18,7 @@ $$$xml-options$$$ | `ignore_missing` | no | `false` | If `true` and `field` does not exist, the processor quietly exits without modifying the document. 
| | `ignore_failure` | no | `false` | Ignore failures for the processor. When `true` and XML parsing fails, adds `_xmlparsefailure` tag to the document. See [Handling pipeline failures](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#handling-pipeline-failures). | | `to_lower` | no | `false` | Convert XML element names and attribute names to lowercase. | -| `ignore_empty_value` | no | `false` | If `true`, the processor will filter out null and empty values from the parsed XML structure, including empty elements, elements with null values, and elements with whitespace-only content. | +| `remove_empty_values` | no | `false` | If `true`, the processor will filter out null and empty values from the parsed XML structure, including empty elements, elements with null values, and elements with whitespace-only content. | | `remove_namespaces` | no | `false` | If `true`, removes namespace prefixes from element and attribute names. | | `force_content` | no | `false` | If `true`, forces text content and attributes to always parse to a hash value with `#text` key for content. | | `force_array` | no | `false` | If `true`, forces all parsed values to be arrays. Single elements are wrapped in arrays. 
| @@ -37,7 +37,7 @@ $$$xml-options$$$ "xml": { "field": "xml_field", "target_field": "parsed_xml", - "ignore_empty_value": true + "remove_empty_values": true } } ``` @@ -94,7 +94,7 @@ Result: ### Filtering empty values -When `ignore_empty_value` is set to `true`, the processor will remove empty elements from the parsed XML: +When `remove_empty_values` is set to `true`, the processor will remove empty elements from the parsed XML: ```console POST _ingest/pipeline/_simulate @@ -105,7 +105,7 @@ POST _ingest/pipeline/_simulate "xml": { "field": "xml_content", "target_field": "parsed_xml", - "ignore_empty_value": true + "remove_empty_values": true } } ] @@ -624,7 +624,7 @@ The XML processor supports: - **Elements with text content**: Converted to key-value pairs where the element name is the key and text content is the value - **Nested elements**: Converted to nested JSON objects -- **Empty elements**: Converted to `null` values (can be filtered with `ignore_empty_value`) +- **Empty elements**: Converted to `null` values (can be filtered with `remove_empty_values`) - **Repeated elements**: Converted to arrays when multiple elements with the same name exist at the same level - **XML attributes**: Included as properties in the JSON object alongside element content. 
When an element has both attributes and text content, the text is stored under a special `#text` key - **Mixed content**: Elements with both text and child elements include text under a special `#text` key while attributes and child elements become object properties diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 3c2fac023fa20..ad07481dbbe31 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -44,7 +44,7 @@ * - XPath extraction with namespace support * - Configurable options: force_array, force_content, remove_namespaces, to_lower * - Strict parsing mode for XML validation - * - Empty value filtering with ignore_empty_value option + * - Empty value filtering with remove_empty_values option * - Logstash-compatible error handling and behavior */ public final class XmlProcessor extends AbstractProcessor { @@ -71,7 +71,7 @@ public final class XmlProcessor extends AbstractProcessor { private final boolean ignoreMissing; private final boolean ignoreFailure; private final boolean toLower; - private final boolean ignoreEmptyValue; + private final boolean removeEmptyValues; private final boolean storeXml; private final boolean removeNamespaces; private final boolean forceContent; @@ -89,7 +89,7 @@ public final class XmlProcessor extends AbstractProcessor { boolean ignoreMissing, boolean ignoreFailure, boolean toLower, - boolean ignoreEmptyValue, + boolean removeEmptyValues, boolean storeXml, boolean removeNamespaces, boolean forceContent, @@ -104,7 +104,7 @@ public final class XmlProcessor extends AbstractProcessor { this.ignoreMissing = ignoreMissing; this.ignoreFailure = ignoreFailure; this.toLower = toLower; - this.ignoreEmptyValue = ignoreEmptyValue; + this.removeEmptyValues = 
removeEmptyValues; this.storeXml = storeXml; this.removeNamespaces = removeNamespaces; this.forceContent = forceContent; @@ -127,8 +127,8 @@ public boolean isIgnoreMissing() { return ignoreMissing; } - public boolean isIgnoreEmptyValue() { - return ignoreEmptyValue; + public boolean isRemoveEmptyValues() { + return removeEmptyValues; } public boolean isStoreXml() { @@ -206,7 +206,7 @@ public String getType() { /** * Determines if a value should be considered empty for filtering purposes. - * Used by the ignore_empty_value feature to filter out empty content. + * Used by the remove_empty_values feature to filter out empty content. * * Considers empty: * - null values @@ -432,7 +432,7 @@ public XmlProcessor create( boolean ignoreMissing = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_missing", false); boolean ignoreFailure = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_failure", false); boolean toLower = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "to_lower", false); - boolean ignoreEmptyValue = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_empty_value", false); + boolean removeEmptyValues = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "remove_empty_values", false); boolean storeXml = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "store_xml", true); boolean removeNamespaces = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "remove_namespaces", false); boolean forceContent = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "force_content", false); @@ -502,7 +502,7 @@ public XmlProcessor create( ignoreMissing, ignoreFailure, toLower, - ignoreEmptyValue, + removeEmptyValues, storeXml, removeNamespaces, forceContent, @@ -628,8 +628,8 @@ public void startElement(String uri, String localName, String qName, org.xml.sax String attrName = getAttributeName(attributes.getURI(i), 
attributes.getLocalName(i), attributes.getQName(i)); String attrValue = attributes.getValue(i); - // Apply ignoreEmptyValue filtering to attributes - if (ignoreEmptyValue == false || isEmptyValue(attrValue) == false) { + // Apply removeEmptyValues filtering to attributes + if (removeEmptyValues == false || isEmptyValue(attrValue) == false) { element.put(attrName, attrValue); } } @@ -683,7 +683,7 @@ public void characters(char[] ch, int start, int length) throws org.xml.sax.SAXE // Add to DOM text node if needed if (buildDom && !domElementStack.isEmpty()) { String text = new String(ch, start, length); - if (!text.trim().isEmpty() || !ignoreEmptyValue) { + if (!text.trim().isEmpty() || !removeEmptyValues) { org.w3c.dom.Text textNode = domDocument.createTextNode(text); domElementStack.peek().appendChild(textNode); } @@ -705,7 +705,7 @@ public void endElement(String uri, String localName, String qName) throws org.xm // Add repeated elements as arrays for (Map.Entry> entry : repeatedElements.entrySet()) { List values = entry.getValue(); - if (ignoreEmptyValue == false || values.isEmpty() == false) { + if (removeEmptyValues == false || values.isEmpty() == false) { element.put(entry.getKey(), values); } } @@ -718,7 +718,7 @@ public void endElement(String uri, String localName, String qName) throws org.xm Object elementValue; if (hasText == false && hasChildren == false) { // Empty element - if (ignoreEmptyValue == false) { + if (removeEmptyValues == false) { elementValue = applyForceArray(elementName, null); } else { elementValue = null; @@ -727,12 +727,12 @@ public void endElement(String uri, String localName, String qName) throws org.xm // Only text content if (forceContent) { Map contentMap = new HashMap<>(); - if (ignoreEmptyValue == false || isEmptyValue(trimmedText) == false) { + if (removeEmptyValues == false || isEmptyValue(trimmedText) == false) { contentMap.put("#text", trimmedText); } elementValue = contentMap; } else { - if (ignoreEmptyValue && 
isEmptyValue(trimmedText)) { + if (removeEmptyValues && isEmptyValue(trimmedText)) { elementValue = null; } else { elementValue = trimmedText; @@ -744,7 +744,7 @@ public void endElement(String uri, String localName, String qName) throws org.xm elementValue = (forceArray && forceContent) ? applyForceArray(elementName, element) : element; } else { // Both text and children/attributes - if (ignoreEmptyValue == false || isEmptyValue(trimmedText) == false) { + if (removeEmptyValues == false || isEmptyValue(trimmedText) == false) { element.put("#text", trimmedText); } elementValue = (forceArray && forceContent) ? applyForceArray(elementName, element) : element; diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java index e693c39a5cab3..a2c98870a0902 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java @@ -140,8 +140,8 @@ private Map createConfigWithOptions(String fieldName, String... 
case "to_lower": config.put("to_lower", true); break; - case "ignore_empty_value": - config.put("ignore_empty_value", true); + case "remove_empty_values": + config.put("remove_empty_values", true); break; case "store_xml": config.put("store_xml", false); // Test false case since default is true @@ -186,14 +186,14 @@ public void testCreate() throws Exception { config.put("ignore_missing", true); config.put("ignore_failure", true); config.put("to_lower", true); - config.put("ignore_empty_value", true); + config.put("remove_empty_values", true); XmlProcessor processor = createProcessor(config); assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); assertThat(processor.getTargetField(), equalTo(DEFAULT_TARGET_FIELD)); assertThat(processor.isIgnoreMissing(), equalTo(true)); - assertThat(processor.isIgnoreEmptyValue(), equalTo(true)); + assertThat(processor.isRemoveEmptyValues(), equalTo(true)); } public void testCreateWithDefaults() throws Exception { @@ -203,7 +203,7 @@ public void testCreateWithDefaults() throws Exception { assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); assertThat(processor.getTargetField(), equalTo(DEFAULT_FIELD)); assertThat(processor.isIgnoreMissing(), equalTo(false)); - assertThat(processor.isIgnoreEmptyValue(), equalTo(false)); + assertThat(processor.isRemoveEmptyValues(), equalTo(false)); } public void testCreateMissingField() throws Exception { @@ -211,14 +211,14 @@ public void testCreateMissingField() throws Exception { expectCreationFailure(config, ElasticsearchParseException.class, "[field] required property is missing"); } - public void testCreateWithIgnoreEmptyValueOnly() throws Exception { + public void testCreateWithRemoveEmptyValuesOnly() throws Exception { Map config = createBaseConfig(); - config.put("ignore_empty_value", true); + config.put("remove_empty_values", true); XmlProcessor processor = createProcessor(config); assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); - 
assertThat(processor.isIgnoreEmptyValue(), equalTo(true)); + assertThat(processor.isRemoveEmptyValues(), equalTo(true)); assertThat(processor.isIgnoreMissing(), equalTo(false)); // other flags should remain default } diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java index c01c086be1eb5..699aff5fa935f 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java @@ -475,9 +475,9 @@ public void testIgnoreMissing() { } /** - * Test that ignore_empty_value correctly filters out empty values from arrays and mixed content. + * Test that remove_empty_values correctly filters out empty values from arrays and mixed content. */ - public void testIgnoreEmptyValue() { + public void testRemoveEmptyValues() { // XML with mixed empty and non-empty elements, including array elements with mixed empty/non-empty values String xml = "" + " " @@ -495,7 +495,7 @@ public void testIgnoreEmptyValue() { + ""; Map config = new HashMap<>(); - config.put("ignore_empty_value", true); + config.put("remove_empty_values", true); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); From 8847f4bf79e5a25a6fff0ed41dc592d1ecd78e4d Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Mon, 18 Aug 2025 12:40:46 +0200 Subject: [PATCH 14/54] Add clarification in docs for store_xml --- docs/reference/enrich-processor/xml-processor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/enrich-processor/xml-processor.md b/docs/reference/enrich-processor/xml-processor.md index 73e9d506993db..0db4aa4bb917a 100644 --- a/docs/reference/enrich-processor/xml-processor.md +++ b/docs/reference/enrich-processor/xml-processor.md @@ -14,7 +14,7 @@ 
$$$xml-options$$$ | --- | --- | --- | --- | | `field` | yes | - | The field containing the XML string to be parsed. | | `target_field` | no | `field` | The field that the converted structured object will be written into. Any existing content in this field will be overwritten. | -| `store_xml` | no | `true` | If `true`, stores the parsed XML structure in the target field. If `false`, only XPath extraction results are stored. | +| `store_xml` | no | `true` | If `true`, stores the parsed XML structure in the target field. If `false`, only XPath extraction results are stored and `target_field` is ignored. | | `ignore_missing` | no | `false` | If `true` and `field` does not exist, the processor quietly exits without modifying the document. | | `ignore_failure` | no | `false` | Ignore failures for the processor. When `true` and XML parsing fails, adds `_xmlparsefailure` tag to the document. See [Handling pipeline failures](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#handling-pipeline-failures). | | `to_lower` | no | `false` | Convert XML element names and attribute names to lowercase. 
| From 497d775effea646216d6122437bd4e826081f32a Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Mon, 18 Aug 2025 12:43:08 +0200 Subject: [PATCH 15/54] Change strict parsing option to be boolean --- .../enrich-processor/xml-processor.md | 10 ++++---- .../ingest/common/XmlProcessor.java | 21 ++++++++--------- .../common/XmlProcessorFactoryTests.java | 23 ++++--------------- .../ingest/common/XmlProcessorTests.java | 2 +- 4 files changed, 20 insertions(+), 36 deletions(-) diff --git a/docs/reference/enrich-processor/xml-processor.md b/docs/reference/enrich-processor/xml-processor.md index 0db4aa4bb917a..5262ace6ca654 100644 --- a/docs/reference/enrich-processor/xml-processor.md +++ b/docs/reference/enrich-processor/xml-processor.md @@ -22,7 +22,7 @@ $$$xml-options$$$ | `remove_namespaces` | no | `false` | If `true`, removes namespace prefixes from element and attribute names. | | `force_content` | no | `false` | If `true`, forces text content and attributes to always parse to a hash value with `#text` key for content. | | `force_array` | no | `false` | If `true`, forces all parsed values to be arrays. Single elements are wrapped in arrays. | -| `parse_options` | no | - | Controls XML parsing behavior. Set to `"strict"` for strict XML validation that fails fast on invalid content. | +| `strict_parsing` | no | `false` | If `true`, enables strict XML validation that fails fast on invalid content. | | `xpath` | no | - | Map of XPath expressions to target field names. Extracts values from the XML using XPath and stores them in the specified fields. | | `namespaces` | no | - | Map of namespace prefixes to URIs for use with XPath expressions. Required when XPath expressions contain namespace prefixes. | | `description` | no | - | Description of the processor. Useful for describing the purpose of the processor or its configuration. 
| @@ -422,7 +422,7 @@ Result: ### Strict parsing mode -Use `parse_options: "strict"` for strict XML validation: +Use `strict_parsing: true` for strict XML validation: ```console POST _ingest/pipeline/_simulate @@ -432,7 +432,7 @@ POST _ingest/pipeline/_simulate { "xml": { "field": "xml_content", - "parse_options": "strict", + "strict_parsing": true, "ignore_failure": true } } @@ -441,7 +441,7 @@ POST _ingest/pipeline/_simulate "docs": [ { "_source": { - "xml_content": "Invalid XML with control character" + "xml_content": "Invalid XML with control character \u0000" } } ] @@ -457,7 +457,7 @@ Result (with parsing failure): "doc": { ... "_source": { - "xml_content": "Invalid XML with control character", + "xml_content": "Invalid XML with control character \u0000", "tags": ["_xmlparsefailure"] } } diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index ad07481dbbe31..2c6c045ea0e9f 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -79,7 +79,7 @@ public final class XmlProcessor extends AbstractProcessor { private final Map xpathExpressions; private final Map namespaces; private final Map compiledXPathExpressions; - private final String parseOptions; + private final boolean strictParsing; XmlProcessor( String tag, @@ -96,7 +96,7 @@ public final class XmlProcessor extends AbstractProcessor { boolean forceArray, Map xpathExpressions, Map namespaces, - String parseOptions + boolean strictParsing ) { super(tag, description); this.field = field; @@ -112,7 +112,7 @@ public final class XmlProcessor extends AbstractProcessor { this.xpathExpressions = xpathExpressions != null ? Map.copyOf(xpathExpressions) : Map.of(); this.namespaces = namespaces != null ? 
Map.copyOf(namespaces) : Map.of(); this.compiledXPathExpressions = compileXPathExpressions(this.xpathExpressions, this.namespaces); - this.parseOptions = parseOptions != null ? parseOptions : ""; + this.strictParsing = strictParsing; } public String getField() { @@ -144,7 +144,7 @@ public boolean isForceContent() { } public boolean isStrict() { - return "strict".equals(parseOptions); + return strictParsing; } public boolean isForceArray() { @@ -159,8 +159,8 @@ public Map getNamespaces() { return namespaces; } - public String getParseOptions() { - return parseOptions; + public boolean getStrictParsing() { + return strictParsing; } @Override @@ -488,11 +488,8 @@ public XmlProcessor create( } } - // Parse parse_options parameter - String parseOptions = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "parse_options", ""); - if (parseOptions != null && !parseOptions.isEmpty() && !"strict".equals(parseOptions)) { - throw new IllegalArgumentException("Invalid parse_options [" + parseOptions + "]. Only 'strict' is supported."); - } + // Parse strict_parsing parameter + boolean strictParsing = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "strict_parsing", false); return new XmlProcessor( processorTag, @@ -509,7 +506,7 @@ public XmlProcessor create( forceArray, xpathExpressions, namespaces, - parseOptions + strictParsing ); } } diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java index a2c98870a0902..49d33db00186c 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java @@ -155,8 +155,8 @@ private Map createConfigWithOptions(String fieldName, String... 
case "force_array": config.put("force_array", true); break; - case "strict": - config.put("parse_options", "strict"); + case "strict_parsing": + config.put("strict_parsing", true); break; default: throw new IllegalArgumentException("Unknown option: " + option); @@ -329,12 +329,12 @@ public void testCreateWithForceArray() throws Exception { assertThat(processor.isForceArray(), equalTo(true)); } - public void testCreateWithStrictParseOptions() throws Exception { - Map config = createConfigWithOptions(DEFAULT_FIELD, "strict"); + public void testCreateWithStrictParsing() throws Exception { + Map config = createConfigWithOptions(DEFAULT_FIELD, "strict_parsing"); XmlProcessor processor = createProcessor(config); assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); - assertThat(processor.getParseOptions(), equalTo("strict")); + assertThat(processor.getStrictParsing(), equalTo(true)); assertThat(processor.isStrict(), equalTo(true)); } @@ -355,19 +355,6 @@ public void testCreateWithMultipleOptions() throws Exception { assertThat(processor.isRemoveNamespaces(), equalTo(true)); } - // Tests for invalid parse options - - public void testCreateWithInvalidParseOptions() throws Exception { - Map config = createBaseConfig(); - config.put("parse_options", "invalid_option"); - - expectCreationFailure( - config, - IllegalArgumentException.class, - "Invalid parse_options [invalid_option]. Only 'strict' is supported." 
- ); - } - // Tests for XPath compilation errors (testing precompilation feature) public void testCreateWithInvalidXPathExpression() throws Exception { diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java index 699aff5fa935f..bbe82e1f9e92e 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java @@ -544,7 +544,7 @@ public void testStrictParsing() { String xml = "valid"; Map config = new HashMap<>(); - config.put("parse_options", "strict"); + config.put("strict_parsing", true); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); From 914a33eccf551ef0bf63852c7d1ab07199d6a235 Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Mon, 18 Aug 2025 14:14:33 +0200 Subject: [PATCH 16/54] Remove unnecessary docs --- .../enrich-processor/xml-processor.md | 99 ------------------- 1 file changed, 99 deletions(-) diff --git a/docs/reference/enrich-processor/xml-processor.md b/docs/reference/enrich-processor/xml-processor.md index 5262ace6ca654..cd9d9db5531cc 100644 --- a/docs/reference/enrich-processor/xml-processor.md +++ b/docs/reference/enrich-processor/xml-processor.md @@ -148,54 +148,6 @@ Result with empty elements filtered out: } ``` -### Converting element names to lowercase - -```console -POST _ingest/pipeline/_simulate -{ - "pipeline": { - "processors": [ - { - "xml": { - "field": "xml_content", - "to_lower": true - } - } - ] - }, - "docs": [ - { - "_source": { - "xml_content": "William H. GaddisThe Recognitions" - } - } - ] -} -``` - -Result: - -```console-result -{ - "docs": [ - { - "doc": { - ... - "_source": { - "xml_content": "William H. GaddisThe Recognitions", - "catalog": { - "book": { - "author": "William H. 
Gaddis", - "title": "The Recognitions" - } - } - } - } - } - ] -} -``` - ### Handling XML attributes XML attributes are included as properties in the resulting JSON object alongside element content: @@ -367,58 +319,7 @@ Result: } ``` -### Force array behavior - -When `force_array` is true, all parsed values become arrays: -```console -POST _ingest/pipeline/_simulate -{ - "pipeline": { - "processors": [ - { - "xml": { - "field": "xml_content", - "force_array": true - } - } - ] - }, - "docs": [ - { - "_source": { - "xml_content": "The Recognitions" - } - } - ] -} -``` - -Result: - -```console-result -{ - "docs": [ - { - "doc": { - ... - "_source": { - "xml_content": "The Recognitions", - "catalog": [ - { - "book": [ - { - "title": ["The Recognitions"] - } - ] - } - ] - } - } - } - ] -} -``` ### Strict parsing mode From 940b2931eab2c1fff41858ce634841b810a1c98b Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Mon, 18 Aug 2025 14:46:42 +0200 Subject: [PATCH 17/54] Fix style checks --- .../ingest/common/XmlProcessor.java | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 2c6c045ea0e9f..4aa220bde1f0f 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -274,7 +274,7 @@ private String getNodeValue(Node node) { * @return the content, optionally wrapped in an array based on force_array setting */ private Object applyForceArray(String elementName, Object content) { - if (forceArray && !(content instanceof List)) { + if (forceArray && (content instanceof List) == false) { List arrayContent = new ArrayList<>(); arrayContent.add(content); // Add content even if it's null (for empty elements) return arrayContent; @@ -398,7 
+398,7 @@ public Iterator getPrefixes(String namespaceURI) { String targetFieldName = entry.getValue(); // Validate namespace prefixes if no namespaces are configured - if (!hasNamespaces && NAMESPACE_PATTERN.matcher(xpathExpression).matches()) { + if (hasNamespaces == false && NAMESPACE_PATTERN.matcher(xpathExpression).matches()) { throw new IllegalArgumentException( "Invalid XPath expression [" + xpathExpression @@ -591,7 +591,7 @@ private class XmlStreamingWithDomHandler extends org.xml.sax.helpers.DefaultHand private Document domDocument = null; private final java.util.Deque domElementStack = new java.util.ArrayDeque<>(); - public XmlStreamingWithDomHandler(boolean buildDom) { + XmlStreamingWithDomHandler(boolean buildDom) { this.buildDom = buildDom; } @@ -639,7 +639,7 @@ public void startElement(String uri, String localName, String qName, org.xml.sax // Build DOM element simultaneously if needed if (buildDom && domDocument != null) { org.w3c.dom.Element domElement; - if (uri != null && !uri.isEmpty() && !removeNamespaces) { + if (uri != null && uri.isEmpty() == false && removeNamespaces == false) { domElement = domDocument.createElementNS(uri, qName); } else { domElement = domDocument.createElement(removeNamespaces ? localName : qName); @@ -652,7 +652,7 @@ public void startElement(String uri, String localName, String qName, org.xml.sax String attrQName = attributes.getQName(i); String attrValue = attributes.getValue(i); - if (attrUri != null && !attrUri.isEmpty() && !removeNamespaces) { + if (attrUri != null && attrUri.isEmpty() == false && removeNamespaces == false) { domElement.setAttributeNS(attrUri, attrQName, attrValue); } else { domElement.setAttribute(removeNamespaces ? 
attrLocalName : attrQName, attrValue); @@ -673,14 +673,14 @@ public void startElement(String uri, String localName, String qName, org.xml.sax @Override public void characters(char[] ch, int start, int length) throws org.xml.sax.SAXException { // Add to structured output text accumulator - if (!textStack.isEmpty()) { + if (textStack.isEmpty() == false) { textStack.peek().append(ch, start, length); } // Add to DOM text node if needed - if (buildDom && !domElementStack.isEmpty()) { + if (buildDom && domElementStack.isEmpty() == false) { String text = new String(ch, start, length); - if (!text.trim().isEmpty() || !removeEmptyValues) { + if (text.trim().isEmpty() == false || removeEmptyValues == false) { org.w3c.dom.Text textNode = domDocument.createTextNode(text); domElementStack.peek().appendChild(textNode); } @@ -778,7 +778,7 @@ public void endElement(String uri, String localName, String qName) throws org.xm } // Complete DOM element if building DOM - if (buildDom && !domElementStack.isEmpty()) { + if (buildDom && domElementStack.isEmpty() == false) { domElementStack.pop(); } } @@ -799,7 +799,7 @@ public Document getDomDocument() { private String getElementName(String uri, String localName, String qName) { String elementName; if (removeNamespaces) { - elementName = localName != null && !localName.isEmpty() ? localName : qName; + elementName = localName != null && localName.isEmpty() == false ? localName : qName; } else { elementName = qName; } @@ -815,7 +815,7 @@ private String getElementName(String uri, String localName, String qName) { private String getAttributeName(String uri, String localName, String qName) { String attrName; if (removeNamespaces) { - attrName = localName != null && !localName.isEmpty() ? localName : qName; + attrName = localName != null && localName.isEmpty() == false ? 
localName : qName; } else { attrName = qName; } From 82dd719936bf75c3c494d5e99cc8775c49335a49 Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Fri, 22 Aug 2025 12:11:22 +0200 Subject: [PATCH 18/54] Improve XmlProcessor code quality and exception handling - Use pattern matching for instanceof in XPath and namespace configuration parsing - Replace manual null/whitespace checks with Strings.hasText() utility method - Clean up exception messages by removing redundant e.getMessage() concatenation - Add ByteArrayInputStream import instead of fully qualifying at point of use - Fix Javadoc formatting to use proper HTML lists instead of markdown syntax - Update parameter names in processXPathExpressionsFromDom for clarity - Simplify exception handling by letting XPathExpressionException bubble up - Update tests to match cleaner exception message format --- .../ingest/common/XmlProcessor.java | 287 +++++++++--------- .../common/XmlProcessorFactoryTests.java | 4 +- .../ingest/common/XmlProcessorTests.java | 23 +- 3 files changed, 158 insertions(+), 156 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 4aa220bde1f0f..e695d3d791d26 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -10,6 +10,7 @@ package org.elasticsearch.ingest.common; import org.elasticsearch.cluster.metadata.ProjectId; +import org.elasticsearch.common.Strings; import org.elasticsearch.ingest.AbstractProcessor; import org.elasticsearch.ingest.ConfigurationUtils; import org.elasticsearch.ingest.IngestDocument; @@ -18,6 +19,7 @@ import org.w3c.dom.Node; import org.w3c.dom.NodeList; +import java.io.ByteArrayInputStream; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.HashMap; @@ 
-30,6 +32,8 @@ import javax.xml.namespace.NamespaceContext; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; @@ -39,13 +43,14 @@ /** * Processor that parses XML documents and converts them to JSON objects using a single-pass streaming approach. * - * Features: - * - XML to JSON conversion with configurable structure options - * - XPath extraction with namespace support - * - Configurable options: force_array, force_content, remove_namespaces, to_lower - * - Strict parsing mode for XML validation - * - Empty value filtering with remove_empty_values option - * - Logstash-compatible error handling and behavior + * Features:
<ul> + *
<li>XML to JSON conversion with configurable structure options + *
<li>XPath extraction with namespace support + *
<li>Configurable options: force_array, force_content, remove_namespaces, to_lower + *
<li>Strict parsing mode for XML validation + *
<li>Empty value filtering with remove_empty_values option + *
<li>Logstash-compatible error handling and behavior + * </ul>
*/ public final class XmlProcessor extends AbstractProcessor { @@ -54,14 +59,13 @@ public final class XmlProcessor extends AbstractProcessor { private static final XPathFactory XPATH_FACTORY = XPathFactory.newInstance(); // Pre-compiled pattern to detect namespace prefixes - private static final Pattern NAMESPACE_PATTERN = Pattern.compile(".*\\b[a-zA-Z][a-zA-Z0-9_-]*:[a-zA-Z][a-zA-Z0-9_-]*.*"); + private static final Pattern NAMESPACE_PATTERN = Pattern.compile("\\b[a-zA-Z][a-zA-Z0-9_-]*:[a-zA-Z][a-zA-Z0-9_-]*"); // Pre-configured SAX parser factories for secure XML parsing - private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY = createSecureSaxParserFactory(); - private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY_NS = createSecureSaxParserFactoryNamespaceAware(); - private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY_STRICT = createSecureSaxParserFactoryStrict(); - private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY_NS_STRICT = - createSecureSaxParserFactoryNamespaceAwareStrict(); + private static final SAXParserFactory SAX_PARSER_FACTORY = createSecureSaxParserFactory(); + private static final SAXParserFactory SAX_PARSER_FACTORY_NS = createSecureSaxParserFactoryNamespaceAware(); + private static final SAXParserFactory SAX_PARSER_FACTORY_STRICT = createSecureSaxParserFactoryStrict(); + private static final SAXParserFactory SAX_PARSER_FACTORY_NS_STRICT = createSecureSaxParserFactoryNamespaceAwareStrict(); // Pre-configured document builder factory for DOM creation private static final DocumentBuilderFactory DOM_FACTORY = createSecureDocumentBuilderFactory(); @@ -69,7 +73,6 @@ public final class XmlProcessor extends AbstractProcessor { private final String field; private final String targetField; private final boolean ignoreMissing; - private final boolean ignoreFailure; private final boolean toLower; private final boolean removeEmptyValues; private final boolean storeXml; @@ 
-87,7 +90,6 @@ public final class XmlProcessor extends AbstractProcessor { String field, String targetField, boolean ignoreMissing, - boolean ignoreFailure, boolean toLower, boolean removeEmptyValues, boolean storeXml, @@ -102,7 +104,6 @@ public final class XmlProcessor extends AbstractProcessor { this.field = field; this.targetField = targetField; this.ignoreMissing = ignoreMissing; - this.ignoreFailure = ignoreFailure; this.toLower = toLower; this.removeEmptyValues = removeEmptyValues; this.storeXml = storeXml; @@ -168,16 +169,13 @@ public IngestDocument execute(IngestDocument document) { Object fieldValue = document.getFieldValue(field, Object.class, ignoreMissing); if (fieldValue == null) { - if (ignoreMissing || ignoreFailure) { + if (ignoreMissing) { return document; } throw new IllegalArgumentException("field [" + field + "] is null, cannot parse XML"); } if (fieldValue instanceof String == false) { - if (ignoreFailure) { - return document; - } throw new IllegalArgumentException("field [" + field + "] is not a string, cannot parse XML"); } @@ -188,12 +186,7 @@ public IngestDocument execute(IngestDocument document) { parseXmlAndXPath(document, xmlString.trim()); } } catch (Exception e) { - if (ignoreFailure) { - // Add failure tag similar to Logstash behavior - document.appendFieldValue("tags", "_xmlparsefailure"); - return document; - } - throw new IllegalArgumentException("field [" + field + "] contains invalid XML: " + e.getMessage(), e); + throw new IllegalArgumentException("field [" + field + "] contains invalid XML", e); } return document; @@ -208,11 +201,12 @@ public String getType() { * Determines if a value should be considered empty for filtering purposes. * Used by the remove_empty_values feature to filter out empty content. * - * Considers empty: - * - null values - * - empty or whitespace-only strings - * - empty Maps - * - empty Lists + * Considers empty:
<ul> + *
<li>null values + *
<li>empty or whitespace-only strings + *
<li>empty Maps + *
<li>empty Lists + * </ul>
* * @param value the value to check * @return true if the value should be considered empty @@ -221,25 +215,26 @@ private boolean isEmptyValue(Object value) { if (value == null) { return true; } - if (value instanceof String) { - return ((String) value).trim().isEmpty(); + if (value instanceof String string) { + return string.isBlank(); } - if (value instanceof Map) { - return ((Map) value).isEmpty(); + if (value instanceof Map map) { + return map.isEmpty(); } - if (value instanceof List) { - return ((List) value).isEmpty(); + if (value instanceof List list) { + return list.isEmpty(); } return false; } /** * Extract the text value from a DOM node for XPath result processing. - * Handles different node types appropriately: - * - TEXT_NODE and CDATA_SECTION_NODE: returns node value directly - * - ATTRIBUTE_NODE: returns attribute value - * - ELEMENT_NODE: returns text content (concatenated text of all descendants) - * - Other node types: returns text content as fallback + * Handles different node types appropriately:
<ul> + *
<li>TEXT_NODE and CDATA_SECTION_NODE: returns node value directly + *
<li>ATTRIBUTE_NODE: returns attribute value + *
<li>ELEMENT_NODE: returns text content (concatenated text of all descendants) + *
<li>Other node types: returns text content as fallback + * </ul>
* * @param node the DOM node to extract text from * @return the text content of the node, or null if node is null @@ -249,25 +244,21 @@ private String getNodeValue(Node node) { return null; } - switch (node.getNodeType()) { - case Node.ATTRIBUTE_NODE: - case Node.CDATA_SECTION_NODE: - case Node.TEXT_NODE: - return node.getNodeValue(); - case Node.ELEMENT_NODE: - default: - return node.getTextContent(); - } + return switch (node.getNodeType()) { + case Node.ATTRIBUTE_NODE, Node.CDATA_SECTION_NODE, Node.TEXT_NODE -> node.getNodeValue(); + default -> node.getTextContent(); + }; } /** * Applies force_array logic to ensure all fields are arrays when enabled. * - * Behavior: - * - If force_array is false: returns content unchanged - * - If force_array is true and content is already a List: returns content unchanged - * - If force_array is true and content is not a List: wraps content in a new ArrayList - * - Handles null content appropriately (wraps null in array if force_array is true) + * Behavior:
<ul> + *
<li>If force_array is false: returns content unchanged + *
<li>If force_array is true and content is already a List: returns content unchanged + *
<li>If force_array is true and content is not a List: wraps content in a new ArrayList + *
<li>Handles null content appropriately (wraps null in array if force_array is true) + * </ul>
* * @param elementName the name of the element (for context, not used in current implementation) * @param content the content to potentially wrap in an array @@ -285,52 +276,41 @@ private Object applyForceArray(String elementName, Object content) { /** * Evaluates precompiled XPath expressions against a DOM document and adds results to the ingest document. * - * Features: - * - Uses precompiled XPath expressions for optimal performance - * - Extracts text values from matched nodes (elements, attributes, text nodes) - * - Single matches stored as strings, multiple matches as string arrays - * - Respects ignoreFailure setting for XPath evaluation errors + * Features:
<ul> + *
<li>Uses precompiled XPath expressions for optimal performance + *
<li>Extracts text values from matched nodes (elements, attributes, text nodes) + *
<li>Single matches stored as strings, multiple matches as string arrays + * </ul>
* - * @param document the ingest document to add XPath results to - * @param doc the DOM document to evaluate XPath expressions against - * @throws Exception if XPath processing fails and ignoreFailure is false + * @param ingestDocument the ingest document to add XPath results to + * @param xmlDocument the DOM document to evaluate XPath expressions against + * @throws XPathExpressionException if XPath processing fails */ - private void processXPathExpressionsFromDom(IngestDocument document, Document doc) throws Exception { + private void processXPathExpressionsFromDom(IngestDocument ingestDocument, Document xmlDocument) throws XPathExpressionException { // Use precompiled XPath expressions for optimal performance for (Map.Entry entry : compiledXPathExpressions.entrySet()) { String targetFieldName = entry.getKey(); XPathExpression compiledExpression = entry.getValue(); + Object result = compiledExpression.evaluate(xmlDocument, XPathConstants.NODESET); - try { - Object result = compiledExpression.evaluate(doc, XPathConstants.NODESET); - - if (result instanceof NodeList) { - NodeList nodeList = (NodeList) result; - List values = new ArrayList<>(); + if (result instanceof NodeList nodeList) { + List values = new ArrayList<>(); - for (int i = 0; i < nodeList.getLength(); i++) { - Node node = nodeList.item(i); - String value = getNodeValue(node); - if (value != null && value.trim().isEmpty() == false) { - values.add(value); - } + for (int i = 0; i < nodeList.getLength(); i++) { + Node node = nodeList.item(i); + String value = getNodeValue(node); + if (Strings.hasText(value)) { + values.add(value); } + } - if (values.isEmpty() == false) { - if (values.size() == 1) { - document.setFieldValue(targetFieldName, values.get(0)); - } else { - document.setFieldValue(targetFieldName, values); - } + if (values.isEmpty() == false) { + if (values.size() == 1) { + ingestDocument.setFieldValue(targetFieldName, values.get(0)); + } else { + ingestDocument.setFieldValue(targetFieldName, 
values); } } - } catch (XPathExpressionException e) { - if (ignoreFailure == false) { - throw new IllegalArgumentException( - "XPath evaluation failed for target field [" + targetFieldName + "]: " + e.getMessage(), - e - ); - } } } } @@ -398,7 +378,7 @@ public Iterator getPrefixes(String namespaceURI) { String targetFieldName = entry.getValue(); // Validate namespace prefixes if no namespaces are configured - if (hasNamespaces == false && NAMESPACE_PATTERN.matcher(xpathExpression).matches()) { + if (hasNamespaces == false && NAMESPACE_PATTERN.matcher(xpathExpression).find()) { throw new IllegalArgumentException( "Invalid XPath expression [" + xpathExpression @@ -410,7 +390,7 @@ public Iterator getPrefixes(String namespaceURI) { XPathExpression compiledExpression = xpath.compile(xpathExpression); compiled.put(targetFieldName, compiledExpression); } catch (XPathExpressionException e) { - throw new IllegalArgumentException("Invalid XPath expression [" + xpathExpression + "]: " + e.getMessage(), e); + throw new IllegalArgumentException("Invalid XPath expression [" + xpathExpression + "]", e); } } @@ -430,7 +410,6 @@ public XmlProcessor create( String field = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "field"); String targetField = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "target_field", field); boolean ignoreMissing = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_missing", false); - boolean ignoreFailure = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_failure", false); boolean toLower = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "to_lower", false); boolean removeEmptyValues = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "remove_empty_values", false); boolean storeXml = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "store_xml", true); @@ -446,8 +425,8 @@ public XmlProcessor create( 
@SuppressWarnings("unchecked") Map xpathMap = (Map) xpathConfig; for (Map.Entry entry : xpathMap.entrySet()) { - if (entry.getValue() instanceof String) { - xpathExpressions.put(entry.getKey(), (String) entry.getValue()); + if (entry.getValue() instanceof String str) { + xpathExpressions.put(entry.getKey(), str); } else { throw new IllegalArgumentException( "XPath target field [" @@ -471,8 +450,8 @@ public XmlProcessor create( @SuppressWarnings("unchecked") Map namespaceMap = (Map) namespaceConfig; for (Map.Entry entry : namespaceMap.entrySet()) { - if (entry.getValue() instanceof String) { - namespaces.put(entry.getKey(), (String) entry.getValue()); + if (entry.getValue() instanceof String str) { + namespaces.put(entry.getKey(), str); } else { throw new IllegalArgumentException( "Namespace prefix [" @@ -497,7 +476,6 @@ public XmlProcessor create( field, targetField, ignoreMissing, - ignoreFailure, toLower, removeEmptyValues, storeXml, @@ -520,7 +498,7 @@ public XmlProcessor create( * @throws Exception if XML parsing fails */ private void parseXmlAndXPath(IngestDocument document, String xmlString) throws Exception { - if (xmlString == null || xmlString.trim().isEmpty()) { + if (Strings.hasText(xmlString) == false) { return; } @@ -528,9 +506,9 @@ private void parseXmlAndXPath(IngestDocument document, String xmlString) throws boolean needsDom = xpathExpressions.isEmpty() == false; // Use the appropriate pre-configured SAX parser factory - javax.xml.parsers.SAXParserFactory factory = selectSaxParserFactory(); + SAXParserFactory factory = selectSaxParserFactory(); - javax.xml.parsers.SAXParser parser = factory.newSAXParser(); + SAXParser parser = factory.newSAXParser(); // Configure error handler for strict mode if (isStrict()) { @@ -555,7 +533,7 @@ public void fatalError(org.xml.sax.SAXParseException exception) throws org.xml.s // Use enhanced handler that can build DOM during streaming when needed XmlStreamingWithDomHandler handler = new 
XmlStreamingWithDomHandler(needsDom); - parser.parse(new java.io.ByteArrayInputStream(xmlString.getBytes(StandardCharsets.UTF_8)), handler); + parser.parse(new ByteArrayInputStream(xmlString.getBytes(StandardCharsets.UTF_8)), handler); // Store structured result if needed if (storeXml) { @@ -568,9 +546,8 @@ public void fatalError(org.xml.sax.SAXParseException exception) throws org.xml.s // Process XPath expressions if DOM was built during streaming if (needsDom) { Document domDocument = handler.getDomDocument(); - if (domDocument != null) { - processXPathExpressionsFromDom(document, domDocument); - } + assert domDocument != null : "DOM document should not be null when XPath processing is needed"; + processXPathExpressionsFromDom(document, domDocument); } } @@ -579,11 +556,20 @@ public void fatalError(org.xml.sax.SAXParseException exception) throws org.xml.s * Handles XML-to-JSON conversion with support for all processor configuration options. */ private class XmlStreamingWithDomHandler extends org.xml.sax.helpers.DefaultHandler { + + /** + * Record to encapsulate the parsing state for each XML element level. + * Maintains the 1:1:1:1 relationship between element data, name, text content, and repeated elements. 
+ */ + private record ElementParsingState( + Map element, + String elementName, + StringBuilder textContent, + Map> repeatedElements + ) {} + // Streaming parser state (for structured output) - private final java.util.Deque> elementStack = new java.util.ArrayDeque<>(); - private final java.util.Deque elementNameStack = new java.util.ArrayDeque<>(); - private final java.util.Deque textStack = new java.util.ArrayDeque<>(); - private final java.util.Deque>> repeatedElementsStack = new java.util.ArrayDeque<>(); + private final java.util.Deque elementStack = new java.util.ArrayDeque<>(); private Object rootResult = null; // DOM building state (for XPath processing when needed) @@ -631,10 +617,7 @@ public void startElement(String uri, String localName, String qName, org.xml.sax } } - elementStack.push(element); - elementNameStack.push(elementName); - textStack.push(new StringBuilder()); - repeatedElementsStack.push(repeatedElements); + elementStack.push(new ElementParsingState(element, elementName, new StringBuilder(), repeatedElements)); // Build DOM element simultaneously if needed if (buildDom && domDocument != null) { @@ -673,14 +656,14 @@ public void startElement(String uri, String localName, String qName, org.xml.sax @Override public void characters(char[] ch, int start, int length) throws org.xml.sax.SAXException { // Add to structured output text accumulator - if (textStack.isEmpty() == false) { - textStack.peek().append(ch, start, length); + if (elementStack.isEmpty() == false) { + elementStack.peek().textContent().append(ch, start, length); } // Add to DOM text node if needed if (buildDom && domElementStack.isEmpty() == false) { String text = new String(ch, start, length); - if (text.trim().isEmpty() == false || removeEmptyValues == false) { + if (text.isBlank() == false || removeEmptyValues == false) { org.w3c.dom.Text textNode = domDocument.createTextNode(text); domElementStack.peek().appendChild(textNode); } @@ -694,10 +677,11 @@ public void 
endElement(String uri, String localName, String qName) throws org.xm return; } - Map element = elementStack.pop(); - String elementName = elementNameStack.pop(); - StringBuilder textContent = textStack.pop(); - Map> repeatedElements = repeatedElementsStack.pop(); + ElementParsingState currentState = elementStack.pop(); + Map element = currentState.element(); + String elementName = currentState.elementName(); + StringBuilder textContent = currentState.textContent(); + Map> repeatedElements = currentState.repeatedElements(); // Add repeated elements as arrays for (Map.Entry> entry : repeatedElements.entrySet()) { @@ -708,8 +692,9 @@ public void endElement(String uri, String localName, String qName) throws org.xm } // Process text content and determine final element structure - String trimmedText = textContent.toString().trim(); - boolean hasText = trimmedText.isEmpty() == false; + String textContentString = textContent.toString(); + String trimmedText = textContentString.trim(); + boolean hasText = textContentString.isBlank() == false; boolean hasChildren = element.size() > 0; Object elementValue; @@ -757,8 +742,9 @@ public void endElement(String uri, String localName, String qName) throws org.xm } else { // Add to parent element if (elementValue != null) { - Map parentElement = elementStack.peek(); - Map> parentRepeatedElements = repeatedElementsStack.peek(); + ElementParsingState parentState = elementStack.peek(); + Map parentElement = parentState.element(); + Map> parentRepeatedElements = parentState.repeatedElements(); if (parentElement.containsKey(elementName) || parentRepeatedElements.containsKey(elementName)) { // Handle repeated elements @@ -833,8 +819,8 @@ private String getAttributeName(String uri, String localName, String qName) { * Creates a secure, pre-configured SAX parser factory for XML parsing. * This factory is configured to prevent XXE attacks with SAX-specific features. 
*/ - private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactory() { - javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance(); + private static SAXParserFactory createSecureSaxParserFactory() { + SAXParserFactory factory = SAXParserFactory.newInstance(); factory.setValidating(false); // Configure SAX-specific security features to prevent XXE attacks @@ -845,7 +831,8 @@ private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactory() factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); } catch (Exception e) { - // If features cannot be set, continue with default settings + // Security features are critical - fail if they cannot be set + throw new IllegalStateException("Cannot configure secure XML parsing features", e); } return factory; @@ -855,8 +842,8 @@ private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactory() * Creates a secure, pre-configured namespace-aware SAX parser factory for XML parsing. * This factory is configured to prevent XXE attacks and has namespace awareness enabled. 
*/ - private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryNamespaceAware() { - javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance(); + private static SAXParserFactory createSecureSaxParserFactoryNamespaceAware() { + SAXParserFactory factory = SAXParserFactory.newInstance(); factory.setValidating(false); factory.setNamespaceAware(true); @@ -868,7 +855,8 @@ private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryNa factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); } catch (Exception e) { - // If features cannot be set, continue with default settings + // Security features are critical - fail if they cannot be set + throw new IllegalStateException("Cannot configure secure XML parsing features", e); } return factory; @@ -878,8 +866,8 @@ private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryNa * Creates a secure, pre-configured SAX parser factory for strict XML parsing. * This factory is configured to prevent XXE attacks and has strict validation enabled. 
*/ - private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryStrict() { - javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance(); + private static SAXParserFactory createSecureSaxParserFactoryStrict() { + SAXParserFactory factory = SAXParserFactory.newInstance(); factory.setValidating(false); // Configure SAX-specific security features to prevent XXE attacks @@ -889,11 +877,16 @@ private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactorySt factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); + } catch (Exception e) { + // Security features are critical - fail if they cannot be set + throw new IllegalStateException("Cannot configure secure XML parsing features", e); + } - // Enable strict parsing features + // Try to enable strict parsing features (optional - may not be supported) + try { factory.setFeature("http://apache.org/xml/features/validation/check-full-element-content", true); } catch (Exception e) { - // If features cannot be set, continue with default settings + // Strict parsing features are optional - continue without them if not supported } return factory; @@ -903,8 +896,8 @@ private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactorySt * Creates a secure, pre-configured namespace-aware SAX parser factory for strict XML parsing. * This factory is configured to prevent XXE attacks, has namespace awareness enabled, and strict validation. 
*/ - private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryNamespaceAwareStrict() { - javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance(); + private static SAXParserFactory createSecureSaxParserFactoryNamespaceAwareStrict() { + SAXParserFactory factory = SAXParserFactory.newInstance(); factory.setValidating(false); factory.setNamespaceAware(true); @@ -915,11 +908,16 @@ private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryNa factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); + } catch (Exception e) { + // Security features are critical - fail if they cannot be set + throw new IllegalStateException("Cannot configure secure XML parsing features", e); + } - // Enable strict parsing features + // Try to enable strict parsing features (optional - may not be supported) + try { factory.setFeature("http://apache.org/xml/features/validation/check-full-element-content", true); } catch (Exception e) { - // If features cannot be set, continue with default settings + // Strict parsing features are optional - continue without them if not supported } return factory; @@ -945,15 +943,16 @@ private static DocumentBuilderFactory createSecureDocumentBuilderFactory() { /** * Selects the appropriate pre-configured SAX parser factory based on processor configuration. * - * Factory selection matrix: - * - Regular parsing, no namespaces: SAX_PARSER_FACTORY - * - Regular parsing, with namespaces: SAX_PARSER_FACTORY_NS - * - Strict parsing, no namespaces: SAX_PARSER_FACTORY_STRICT - * - Strict parsing, with namespaces: SAX_PARSER_FACTORY_NS_STRICT + * Factory selection matrix:
    + *
  • Regular parsing, no namespaces: SAX_PARSER_FACTORY + *
  • Regular parsing, with namespaces: SAX_PARSER_FACTORY_NS + *
  • Strict parsing, no namespaces: SAX_PARSER_FACTORY_STRICT + *
  • Strict parsing, with namespaces: SAX_PARSER_FACTORY_NS_STRICT + *
* * @return the appropriate SAX parser factory for the current configuration */ - private javax.xml.parsers.SAXParserFactory selectSaxParserFactory() { + private SAXParserFactory selectSaxParserFactory() { boolean needsNamespaceAware = hasNamespaces() || removeNamespaces; if (isStrict()) { return needsNamespaceAware ? SAX_PARSER_FACTORY_NS_STRICT : SAX_PARSER_FACTORY_STRICT; diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java index 49d33db00186c..29c4de9c5031f 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java @@ -370,8 +370,8 @@ public void testCreateWithInvalidXPathExpression() throws Exception { ); // Check that the error message contains the XPath expression and indicates it's invalid - assertThat(exception.getMessage(), containsString("Invalid XPath expression [invalid xpath ][]:")); - assertThat(exception.getMessage(), containsString("javax.xml.transform.TransformerException")); + assertThat(exception.getMessage(), containsString("Invalid XPath expression [invalid xpath ][]")); + assertThat(exception.getCause().getMessage(), containsString("javax.xml.transform.TransformerException")); } public void testCreateWithXPathUsingNamespacesWithoutConfiguration() throws Exception { diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java index bbe82e1f9e92e..f05c98ed402e5 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java @@ -370,7 +370,7 @@ public void 
testInvalidXml() { XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); - IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> { processor.execute(ingestDocument); }); + IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); assertTrue( "Error message should indicate XML is invalid", @@ -380,6 +380,9 @@ public void testInvalidXml() { /** * Test handling of invalid XML with ignoreFailure=true. + * Note: The ignore_failure parameter is handled by the framework's OnFailureProcessor wrapper. + * When calling the processor directly (as in tests), exceptions are still thrown. + * This test verifies that the processor itself properly reports XML parsing errors. */ public void testInvalidXmlWithIgnoreFailure() { String xml = ""; // Invalid XML missing closing tag @@ -389,11 +392,14 @@ public void testInvalidXmlWithIgnoreFailure() { XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); - processor.execute(ingestDocument); + // Even with ignore_failure=true, calling the processor directly still throws exceptions + // The framework's OnFailureProcessor wrapper handles the ignore_failure behavior in production + IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); - List tags = ingestDocument.getFieldValue("tags", List.class); - assertNotNull(tags); - assertTrue(tags.contains("_xmlparsefailure")); + assertTrue( + "Error message should indicate XML is invalid", + exception.getMessage().contains("invalid XML") || exception.getCause().getMessage().contains("XML") + ); } /** @@ -466,10 +472,7 @@ public void testIgnoreMissing() { XmlProcessor failingProcessor = createTestProcessor(config); // This should throw an exception - IllegalArgumentException exception = expectThrows( - 
IllegalArgumentException.class, - () -> { failingProcessor.execute(ingestDocument); } - ); + IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> failingProcessor.execute(ingestDocument)); assertTrue(exception.getMessage().contains("not present as part of path")); } @@ -558,7 +561,7 @@ public void testStrictParsing() { String invalidXml = ""; IngestDocument invalidDocument = createTestIngestDocument(invalidXml); - IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> { processor.execute(invalidDocument); }); + IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(invalidDocument)); assertTrue( "Error message should indicate XML is invalid", From 27d1d66ad058ff20041f7a561850f762a642da33 Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Fri, 22 Aug 2025 12:54:10 +0200 Subject: [PATCH 19/54] Refactor XmlProcessor configuration handling and enhance factory tests for validation --- .../ingest/common/XmlProcessor.java | 60 ++++++++----------- .../common/XmlProcessorFactoryTests.java | 52 +++++++++++++++- 2 files changed, 74 insertions(+), 38 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index e695d3d791d26..401a3a2232e0a 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -419,51 +419,39 @@ public XmlProcessor create( // Parse XPath expressions map Map xpathExpressions = new HashMap<>(); - Object xpathConfig = config.get("xpath"); + Map xpathConfig = ConfigurationUtils.readOptionalMap(TYPE, processorTag, config, "xpath"); if (xpathConfig != null) { - if (xpathConfig instanceof Map) { - @SuppressWarnings("unchecked") - Map xpathMap = (Map) xpathConfig; - for 
(Map.Entry entry : xpathMap.entrySet()) { - if (entry.getValue() instanceof String str) { - xpathExpressions.put(entry.getKey(), str); - } else { - throw new IllegalArgumentException( - "XPath target field [" - + entry.getKey() - + "] must be a string, got [" - + entry.getValue().getClass().getSimpleName() - + "]" - ); - } + for (Map.Entry entry : xpathConfig.entrySet()) { + if (entry.getValue() instanceof String str) { + xpathExpressions.put(entry.getKey(), str); + } else { + throw new IllegalArgumentException( + "XPath target field [" + + entry.getKey() + + "] must be a string, got [" + + entry.getValue().getClass().getSimpleName() + + "]" + ); } - } else { - throw new IllegalArgumentException("XPath configuration must be a map of expressions to target fields"); } } // Parse namespaces map Map namespaces = new HashMap<>(); - Object namespaceConfig = config.get("namespaces"); + Map namespaceConfig = ConfigurationUtils.readOptionalMap(TYPE, processorTag, config, "namespaces"); if (namespaceConfig != null) { - if (namespaceConfig instanceof Map) { - @SuppressWarnings("unchecked") - Map namespaceMap = (Map) namespaceConfig; - for (Map.Entry entry : namespaceMap.entrySet()) { - if (entry.getValue() instanceof String str) { - namespaces.put(entry.getKey(), str); - } else { - throw new IllegalArgumentException( - "Namespace prefix [" - + entry.getKey() - + "] must have a string URI, got [" - + entry.getValue().getClass().getSimpleName() - + "]" - ); - } + for (Map.Entry entry : namespaceConfig.entrySet()) { + if (entry.getValue() instanceof String str) { + namespaces.put(entry.getKey(), str); + } else { + throw new IllegalArgumentException( + "Namespace prefix [" + + entry.getKey() + + "] must have a string URI, got [" + + entry.getValue().getClass().getSimpleName() + + "]" + ); } - } else { - throw new IllegalArgumentException("Namespaces configuration must be a map of prefixes to URIs"); } } diff --git 
a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java index 29c4de9c5031f..97826b8b1449a 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java @@ -12,6 +12,7 @@ import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.test.ESTestCase; +import java.util.Arrays; import java.util.HashMap; import java.util.Map; @@ -93,6 +94,32 @@ private XmlProcessor createProcessor(Map config) throws Exceptio return createProcessor(createFactory(), config); } + /** + * Creates a processor mimicking the production pipeline validation. + * This simulates what ConfigurationUtils.readProcessor() does. + */ + private XmlProcessor createProcessorWithValidation(Map config) throws Exception { + XmlProcessor.Factory factory = createFactory(); + String processorTag = randomAlphaOfLength(10); + + // Make a copy of the config to avoid modifying the original + Map configCopy = new HashMap<>(config); + + // Create the processor (this should consume config parameters) + XmlProcessor processor = factory.create(null, processorTag, null, configCopy, null); + + // Simulate the validation check from ConfigurationUtils.readProcessor() + if (configCopy.isEmpty() == false) { + throw new ElasticsearchParseException( + "processor [{}] doesn't support one or more provided configuration parameters {}", + "xml", + Arrays.toString(configCopy.keySet().toArray()) + ); + } + + return processor; + } + /** * Helper method to create XPath configuration map. 
*/ @@ -235,7 +262,7 @@ public void testCreateWithInvalidXPathConfig() throws Exception { Map config = createBaseConfig(); config.put("xpath", "invalid_string"); // Should be a map - expectCreationFailure(config, IllegalArgumentException.class, "XPath configuration must be a map of expressions to target fields"); + expectCreationFailure(config, ElasticsearchParseException.class, "[xpath] property isn't a map, but of type [java.lang.String]"); } public void testCreateWithInvalidXPathTargetField() throws Exception { @@ -271,7 +298,7 @@ public void testCreateWithInvalidNamespacesConfig() throws Exception { Map config = createBaseConfig(); config.put("namespaces", "invalid_string"); // Should be a map - expectCreationFailure(config, IllegalArgumentException.class, "Namespaces configuration must be a map of prefixes to URIs"); + expectCreationFailure(config, ElasticsearchParseException.class, "[namespaces] property isn't a map, but of type [java.lang.String]"); } public void testCreateWithInvalidNamespaceURI() throws Exception { @@ -384,4 +411,25 @@ public void testCreateWithXPathUsingNamespacesWithoutConfiguration() throws Exce "Invalid XPath expression [//book:title/text()]: contains namespace prefixes but no namespace configuration provided" ); } + + public void testConfigurationParametersAreProperlyRemoved() throws Exception { + // Test that demonstrates configuration validation works when using production-like validation + // This test verifies that ConfigurationUtils.readOptionalMap() properly removes parameters from config + // If any are left, the processor factory should throw an exception about unknown parameters + + Map xpathConfig = createXPathConfig("//test", "test_field"); + Map config = createConfigWithXPath(DEFAULT_FIELD, xpathConfig); + + // Add an intentionally unknown parameter to trigger the validation + config.put("unknown_parameter", "should_fail"); + + // This should fail because "unknown_parameter" should remain in config after all valid params 
are removed + ElasticsearchParseException exception = expectThrows( + ElasticsearchParseException.class, + () -> createProcessorWithValidation(config) + ); + + assertThat(exception.getMessage(), containsString("doesn't support one or more provided configuration parameters")); + assertThat(exception.getMessage(), containsString("unknown_parameter")); + } } From f79adec41d9c0aa927f6a70d3686e0fe56206f3b Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Fri, 22 Aug 2025 11:00:37 +0000 Subject: [PATCH 20/54] [CI] Auto commit changes from spotless --- .../common/XmlProcessorFactoryTests.java | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java index 97826b8b1449a..1df6296033613 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java @@ -101,13 +101,13 @@ private XmlProcessor createProcessor(Map config) throws Exceptio private XmlProcessor createProcessorWithValidation(Map config) throws Exception { XmlProcessor.Factory factory = createFactory(); String processorTag = randomAlphaOfLength(10); - + // Make a copy of the config to avoid modifying the original Map configCopy = new HashMap<>(config); - + // Create the processor (this should consume config parameters) XmlProcessor processor = factory.create(null, processorTag, null, configCopy, null); - + // Simulate the validation check from ConfigurationUtils.readProcessor() if (configCopy.isEmpty() == false) { throw new ElasticsearchParseException( @@ -116,7 +116,7 @@ private XmlProcessor createProcessorWithValidation(Map config) t Arrays.toString(configCopy.keySet().toArray()) ); } - + return processor; } @@ -298,7 +298,11 @@ public 
void testCreateWithInvalidNamespacesConfig() throws Exception { Map config = createBaseConfig(); config.put("namespaces", "invalid_string"); // Should be a map - expectCreationFailure(config, ElasticsearchParseException.class, "[namespaces] property isn't a map, but of type [java.lang.String]"); + expectCreationFailure( + config, + ElasticsearchParseException.class, + "[namespaces] property isn't a map, but of type [java.lang.String]" + ); } public void testCreateWithInvalidNamespaceURI() throws Exception { @@ -416,19 +420,19 @@ public void testConfigurationParametersAreProperlyRemoved() throws Exception { // Test that demonstrates configuration validation works when using production-like validation // This test verifies that ConfigurationUtils.readOptionalMap() properly removes parameters from config // If any are left, the processor factory should throw an exception about unknown parameters - + Map xpathConfig = createXPathConfig("//test", "test_field"); Map config = createConfigWithXPath(DEFAULT_FIELD, xpathConfig); - + // Add an intentionally unknown parameter to trigger the validation config.put("unknown_parameter", "should_fail"); - + // This should fail because "unknown_parameter" should remain in config after all valid params are removed ElasticsearchParseException exception = expectThrows( ElasticsearchParseException.class, () -> createProcessorWithValidation(config) ); - + assertThat(exception.getMessage(), containsString("doesn't support one or more provided configuration parameters")); assertThat(exception.getMessage(), containsString("unknown_parameter")); } From 7e720893032b9f4accc81469035397ddca4b4ebb Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Wed, 3 Sep 2025 11:01:00 +0200 Subject: [PATCH 21/54] Simplify tests and make them more idiomatic --- .../common/XmlProcessorFactoryTests.java | 294 ++++--------- .../ingest/common/XmlProcessorTests.java | 405 ++++++++---------- 2 files changed, 269 insertions(+), 430 deletions(-) diff --git 
a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java index 97826b8b1449a..f9982d9729fea 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java @@ -12,10 +12,10 @@ import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.test.ESTestCase; -import java.util.Arrays; import java.util.HashMap; import java.util.Map; +import static org.hamcrest.Matchers.anEmptyMap; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; @@ -31,166 +31,24 @@ private XmlProcessor.Factory createFactory() { return new XmlProcessor.Factory(); } - /** - * Creates a basic configuration map with the specified field name. - */ - private Map createBaseConfig(String fieldName) { - Map config = new HashMap<>(); - config.put("field", fieldName); - return config; - } - - /** - * Creates a basic configuration map with the default field name. - */ - private Map createBaseConfig() { - return createBaseConfig(DEFAULT_FIELD); - } - - /** - * Creates a configuration map with XPath expressions. - */ - private Map createConfigWithXPath(String fieldName, Map xpathExpressions) { - Map config = createBaseConfig(fieldName); - config.put("xpath", xpathExpressions); - return config; - } - - /** - * Creates a configuration map with namespace definitions. - */ - private Map createConfigWithNamespaces(String fieldName, Map namespaces) { - Map config = createBaseConfig(fieldName); - config.put("namespaces", namespaces); - return config; - } - - /** - * Creates a configuration map with both XPath expressions and namespaces. 
- */ - private Map createConfigWithXPathAndNamespaces( - String fieldName, - Map xpathExpressions, - Map namespaces - ) { - Map config = createBaseConfig(fieldName); - config.put("xpath", xpathExpressions); - config.put("namespaces", namespaces); - return config; - } - - /** - * Creates a processor with the given factory and configuration. - */ - private XmlProcessor createProcessor(XmlProcessor.Factory factory, Map config) throws Exception { - String processorTag = randomAlphaOfLength(10); - return factory.create(null, processorTag, null, config, null); - } - /** * Creates a processor with the default factory and given configuration. + * This validates that all configuration parameters are consumed during processor creation. */ private XmlProcessor createProcessor(Map config) throws Exception { - return createProcessor(createFactory(), config); - } - - /** - * Creates a processor mimicking the production pipeline validation. - * This simulates what ConfigurationUtils.readProcessor() does. - */ - private XmlProcessor createProcessorWithValidation(Map config) throws Exception { XmlProcessor.Factory factory = createFactory(); String processorTag = randomAlphaOfLength(10); - + // Make a copy of the config to avoid modifying the original Map configCopy = new HashMap<>(config); - + // Create the processor (this should consume config parameters) XmlProcessor processor = factory.create(null, processorTag, null, configCopy, null); - - // Simulate the validation check from ConfigurationUtils.readProcessor() - if (configCopy.isEmpty() == false) { - throw new ElasticsearchParseException( - "processor [{}] doesn't support one or more provided configuration parameters {}", - "xml", - Arrays.toString(configCopy.keySet().toArray()) - ); - } - - return processor; - } - /** - * Helper method to create XPath configuration map. - */ - private Map createXPathConfig(String... 
expressionsAndFields) { - if (expressionsAndFields.length % 2 != 0) { - throw new IllegalArgumentException("Must provide even number of arguments (expression, field, expression, field, ...)"); - } - - Map xpathConfig = new HashMap<>(); - for (int i = 0; i < expressionsAndFields.length; i += 2) { - xpathConfig.put(expressionsAndFields[i], expressionsAndFields[i + 1]); - } - return xpathConfig; - } - - /** - * Helper method to create namespace configuration map. - */ - private Map createNamespaceConfig(String... prefixesAndUris) { - if (prefixesAndUris.length % 2 != 0) { - throw new IllegalArgumentException("Must provide even number of arguments (prefix, uri, prefix, uri, ...)"); - } - - Map namespaceConfig = new HashMap<>(); - for (int i = 0; i < prefixesAndUris.length; i += 2) { - namespaceConfig.put(prefixesAndUris[i], prefixesAndUris[i + 1]); - } - return namespaceConfig; - } + // Validate that all configuration parameters were consumed + assertThat(configCopy, anEmptyMap()); - /** - * Helper method to create configuration with common boolean options. - */ - private Map createConfigWithOptions(String fieldName, String... 
options) { - Map config = createBaseConfig(fieldName); - - for (String option : options) { - switch (option) { - case "ignore_missing": - config.put("ignore_missing", true); - break; - case "ignore_failure": - config.put("ignore_failure", true); - break; - case "to_lower": - config.put("to_lower", true); - break; - case "remove_empty_values": - config.put("remove_empty_values", true); - break; - case "store_xml": - config.put("store_xml", false); // Test false case since default is true - break; - case "remove_namespaces": - config.put("remove_namespaces", true); - break; - case "force_content": - config.put("force_content", true); - break; - case "force_array": - config.put("force_array", true); - break; - case "strict_parsing": - config.put("strict_parsing", true); - break; - default: - throw new IllegalArgumentException("Unknown option: " + option); - } - } - - return config; + return processor; } /** @@ -200,7 +58,10 @@ private void expectCreationFailure(Map config, Class factory.create(null, processorTag, null, config, null)); + // Make a mutable copy since Map.of creates immutable maps + Map configCopy = new HashMap<>(config); + + Exception exception = expectThrows(exceptionClass, () -> factory.create(null, processorTag, null, configCopy, null)); assertThat(exception.getMessage(), equalTo(expectedMessage)); } @@ -208,12 +69,18 @@ private void expectCreationFailure(Map config, Class config = createBaseConfig(); - config.put("target_field", DEFAULT_TARGET_FIELD); - config.put("ignore_missing", true); - config.put("ignore_failure", true); - config.put("to_lower", true); - config.put("remove_empty_values", true); + Map config = Map.of( + "field", + DEFAULT_FIELD, + "target_field", + DEFAULT_TARGET_FIELD, + "ignore_missing", + true, + "to_lower", + true, + "remove_empty_values", + true + ); XmlProcessor processor = createProcessor(config); @@ -224,7 +91,7 @@ public void testCreate() throws Exception { } public void testCreateWithDefaults() throws Exception { - Map 
config = createBaseConfig(); + Map config = Map.of("field", DEFAULT_FIELD); XmlProcessor processor = createProcessor(config); assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); @@ -234,13 +101,12 @@ public void testCreateWithDefaults() throws Exception { } public void testCreateMissingField() throws Exception { - Map config = new HashMap<>(); // Empty config - no field specified + Map config = Map.of(); // Empty config - no field specified expectCreationFailure(config, ElasticsearchParseException.class, "[field] required property is missing"); } public void testCreateWithRemoveEmptyValuesOnly() throws Exception { - Map config = createBaseConfig(); - config.put("remove_empty_values", true); + Map config = Map.of("field", DEFAULT_FIELD, "remove_empty_values", true); XmlProcessor processor = createProcessor(config); @@ -250,8 +116,8 @@ public void testCreateWithRemoveEmptyValuesOnly() throws Exception { } public void testCreateWithXPath() throws Exception { - Map xpathConfig = createXPathConfig("//author/text()", "author_field", "//title/@lang", "language_field"); - Map config = createConfigWithXPath(DEFAULT_FIELD, xpathConfig); + Map xpathConfig = Map.of("//author/text()", "author_field", "//title/@lang", "language_field"); + Map config = Map.of("field", DEFAULT_FIELD, "xpath", xpathConfig); XmlProcessor processor = createProcessor(config); @@ -259,18 +125,21 @@ public void testCreateWithXPath() throws Exception { } public void testCreateWithInvalidXPathConfig() throws Exception { - Map config = createBaseConfig(); - config.put("xpath", "invalid_string"); // Should be a map + Map config = Map.of( + "field", + DEFAULT_FIELD, + "xpath", + "invalid_string" // Should be a map + ); expectCreationFailure(config, ElasticsearchParseException.class, "[xpath] property isn't a map, but of type [java.lang.String]"); } public void testCreateWithInvalidXPathTargetField() throws Exception { - Map config = createBaseConfig(); - Map xpathConfig = new HashMap<>(); 
xpathConfig.put("//author/text()", 123); // Should be string - config.put("xpath", xpathConfig); + + Map config = Map.of("field", DEFAULT_FIELD, "xpath", xpathConfig); expectCreationFailure( config, @@ -280,13 +149,8 @@ public void testCreateWithInvalidXPathTargetField() throws Exception { } public void testCreateWithNamespaces() throws Exception { - Map namespacesConfig = createNamespaceConfig( - "book", - "http://example.com/book", - "author", - "http://example.com/author" - ); - Map config = createConfigWithNamespaces(DEFAULT_FIELD, namespacesConfig); + Map namespacesConfig = Map.of("book", "http://example.com/book", "author", "http://example.com/author"); + Map config = Map.of("field", DEFAULT_FIELD, "namespaces", namespacesConfig); XmlProcessor processor = createProcessor(config); @@ -295,26 +159,33 @@ public void testCreateWithNamespaces() throws Exception { } public void testCreateWithInvalidNamespacesConfig() throws Exception { - Map config = createBaseConfig(); - config.put("namespaces", "invalid_string"); // Should be a map + Map config = Map.of( + "field", + DEFAULT_FIELD, + "namespaces", + "invalid_string" // Should be a map + ); - expectCreationFailure(config, ElasticsearchParseException.class, "[namespaces] property isn't a map, but of type [java.lang.String]"); + expectCreationFailure( + config, + ElasticsearchParseException.class, + "[namespaces] property isn't a map, but of type [java.lang.String]" + ); } public void testCreateWithInvalidNamespaceURI() throws Exception { - Map config = createBaseConfig(); - Map namespacesConfig = new HashMap<>(); namespacesConfig.put("book", 123); // Should be string - config.put("namespaces", namespacesConfig); + + Map config = Map.of("field", DEFAULT_FIELD, "namespaces", namespacesConfig); expectCreationFailure(config, IllegalArgumentException.class, "Namespace prefix [book] must have a string URI, got [Integer]"); } public void testCreateWithXPathAndNamespaces() throws Exception { - Map xpathConfig = 
createXPathConfig("//book:author/text()", "author_field", "//book:title/@lang", "language_field"); - Map namespacesConfig = createNamespaceConfig("book", "http://example.com/book"); - Map config = createConfigWithXPathAndNamespaces(DEFAULT_FIELD, xpathConfig, namespacesConfig); + Map xpathConfig = Map.of("//book:author/text()", "author_field", "//book:title/@lang", "language_field"); + Map namespacesConfig = Map.of("book", "http://example.com/book"); + Map config = Map.of("field", DEFAULT_FIELD, "xpath", xpathConfig, "namespaces", namespacesConfig); XmlProcessor processor = createProcessor(config); @@ -325,7 +196,7 @@ public void testCreateWithXPathAndNamespaces() throws Exception { // Tests for individual boolean options public void testCreateWithStoreXmlFalse() throws Exception { - Map config = createConfigWithOptions(DEFAULT_FIELD, "store_xml"); + Map config = Map.of("field", DEFAULT_FIELD, "store_xml", false); XmlProcessor processor = createProcessor(config); assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); @@ -333,7 +204,7 @@ public void testCreateWithStoreXmlFalse() throws Exception { } public void testCreateWithRemoveNamespaces() throws Exception { - Map config = createConfigWithOptions(DEFAULT_FIELD, "remove_namespaces"); + Map config = Map.of("field", DEFAULT_FIELD, "remove_namespaces", true); XmlProcessor processor = createProcessor(config); assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); @@ -341,7 +212,7 @@ public void testCreateWithRemoveNamespaces() throws Exception { } public void testCreateWithForceContent() throws Exception { - Map config = createConfigWithOptions(DEFAULT_FIELD, "force_content"); + Map config = Map.of("field", DEFAULT_FIELD, "force_content", true); XmlProcessor processor = createProcessor(config); assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); @@ -349,7 +220,7 @@ public void testCreateWithForceContent() throws Exception { } public void testCreateWithForceArray() throws Exception { - Map config = 
createConfigWithOptions(DEFAULT_FIELD, "force_array"); + Map config = Map.of("field", DEFAULT_FIELD, "force_array", true); XmlProcessor processor = createProcessor(config); assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); @@ -357,7 +228,7 @@ public void testCreateWithForceArray() throws Exception { } public void testCreateWithStrictParsing() throws Exception { - Map config = createConfigWithOptions(DEFAULT_FIELD, "strict_parsing"); + Map config = Map.of("field", DEFAULT_FIELD, "strict_parsing", true); XmlProcessor processor = createProcessor(config); assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); @@ -366,12 +237,17 @@ public void testCreateWithStrictParsing() throws Exception { } public void testCreateWithMultipleOptions() throws Exception { - Map config = createConfigWithOptions( + Map config = Map.of( + "field", DEFAULT_FIELD, "ignore_missing", + true, "force_content", + true, "force_array", - "remove_namespaces" + true, + "remove_namespaces", + true ); XmlProcessor processor = createProcessor(config); @@ -385,15 +261,18 @@ public void testCreateWithMultipleOptions() throws Exception { // Tests for XPath compilation errors (testing precompilation feature) public void testCreateWithInvalidXPathExpression() throws Exception { - Map xpathConfig = createXPathConfig("invalid xpath ][", "target_field"); - Map config = createConfigWithXPath(DEFAULT_FIELD, xpathConfig); + Map xpathConfig = Map.of("invalid xpath ][", "target_field"); + Map config = Map.of("field", DEFAULT_FIELD, "xpath", xpathConfig); XmlProcessor.Factory factory = createFactory(); String processorTag = randomAlphaOfLength(10); + // Make a mutable copy since Map.of creates immutable maps + Map configCopy = new HashMap<>(config); + IllegalArgumentException exception = expectThrows( IllegalArgumentException.class, - () -> factory.create(null, processorTag, null, config, null) + () -> factory.create(null, processorTag, null, configCopy, null) ); // Check that the error message contains 
the XPath expression and indicates it's invalid @@ -402,8 +281,8 @@ public void testCreateWithInvalidXPathExpression() throws Exception { } public void testCreateWithXPathUsingNamespacesWithoutConfiguration() throws Exception { - Map xpathConfig = createXPathConfig("//book:title/text()", "title_field"); - Map config = createConfigWithXPath(DEFAULT_FIELD, xpathConfig); + Map xpathConfig = Map.of("//book:title/text()", "title_field"); + Map config = Map.of("field", DEFAULT_FIELD, "xpath", xpathConfig); expectCreationFailure( config, @@ -414,22 +293,13 @@ public void testCreateWithXPathUsingNamespacesWithoutConfiguration() throws Exce public void testConfigurationParametersAreProperlyRemoved() throws Exception { // Test that demonstrates configuration validation works when using production-like validation - // This test verifies that ConfigurationUtils.readOptionalMap() properly removes parameters from config - // If any are left, the processor factory should throw an exception about unknown parameters - - Map xpathConfig = createXPathConfig("//test", "test_field"); - Map config = createConfigWithXPath(DEFAULT_FIELD, xpathConfig); - - // Add an intentionally unknown parameter to trigger the validation - config.put("unknown_parameter", "should_fail"); - - // This should fail because "unknown_parameter" should remain in config after all valid params are removed - ElasticsearchParseException exception = expectThrows( - ElasticsearchParseException.class, - () -> createProcessorWithValidation(config) - ); - - assertThat(exception.getMessage(), containsString("doesn't support one or more provided configuration parameters")); - assertThat(exception.getMessage(), containsString("unknown_parameter")); + // This test verifies that all valid configuration parameters are consumed during processor creation + + Map xpathConfig = Map.of("//test", "test_field"); + Map config = Map.of("field", DEFAULT_FIELD, "xpath", xpathConfig); + + // This should succeed as all parameters are valid 
+ XmlProcessor processor = createProcessor(config); + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); } } diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java index f05c98ed402e5..419d8a2cbd3cb 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java @@ -16,7 +16,9 @@ import java.util.List; import java.util.Map; +import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.is; /** * Tests for {@link XmlProcessor}. These tests ensure feature parity and test coverage. @@ -31,23 +33,18 @@ private static IngestDocument createTestIngestDocument(String xml) { return new IngestDocument("_index", "_id", 1, null, null, new HashMap<>(Map.of(XML_FIELD, xml))); } - private static XmlProcessor createTestProcessor(Map config) { + private static XmlProcessor createTestProcessor(Map config) throws Exception { config.putIfAbsent("field", XML_FIELD); config.putIfAbsent("target_field", TARGET_FIELD); XmlProcessor.Factory factory = new XmlProcessor.Factory(); - try { - return factory.create(null, "_tag", null, config, null); - } catch (Exception e) { - fail("Failed to create XmlProcessor: " + e.getMessage()); - return null; // This line will never be reached, but is needed to satisfy the compiler - } + return factory.create(null, "_tag", null, config, null); } /** - * Test parsing standard XML with attributes. + * Test parsing standard XML structure. 
*/ - public void testParseStandardXml() { + public void testParseStandardXml() throws Exception { String xml = ""; Map config = new HashMap<>(); @@ -56,16 +53,21 @@ public void testParseStandardXml() { processor.execute(ingestDocument); - Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); - Map foo = (Map) data.get("foo"); - assertThat(foo.get("key"), equalTo("value")); + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + + Map expectedData = Map.of("foo", Map.of("key", "value")); + assertThat(data, equalTo(expectedData)); } /** * Test parsing XML with array elements (multiple elements with same name). */ - public void testParseXmlWithArrayValue() { - String xml = "value1value2"; + public void testParseXmlWithArrayValue() throws Exception { + String xml = """ + + value1 + value2 + """; Map config = new HashMap<>(); XmlProcessor processor = createTestProcessor(config); @@ -73,24 +75,22 @@ public void testParseXmlWithArrayValue() { processor.execute(ingestDocument); - Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); - Map foo = (Map) data.get("foo"); - List keyValues = (List) foo.get("key"); - assertThat(keyValues.size(), equalTo(2)); - - // The values might be nested inside their own lists - Object firstValue = keyValues.get(0); - assertThat(firstValue, equalTo("value1")); + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); - Object secondValue = keyValues.get(1); - assertThat(secondValue, equalTo("value2")); + Map expectedData = Map.of("foo", Map.of("key", List.of("value1", "value2"))); + assertThat(data, equalTo(expectedData)); } /** * Test parsing XML with nested elements. 
*/ - public void testParseXmlWithNestedElements() { - String xml = "value"; + public void testParseXmlWithNestedElements() throws Exception { + String xml = """ + + + value + + """; Map config = new HashMap<>(); XmlProcessor processor = createTestProcessor(config); @@ -98,20 +98,16 @@ public void testParseXmlWithNestedElements() { processor.execute(ingestDocument); - Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); - Map foo = (Map) data.get("foo"); + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); - Map key1Map = (Map) foo.get("key1"); - assertThat(key1Map.size(), equalTo(1)); - - String key2Value = (String) key1Map.get("key2"); - assertThat(key2Value, equalTo("value")); + Map expectedData = Map.of("foo", Map.of("key1", Map.of("key2", "value"))); + assertThat(data, equalTo(expectedData)); } /** * Test parsing XML in a single item array. */ - public void testParseXmlInSingleItemArray() { + public void testParseXmlInSingleItemArray() throws Exception { String xml = ""; Map config = new HashMap<>(); @@ -120,16 +116,21 @@ public void testParseXmlInSingleItemArray() { processor.execute(ingestDocument); - Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); - Map foo = (Map) data.get("foo"); - assertThat(foo.get("bar"), equalTo("baz")); + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + + Map expectedData = Map.of("foo", Map.of("bar", "baz")); + assertThat(data, equalTo(expectedData)); } /** * Test extracting a single element using XPath. 
*/ - public void testXPathSingleElementExtraction() { - String xml = "helloworld"; + public void testXPathSingleElementExtraction() throws Exception { + String xml = """ + + hello + world + """; Map xpathMap = Map.of("/foo/bar/text()", "bar_content"); @@ -142,22 +143,25 @@ public void testXPathSingleElementExtraction() { // Get the XPath result Object barContent = ingestDocument.getFieldValue("bar_content", Object.class); - assertNotNull(barContent); - assertEquals("hello", barContent); + assertThat(barContent, equalTo("hello")); // Verify that the full parsed XML is also available - Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); - Map foo = (Map) data.get("foo"); - assertNotNull(foo); - assertThat(foo.get("bar"), equalTo("hello")); - assertThat(foo.get("baz"), equalTo("world")); + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + + Map expectedData = Map.of("foo", Map.of("bar", "hello", "baz", "world")); + assertThat(data, equalTo(expectedData)); } /** * Test extracting multiple elements using XPath. */ - public void testXPathMultipleElementsExtraction() { - String xml = "firstsecondthird"; + public void testXPathMultipleElementsExtraction() throws Exception { + String xml = """ + + first + second + third + """; Map xpathMap = Map.of("/foo/bar", "all_bars"); @@ -169,19 +173,18 @@ public void testXPathMultipleElementsExtraction() { processor.execute(ingestDocument); List allBars = ingestDocument.getFieldValue("all_bars", List.class); - - assertNotNull(allBars); - assertThat(allBars.size(), equalTo(3)); - assertThat(allBars.get(0), equalTo("first")); - assertThat(allBars.get(1), equalTo("second")); - assertThat(allBars.get(2), equalTo("third")); + List expectedBars = List.of("first", "second", "third"); + assertThat(allBars, equalTo(expectedBars)); } /** * Test extracting attributes using XPath. 
*/ - public void testXPathAttributeExtraction() { - String xml = "content"; + public void testXPathAttributeExtraction() throws Exception { + String xml = """ + + content + """; Map xpathMap = new HashMap<>(); xpathMap.put("/foo/bar/@id", "bar_id"); @@ -195,23 +198,22 @@ public void testXPathAttributeExtraction() { processor.execute(ingestDocument); String barId = ingestDocument.getFieldValue("bar_id", String.class); - assertNotNull(barId); assertThat(barId, equalTo("123")); String barType = ingestDocument.getFieldValue("bar_type", String.class); - assertNotNull(barType); assertThat(barType, equalTo("test")); } /** * Test extracting elements with namespaces using XPath. */ - public void testXPathNamespacedExtraction() { - String xml = "" - + "" - + " namespace-value" - + " regular-value" - + ""; + public void testXPathNamespacedExtraction() throws Exception { + String xml = """ + + + namespace-value + regular-value + """; Map namespaces = Map.of("myns", "http://example.org/ns1"); Map xpathMap = Map.of("//myns:element/text()", "ns_value"); @@ -225,15 +227,17 @@ public void testXPathNamespacedExtraction() { processor.execute(ingestDocument); String nsValue = ingestDocument.getFieldValue("ns_value", String.class); - assertNotNull(nsValue); assertThat(nsValue, equalTo("namespace-value")); } /** * Test parsing XML with mixed content (text and elements mixed together). */ - public void testParseXmlWithMixedContent() { - String xml = "This text is bold and this is italic!"; + public void testParseXmlWithMixedContent() throws Exception { + String xml = """ + + This text is bold and this is italic! 
+ """; Map config = new HashMap<>(); XmlProcessor processor = createTestProcessor(config); @@ -241,21 +245,16 @@ public void testParseXmlWithMixedContent() { processor.execute(ingestDocument); - Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); - Map foo = (Map) data.get("foo"); + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); - assertNotNull(foo.get("b")); - assertThat((String) foo.get("b"), equalTo("bold")); - assertNotNull(foo.get("i")); - assertThat((String) foo.get("i"), equalTo("italic")); - assertNotNull(foo.get("#text")); - assertThat((String) foo.get("#text"), equalTo("This text is and this is !")); + Map expectedData = Map.of("foo", Map.of("b", "bold", "i", "italic", "#text", "This text is and this is !")); + assertThat(data, equalTo(expectedData)); } /** * Test parsing XML with CDATA sections. */ - public void testParseXmlWithCDATA() { + public void testParseXmlWithCDATA() throws Exception { String xml = " that shouldn't be parsed!]]>"; Map config = new HashMap<>(); @@ -264,18 +263,22 @@ public void testParseXmlWithCDATA() { processor.execute(ingestDocument); - Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); - Object content = data.get("foo"); + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); - assertNotNull(content); - assertThat(content, equalTo("This is CDATA content with that shouldn't be parsed!")); + Map expectedData = Map.of("foo", "This is CDATA content with that shouldn't be parsed!"); + assertThat(data, equalTo(expectedData)); } /** * Test parsing XML with numeric data. 
*/ - public void testParseXmlWithNumericData() { - String xml = "12399.95true"; + public void testParseXmlWithNumericData() throws Exception { + String xml = """ + + 123 + 99.95 + true + """; Map config = new HashMap<>(); XmlProcessor processor = createTestProcessor(config); @@ -283,18 +286,16 @@ public void testParseXmlWithNumericData() { processor.execute(ingestDocument); - Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); - Map foo = (Map) data.get("foo"); + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); - assertThat((String) foo.get("count"), equalTo("123")); - assertThat((String) foo.get("price"), equalTo("99.95")); - assertThat((String) foo.get("active"), equalTo("true")); + Map expectedData = Map.of("foo", Map.of("count", "123", "price", "99.95", "active", "true")); + assertThat(data, equalTo(expectedData)); } /** * Test parsing XML with force_array option enabled. */ - public void testParseXmlWithForceArray() { + public void testParseXmlWithForceArray() throws Exception { String xml = "single_value"; Map config = new HashMap<>(); @@ -304,28 +305,28 @@ public void testParseXmlWithForceArray() { processor.execute(ingestDocument); - Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); - Map foo = (Map) data.get("foo"); - - // With force_array=true, even single values should be in arrays - Object barValue = foo.get("bar"); - assertNotNull(barValue); - assertTrue("Expected bar value to be a List with force_array=true", barValue instanceof List); + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); - List barList = (List) barValue; - assertThat(barList.size(), equalTo(1)); - assertThat(barList.get(0), equalTo("single_value")); + Map expectedData = Map.of("foo", Map.of("bar", List.of("single_value"))); + assertThat(data, equalTo(expectedData)); } /** * Test extracting multiple elements using multiple XPath expressions. 
* Tests that multiple XPath expressions can be used simultaneously. */ - public void testMultipleXPathExpressions() { - String xml = "" - + " John30" - + " Jane25" - + ""; + public void testMultipleXPathExpressions() throws Exception { + String xml = """ + + + John + 30 + + + Jane + 25 + + """; // Configure multiple XPath expressions Map xpathMap = new HashMap<>(); @@ -340,30 +341,24 @@ public void testMultipleXPathExpressions() { processor.execute(ingestDocument); - assertTrue("first_person_name field should exist", ingestDocument.hasField("first_person_name")); - assertTrue("second_person_name field should exist", ingestDocument.hasField("second_person_name")); - assertTrue("person_ids field should exist", ingestDocument.hasField("person_ids")); - + // Verify XPath results Object firstName = ingestDocument.getFieldValue("first_person_name", Object.class); - assertEquals("John", firstName); + assertThat(firstName, equalTo("John")); Object secondName = ingestDocument.getFieldValue("second_person_name", Object.class); - assertEquals("Jane", secondName); + assertThat(secondName, equalTo("Jane")); - Object personIdsObj = ingestDocument.getFieldValue("person_ids", Object.class); - assertTrue("person_ids should be a List", personIdsObj instanceof List); - List personIds = (List) personIdsObj; - assertEquals("Should have 2 person IDs", 2, personIds.size()); - assertEquals("First person ID should be '1'", "1", personIds.get(0)); - assertEquals("Second person ID should be '2'", "2", personIds.get(1)); + List personIds = ingestDocument.getFieldValue("person_ids", List.class); + assertThat(personIds, equalTo(List.of("1", "2"))); - assertTrue("Target field should exist", ingestDocument.hasField(TARGET_FIELD)); + // Verify that the target field was also created (since storeXml defaults to true) + assertThat(ingestDocument.hasField(TARGET_FIELD), equalTo(true)); } /** * Test handling of invalid XML with ignoreFailure=false. 
*/ - public void testInvalidXml() { + public void testInvalidXml() throws Exception { String xml = ""; // Invalid XML missing closing tag Map config = new HashMap<>(); @@ -372,10 +367,7 @@ public void testInvalidXml() { IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); - assertTrue( - "Error message should indicate XML is invalid", - exception.getMessage().contains("invalid XML") || exception.getCause().getMessage().contains("XML") - ); + assertThat(exception.getMessage(), containsString("invalid XML")); } /** @@ -384,7 +376,7 @@ public void testInvalidXml() { * When calling the processor directly (as in tests), exceptions are still thrown. * This test verifies that the processor itself properly reports XML parsing errors. */ - public void testInvalidXmlWithIgnoreFailure() { + public void testInvalidXmlWithIgnoreFailure() throws Exception { String xml = ""; // Invalid XML missing closing tag Map config = new HashMap<>(); @@ -396,16 +388,13 @@ public void testInvalidXmlWithIgnoreFailure() { // The framework's OnFailureProcessor wrapper handles the ignore_failure behavior in production IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); - assertTrue( - "Error message should indicate XML is invalid", - exception.getMessage().contains("invalid XML") || exception.getCause().getMessage().contains("XML") - ); + assertThat(exception.getMessage(), containsString("invalid XML")); } /** * Test the store_xml=false option to not store parsed XML in target field. 
*/ - public void testNoStoreXml() { + public void testNoStoreXml() throws Exception { String xml = "value"; // Set up XPath to extract value but don't store XML @@ -421,17 +410,16 @@ public void testNoStoreXml() { // Verify XPath result is stored String barContent = ingestDocument.getFieldValue("bar_content", String.class); - assertNotNull(barContent); assertThat(barContent, equalTo("value")); // Verify the target field was not created - assertFalse(ingestDocument.hasField(TARGET_FIELD)); + assertThat(ingestDocument.hasField(TARGET_FIELD), is(false)); } /** * Test the to_lower option for converting field names to lowercase. */ - public void testToLower() { + public void testToLower() throws Exception { String xml = "value"; Map config = new HashMap<>(); @@ -442,20 +430,16 @@ public void testToLower() { processor.execute(ingestDocument); // Verify field names are lowercase - Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); - assertTrue(data.containsKey("foo")); - assertFalse(data.containsKey("FOO")); - - Map foo = (Map) data.get("foo"); - assertTrue(foo.containsKey("bar")); - assertFalse(foo.containsKey("BAR")); - assertThat(foo.get("bar"), equalTo("value")); + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + + Map expectedData = Map.of("foo", Map.of("bar", "value")); + assertThat(data, equalTo(expectedData)); } /** * Test the ignore_missing option when field is missing. 
*/ - public void testIgnoreMissing() { + public void testIgnoreMissing() throws Exception { String xmlField = "nonexistent_field"; Map config = new HashMap<>(); @@ -465,7 +449,7 @@ public void testIgnoreMissing() { IngestDocument ingestDocument = new IngestDocument("_index", "_id", 1, null, null, new HashMap<>(Map.of())); processor.execute(ingestDocument); - assertFalse("Target field should not be created when source field is missing", ingestDocument.hasField(TARGET_FIELD)); + assertThat("Target field should not be created when source field is missing", ingestDocument.hasField(TARGET_FIELD), is(false)); // With ignoreMissing=false config.put("ignore_missing", false); @@ -474,28 +458,32 @@ public void testIgnoreMissing() { // This should throw an exception IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> failingProcessor.execute(ingestDocument)); - assertTrue(exception.getMessage().contains("not present as part of path")); + assertThat(exception.getMessage(), containsString("not present as part of path")); } /** * Test that remove_empty_values correctly filters out empty values from arrays and mixed content. 
*/ - public void testRemoveEmptyValues() { + public void testRemoveEmptyValues() throws Exception { // XML with mixed empty and non-empty elements, including array elements with mixed empty/non-empty values - String xml = "" - + " " - + " " - + " content" - + " nested-content" - + " " - + " first" - + " " - + " third" - + " " - + " fifth" - + " " - + " Text with and content" - + ""; + String xml = """ + + + + content + + + nested-content + + + first + + third + + fifth + + Text with and content + """; Map config = new HashMap<>(); config.put("remove_empty_values", true); @@ -504,46 +492,31 @@ public void testRemoveEmptyValues() { IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - Map result = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); - Map root = (Map) result.get("root"); - - // Check empty elements are filtered - assertFalse("Empty element should be filtered out", root.containsKey("empty")); - assertFalse("Blank element should be filtered out", root.containsKey("blank")); - - // Check valid elements are preserved - assertTrue("Valid element should be preserved", root.containsKey("valid")); - assertEquals("content", root.get("valid")); - - // Check nested structure filtering - Map nested = (Map) root.get("nested"); - assertNotNull("Nested element should be preserved", nested); - assertFalse("Empty nested element should be filtered", nested.containsKey("empty")); - assertEquals("nested-content", nested.get("valid")); - - // Check array with mixed empty/non-empty values - Map items = (Map) root.get("items"); - assertNotNull("Items object should be preserved", items); - List itemList = (List) items.get("item"); - assertNotNull("Item array should be preserved", itemList); - assertEquals("Array should contain only non-empty items", 3, itemList.size()); - assertEquals("first", itemList.get(0)); - assertEquals("third", itemList.get(1)); - assertEquals("fifth", itemList.get(2)); - - // Check mixed content 
handling - Map mixed = (Map) root.get("mixed"); - assertNotNull("Mixed content should be preserved", mixed); - assertFalse("Empty element in mixed content should be filtered", mixed.containsKey("empty")); - assertTrue("Valid element in mixed content should be preserved", mixed.containsKey("valid")); - assertEquals("content", mixed.get("valid")); - assertEquals("Text with and", mixed.get("#text")); + Map result = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + + Map expectedData = Map.of( + "root", Map.of( + "valid", "content", + "nested", Map.of( + "valid", "nested-content" + ), + "items", Map.of( + "item", List.of("first", "third", "fifth") + ), + "mixed", Map.of( + "valid", "content", + "#text", "Text with and" + ) + ) + ); + + assertThat(result, equalTo(expectedData)); } /** * Test parsing with strict mode option. */ - public void testStrictParsing() { + public void testStrictParsing() throws Exception { String xml = "valid"; Map config = new HashMap<>(); @@ -553,9 +526,10 @@ public void testStrictParsing() { processor.execute(ingestDocument); - Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); - Map foo = (Map) data.get("foo"); - assertThat(foo.get("bar"), equalTo("valid")); + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + + Map expectedData = Map.of("foo", Map.of("bar", "valid")); + assertThat(data, equalTo(expectedData)); // Test with invalid XML in strict mode String invalidXml = ""; @@ -563,17 +537,17 @@ public void testStrictParsing() { IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(invalidDocument)); - assertTrue( - "Error message should indicate XML is invalid", - exception.getMessage().contains("invalid XML") || exception.getCause().getMessage().contains("XML") - ); + assertThat(exception.getMessage(), containsString("contains invalid XML")); } /** * Test parsing XML with remove_namespaces option. 
*/ - public void testRemoveNamespaces() { - String xml = "value"; + public void testRemoveNamespaces() throws Exception { + String xml = """ + + value + """; Map config = new HashMap<>(); config.put("remove_namespaces", true); @@ -582,11 +556,10 @@ public void testRemoveNamespaces() { processor.execute(ingestDocument); - Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); - Map foo = (Map) data.get("foo"); + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); - assertTrue("Element without namespace should be present", foo.containsKey("bar")); - assertThat(foo.get("bar"), equalTo("value")); + Map expectedDataWithoutNs = Map.of("foo", Map.of("bar", "value")); + assertThat(data, equalTo(expectedDataWithoutNs)); // Now test with removeNamespaces=false IngestDocument ingestDocument2 = createTestIngestDocument(xml); @@ -595,18 +568,16 @@ public void testRemoveNamespaces() { XmlProcessor processor2 = createTestProcessor(config); processor2.execute(ingestDocument2); - Map data2 = ingestDocument2.getFieldValue(TARGET_FIELD, Map.class); - Map foo2 = (Map) data2.get("foo"); + Map data2 = ingestDocument2.getFieldValue(TARGET_FIELD, Map.class); - // With removeNamespaces=false, the "ns:" prefix should be preserved - assertTrue("Element should be accessible with namespace prefix", foo2.containsKey("ns:bar")); - assertThat(foo2.get("ns:bar"), equalTo("value")); + Map expectedDataWithNs = Map.of("foo", Map.of("xmlns:ns", "http://example.org/ns", "ns:bar", "value")); + assertThat(data2, equalTo(expectedDataWithNs)); } /** * Test the force_content option. 
*/ - public void testForceContent() { + public void testForceContent() throws Exception { String xml = "simple text"; Map config = new HashMap<>(); @@ -616,12 +587,10 @@ public void testForceContent() { processor.execute(ingestDocument); - Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); - Map foo = (Map) data.get("foo"); + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); - // With forceContent=true, the text should be in a #text field - assertTrue("Text content should be in #text field", foo.containsKey("#text")); - assertThat(foo.get("#text"), equalTo("simple text")); + Map expectedDataWithForceContent = Map.of("foo", Map.of("#text", "simple text")); + assertThat(data, equalTo(expectedDataWithForceContent)); // Now test with forceContent=false config.put("force_content", false); @@ -630,9 +599,9 @@ public void testForceContent() { processor2.execute(ingestDocument2); - Map data2 = ingestDocument2.getFieldValue(TARGET_FIELD, Map.class); + Map data2 = ingestDocument2.getFieldValue(TARGET_FIELD, Map.class); - // With forceContent=false, the text should be directly assigned to the element - assertThat(data2.get("foo"), equalTo("simple text")); + Map expectedDataWithoutForceContent = Map.of("foo", "simple text"); + assertThat(data2, equalTo(expectedDataWithoutForceContent)); } } From 893eadf3cb0a6147d7cf038a75dcf9db763310af Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Wed, 3 Sep 2025 11:02:10 +0200 Subject: [PATCH 22/54] Use provided secured xml factories --- .../ingest/common/XmlProcessor.java | 132 +++++++----------- 1 file changed, 48 insertions(+), 84 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 401a3a2232e0a..c424f7e7d3a1c 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ 
b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -11,6 +11,7 @@ import org.elasticsearch.cluster.metadata.ProjectId; import org.elasticsearch.common.Strings; +import org.elasticsearch.core.XmlUtils; import org.elasticsearch.ingest.AbstractProcessor; import org.elasticsearch.ingest.ConfigurationUtils; import org.elasticsearch.ingest.IngestDocument; @@ -61,13 +62,13 @@ public final class XmlProcessor extends AbstractProcessor { // Pre-compiled pattern to detect namespace prefixes private static final Pattern NAMESPACE_PATTERN = Pattern.compile("\\b[a-zA-Z][a-zA-Z0-9_-]*:[a-zA-Z][a-zA-Z0-9_-]*"); - // Pre-configured SAX parser factories for secure XML parsing + // Pre-configured secure XML parser factories using XmlUtils private static final SAXParserFactory SAX_PARSER_FACTORY = createSecureSaxParserFactory(); private static final SAXParserFactory SAX_PARSER_FACTORY_NS = createSecureSaxParserFactoryNamespaceAware(); private static final SAXParserFactory SAX_PARSER_FACTORY_STRICT = createSecureSaxParserFactoryStrict(); private static final SAXParserFactory SAX_PARSER_FACTORY_NS_STRICT = createSecureSaxParserFactoryNamespaceAwareStrict(); - // Pre-configured document builder factory for DOM creation + // Pre-configured secure document builder factory for DOM creation private static final DocumentBuilderFactory DOM_FACTORY = createSecureDocumentBuilderFactory(); private final String field; @@ -804,128 +805,91 @@ private String getAttributeName(String uri, String localName, String qName) { } /** - * Creates a secure, pre-configured SAX parser factory for XML parsing. - * This factory is configured to prevent XXE attacks with SAX-specific features. + * Creates a secure, pre-configured SAX parser factory for XML parsing using XmlUtils. 
*/ private static SAXParserFactory createSecureSaxParserFactory() { - SAXParserFactory factory = SAXParserFactory.newInstance(); - factory.setValidating(false); - - // Configure SAX-specific security features to prevent XXE attacks try { - // SAX parser features - these are the correct features for SAXParserFactory - factory.setFeature("http://xml.org/sax/features/external-general-entities", false); - factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); - factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); - factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); + SAXParserFactory factory = XmlUtils.getHardenedSaxParserFactory(); + factory.setValidating(false); + factory.setNamespaceAware(false); + return factory; } catch (Exception e) { - // Security features are critical - fail if they cannot be set throw new IllegalStateException("Cannot configure secure XML parsing features", e); } - - return factory; } /** - * Creates a secure, pre-configured namespace-aware SAX parser factory for XML parsing. - * This factory is configured to prevent XXE attacks and has namespace awareness enabled. + * Creates a secure, pre-configured namespace-aware SAX parser factory for XML parsing using XmlUtils. 
*/ private static SAXParserFactory createSecureSaxParserFactoryNamespaceAware() { - SAXParserFactory factory = SAXParserFactory.newInstance(); - factory.setValidating(false); - factory.setNamespaceAware(true); - - // Configure SAX-specific security features to prevent XXE attacks try { - // SAX parser features - these are the correct features for SAXParserFactory - factory.setFeature("http://xml.org/sax/features/external-general-entities", false); - factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); - factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); - factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); + SAXParserFactory factory = XmlUtils.getHardenedSaxParserFactory(); + factory.setValidating(false); + factory.setNamespaceAware(true); + return factory; } catch (Exception e) { - // Security features are critical - fail if they cannot be set throw new IllegalStateException("Cannot configure secure XML parsing features", e); } - - return factory; } /** - * Creates a secure, pre-configured SAX parser factory for strict XML parsing. - * This factory is configured to prevent XXE attacks and has strict validation enabled. + * Creates a secure, pre-configured SAX parser factory for strict XML parsing using XmlUtils. 
*/ private static SAXParserFactory createSecureSaxParserFactoryStrict() { - SAXParserFactory factory = SAXParserFactory.newInstance(); - factory.setValidating(false); - - // Configure SAX-specific security features to prevent XXE attacks try { - // SAX parser features - these are the correct features for SAXParserFactory - factory.setFeature("http://xml.org/sax/features/external-general-entities", false); - factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); - factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); - factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); - } catch (Exception e) { - // Security features are critical - fail if they cannot be set - throw new IllegalStateException("Cannot configure secure XML parsing features", e); - } + SAXParserFactory factory = XmlUtils.getHardenedSaxParserFactory(); + factory.setValidating(false); + factory.setNamespaceAware(false); - // Try to enable strict parsing features (optional - may not be supported) - try { - factory.setFeature("http://apache.org/xml/features/validation/check-full-element-content", true); + // Try to enable strict parsing features (optional - may not be supported) + try { + factory.setFeature("http://apache.org/xml/features/validation/check-full-element-content", true); + } catch (Exception e) { + // Strict parsing features are optional - continue without them if not supported + } + + return factory; } catch (Exception e) { - // Strict parsing features are optional - continue without them if not supported + throw new IllegalStateException("Cannot configure secure XML parsing features", e); } - - return factory; } /** - * Creates a secure, pre-configured namespace-aware SAX parser factory for strict XML parsing. - * This factory is configured to prevent XXE attacks, has namespace awareness enabled, and strict validation. 
+ * Creates a secure, pre-configured namespace-aware SAX parser factory for strict XML parsing using XmlUtils. */ private static SAXParserFactory createSecureSaxParserFactoryNamespaceAwareStrict() { - SAXParserFactory factory = SAXParserFactory.newInstance(); - factory.setValidating(false); - factory.setNamespaceAware(true); - - // Configure SAX-specific security features to prevent XXE attacks try { - // SAX parser features - these are the correct features for SAXParserFactory - factory.setFeature("http://xml.org/sax/features/external-general-entities", false); - factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); - factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); - factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); - } catch (Exception e) { - // Security features are critical - fail if they cannot be set - throw new IllegalStateException("Cannot configure secure XML parsing features", e); - } + SAXParserFactory factory = XmlUtils.getHardenedSaxParserFactory(); + factory.setValidating(false); + factory.setNamespaceAware(true); - // Try to enable strict parsing features (optional - may not be supported) - try { - factory.setFeature("http://apache.org/xml/features/validation/check-full-element-content", true); + // Try to enable strict parsing features (optional - may not be supported) + try { + factory.setFeature("http://apache.org/xml/features/validation/check-full-element-content", true); + } catch (Exception e) { + // Strict parsing features are optional - continue without them if not supported + } + + return factory; } catch (Exception e) { - // Strict parsing features are optional - continue without them if not supported + throw new IllegalStateException("Cannot configure secure XML parsing features", e); } - - return factory; } /** - * Creates a secure, pre-configured DocumentBuilderFactory for DOM creation. 
+ * Creates a secure, pre-configured DocumentBuilderFactory for DOM creation using XmlUtils. * Since we only use this factory to create empty DOM documents programmatically - * (not to parse XML), XXE security features are not needed here. + * (not to parse XML), we use the hardened builder factory. * The SAX parser handles all XML parsing with appropriate security measures. */ private static DocumentBuilderFactory createSecureDocumentBuilderFactory() { - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); - factory.setNamespaceAware(true); // Enable for maximum compatibility - factory.setValidating(false); - - // No XXE security features needed - we only create empty documents, - // never parse XML with this factory - - return factory; + try { + DocumentBuilderFactory factory = XmlUtils.getHardenedBuilderFactory(); + factory.setValidating(false); // Override validation for DOM creation + return factory; + } catch (Exception e) { + throw new IllegalStateException("Cannot configure secure XML parsing features", e); + } } /** From 10e18d064ac59cb9d67ce3c24ea7ad18c1b0a9b8 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Wed, 3 Sep 2025 09:14:34 +0000 Subject: [PATCH 23/54] [CI] Auto commit changes from spotless --- .../common/XmlProcessorFactoryTests.java | 10 +++---- .../ingest/common/XmlProcessorTests.java | 26 +++++++++---------- 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java index 2ed00a64b8d17..432426a805e1b 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java @@ -39,11 +39,9 @@ private XmlProcessor createProcessor(Map config) throws Exceptio XmlProcessor.Factory 
factory = createFactory(); String processorTag = randomAlphaOfLength(10); - // Make a copy of the config to avoid modifying the original Map configCopy = new HashMap<>(config); - // Create the processor (this should consume config parameters) XmlProcessor processor = factory.create(null, processorTag, null, configCopy, null); @@ -169,13 +167,13 @@ public void testCreateWithInvalidNamespacesConfig() throws Exception { ); expectCreationFailure( - + config, - + ElasticsearchParseException.class, - + "[namespaces] property isn't a map, but of type [java.lang.String]" - + ); } diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java index 419d8a2cbd3cb..bd7aeafb4249d 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java @@ -493,23 +493,21 @@ public void testRemoveEmptyValues() throws Exception { processor.execute(ingestDocument); Map result = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); - + Map expectedData = Map.of( - "root", Map.of( - "valid", "content", - "nested", Map.of( - "valid", "nested-content" - ), - "items", Map.of( - "item", List.of("first", "third", "fifth") - ), - "mixed", Map.of( - "valid", "content", - "#text", "Text with and" - ) + "root", + Map.of( + "valid", + "content", + "nested", + Map.of("valid", "nested-content"), + "items", + Map.of("item", List.of("first", "third", "fifth")), + "mixed", + Map.of("valid", "content", "#text", "Text with and") ) ); - + assertThat(result, equalTo(expectedData)); } From 3b57ff03b433f43d23d7a199c68184523e6e4079 Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Wed, 3 Sep 2025 15:55:14 +0200 Subject: [PATCH 24/54] Add applies_to section for XML processor documentation --- docs/reference/enrich-processor/index.md | 2 +- 
docs/reference/enrich-processor/xml-processor.md | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/reference/enrich-processor/index.md b/docs/reference/enrich-processor/index.md index c43d811e26d87..3ef87e8434ae0 100644 --- a/docs/reference/enrich-processor/index.md +++ b/docs/reference/enrich-processor/index.md @@ -162,7 +162,7 @@ Refer to [Enrich your data](docs-content://manage-data/ingest/transform-enrich/d [`trim` processor](/reference/enrich-processor/trim-processor.md) : Trims whitespace from field. -[`xml` processor](/reference/enrich-processor/xml-processor.md) +[`xml` processor](/reference/enrich-processor/xml-processor.md) {applies_to}`stack: ga 9.2` : Parses XML documents and converts them to JSON objects. diff --git a/docs/reference/enrich-processor/xml-processor.md b/docs/reference/enrich-processor/xml-processor.md index cd9d9db5531cc..b5eae3085bbbe 100644 --- a/docs/reference/enrich-processor/xml-processor.md +++ b/docs/reference/enrich-processor/xml-processor.md @@ -1,5 +1,7 @@ --- navigation_title: "XML" +applies_to: + stack: ga 9.2 mapped_pages: - https://www.elastic.co/guide/en/elasticsearch/reference/current/xml-processor.html --- From fba3b2b6c3a731180a7598de89e77d46b5a1db1a Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Tue, 9 Sep 2025 12:38:58 +0200 Subject: [PATCH 25/54] Refactor XmlProcessor to use lazy-initialized XML factories and add logging for configuration issues; update tests to suppress unchecked warnings. 
--- .../ingest/common/XmlProcessor.java | 81 ++++++++++++------- .../ingest/common/XmlProcessorTests.java | 4 +- 2 files changed, 55 insertions(+), 30 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index c424f7e7d3a1c..b3fd9e3e2f98c 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -9,6 +9,8 @@ package org.elasticsearch.ingest.common; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.elasticsearch.cluster.metadata.ProjectId; import org.elasticsearch.common.Strings; import org.elasticsearch.core.XmlUtils; @@ -56,20 +58,27 @@ public final class XmlProcessor extends AbstractProcessor { public static final String TYPE = "xml"; + private static final Logger logger = LogManager.getLogger(XmlProcessor.class); private static final XPathFactory XPATH_FACTORY = XPathFactory.newInstance(); // Pre-compiled pattern to detect namespace prefixes private static final Pattern NAMESPACE_PATTERN = Pattern.compile("\\b[a-zA-Z][a-zA-Z0-9_-]*:[a-zA-Z][a-zA-Z0-9_-]*"); - // Pre-configured secure XML parser factories using XmlUtils - private static final SAXParserFactory SAX_PARSER_FACTORY = createSecureSaxParserFactory(); - private static final SAXParserFactory SAX_PARSER_FACTORY_NS = createSecureSaxParserFactoryNamespaceAware(); - private static final SAXParserFactory SAX_PARSER_FACTORY_STRICT = createSecureSaxParserFactoryStrict(); - private static final SAXParserFactory SAX_PARSER_FACTORY_NS_STRICT = createSecureSaxParserFactoryNamespaceAwareStrict(); - - // Pre-configured secure document builder factory for DOM creation - private static final DocumentBuilderFactory DOM_FACTORY = createSecureDocumentBuilderFactory(); + /** + * Lazily-initialized 
XML factories to avoid node startup failures if the JDK doesn't support required functionality. + * This inner class will only be loaded when XML processing is actually used. + */ + private static class XmlFactories { + // Pre-configured secure XML parser factories using XmlUtils + static final SAXParserFactory SAX_PARSER_FACTORY = createSecureSaxParserFactory(); + static final SAXParserFactory SAX_PARSER_FACTORY_NS = createSecureSaxParserFactoryNamespaceAware(); + static final SAXParserFactory SAX_PARSER_FACTORY_STRICT = createSecureSaxParserFactoryStrict(); + static final SAXParserFactory SAX_PARSER_FACTORY_NS_STRICT = createSecureSaxParserFactoryNamespaceAwareStrict(); + + // Pre-configured secure document builder factory for DOM creation + static final DocumentBuilderFactory DOM_FACTORY = createSecureDocumentBuilderFactory(); + } private final String field; private final String targetField; @@ -578,7 +587,7 @@ public void startDocument() throws org.xml.sax.SAXException { // Use pre-configured secure DOM factory // Since we build DOM programmatically (createElementNS/createElement), // the factory's namespace awareness doesn't affect our usage - DocumentBuilder builder = DOM_FACTORY.newDocumentBuilder(); + DocumentBuilder builder = XmlFactories.DOM_FACTORY.newDocumentBuilder(); domDocument = builder.newDocument(); } catch (Exception e) { throw new org.xml.sax.SAXException("Failed to create DOM document", e); @@ -814,7 +823,8 @@ private static SAXParserFactory createSecureSaxParserFactory() { factory.setNamespaceAware(false); return factory; } catch (Exception e) { - throw new IllegalStateException("Cannot configure secure XML parsing features", e); + logger.warn("Cannot configure secure XML parsing features - XML processor may not work correctly", e); + return null; } } @@ -828,7 +838,8 @@ private static SAXParserFactory createSecureSaxParserFactoryNamespaceAware() { factory.setNamespaceAware(true); return factory; } catch (Exception e) { - throw new 
IllegalStateException("Cannot configure secure XML parsing features", e); + logger.warn("Cannot configure secure namespace-aware XML parsing features - XML processor may not work correctly", e); + return null; } } @@ -841,16 +852,13 @@ private static SAXParserFactory createSecureSaxParserFactoryStrict() { factory.setValidating(false); factory.setNamespaceAware(false); - // Try to enable strict parsing features (optional - may not be supported) - try { - factory.setFeature("http://apache.org/xml/features/validation/check-full-element-content", true); - } catch (Exception e) { - // Strict parsing features are optional - continue without them if not supported - } + // Try to enable strict parsing features (may not be supported on all JDKs) + factory.setFeature("http://apache.org/xml/features/validation/check-full-element-content", true); return factory; } catch (Exception e) { - throw new IllegalStateException("Cannot configure secure XML parsing features", e); + logger.warn("Cannot configure secure strict XML parsing features - XML processor may not work correctly", e); + return null; } } @@ -863,16 +871,13 @@ private static SAXParserFactory createSecureSaxParserFactoryNamespaceAwareStrict factory.setValidating(false); factory.setNamespaceAware(true); - // Try to enable strict parsing features (optional - may not be supported) - try { - factory.setFeature("http://apache.org/xml/features/validation/check-full-element-content", true); - } catch (Exception e) { - // Strict parsing features are optional - continue without them if not supported - } + // Try to enable strict parsing features (may not be supported on all JDKs) + factory.setFeature("http://apache.org/xml/features/validation/check-full-element-content", true); return factory; } catch (Exception e) { - throw new IllegalStateException("Cannot configure secure XML parsing features", e); + logger.warn("Cannot configure secure strict namespace-aware XML parsing features - XML processor may not work correctly", e); 
+ return null; } } @@ -888,7 +893,8 @@ private static DocumentBuilderFactory createSecureDocumentBuilderFactory() { factory.setValidating(false); // Override validation for DOM creation return factory; } catch (Exception e) { - throw new IllegalStateException("Cannot configure secure XML parsing features", e); + logger.warn("Cannot configure secure DOM builder factory - XML processor may not work correctly", e); + return null; } } @@ -903,13 +909,32 @@ private static DocumentBuilderFactory createSecureDocumentBuilderFactory() { * * * @return the appropriate SAX parser factory for the current configuration + * @throws UnsupportedOperationException if the required XML factory is not available */ private SAXParserFactory selectSaxParserFactory() { boolean needsNamespaceAware = hasNamespaces() || removeNamespaces; + SAXParserFactory factory; + if (isStrict()) { - return needsNamespaceAware ? SAX_PARSER_FACTORY_NS_STRICT : SAX_PARSER_FACTORY_STRICT; + factory = needsNamespaceAware ? XmlFactories.SAX_PARSER_FACTORY_NS_STRICT : XmlFactories.SAX_PARSER_FACTORY_STRICT; + if (factory == null) { + throw new UnsupportedOperationException( + "Strict XML parsing with" + (needsNamespaceAware ? " namespace-aware " : " ") + + "features is not supported by the current JDK. Please try without strict_parsing=true or " + + "update your JDK to one that supports these XML features." + ); + } } else { - return needsNamespaceAware ? SAX_PARSER_FACTORY_NS : SAX_PARSER_FACTORY; + factory = needsNamespaceAware ? XmlFactories.SAX_PARSER_FACTORY_NS : XmlFactories.SAX_PARSER_FACTORY; + if (factory == null) { + throw new UnsupportedOperationException( + "XML parsing with" + (needsNamespaceAware ? " namespace-aware " : " ") + + "features is not supported by the current JDK. Please update your JDK to one that " + + "supports these XML features." 
+ ); + } } + + return factory; } } diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java index bd7aeafb4249d..55ba4651a2464 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java @@ -23,7 +23,6 @@ /** * Tests for {@link XmlProcessor}. These tests ensure feature parity and test coverage. */ -@SuppressWarnings("unchecked") public class XmlProcessorTests extends ESTestCase { private static final String XML_FIELD = "xmldata"; @@ -172,7 +171,8 @@ public void testXPathMultipleElementsExtraction() throws Exception { processor.execute(ingestDocument); - List allBars = ingestDocument.getFieldValue("all_bars", List.class); + @SuppressWarnings("unchecked") + List allBars = (List) ingestDocument.getFieldValue("all_bars", List.class); List expectedBars = List.of("first", "second", "third"); assertThat(allBars, equalTo(expectedBars)); } From e22633faf71755af3c9d9748f1a0027add3baaa6 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Tue, 9 Sep 2025 10:47:13 +0000 Subject: [PATCH 26/54] [CI] Auto commit changes from spotless --- .../ingest/common/XmlProcessor.java | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index b3fd9e3e2f98c..c22d1ee757d78 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -914,27 +914,29 @@ private static DocumentBuilderFactory createSecureDocumentBuilderFactory() { private SAXParserFactory selectSaxParserFactory() { 
boolean needsNamespaceAware = hasNamespaces() || removeNamespaces; SAXParserFactory factory; - + if (isStrict()) { factory = needsNamespaceAware ? XmlFactories.SAX_PARSER_FACTORY_NS_STRICT : XmlFactories.SAX_PARSER_FACTORY_STRICT; if (factory == null) { throw new UnsupportedOperationException( - "Strict XML parsing with" + (needsNamespaceAware ? " namespace-aware " : " ") + - "features is not supported by the current JDK. Please try without strict_parsing=true or " + - "update your JDK to one that supports these XML features." + "Strict XML parsing with" + + (needsNamespaceAware ? " namespace-aware " : " ") + + "features is not supported by the current JDK. Please try without strict_parsing=true or " + + "update your JDK to one that supports these XML features." ); } } else { factory = needsNamespaceAware ? XmlFactories.SAX_PARSER_FACTORY_NS : XmlFactories.SAX_PARSER_FACTORY; if (factory == null) { throw new UnsupportedOperationException( - "XML parsing with" + (needsNamespaceAware ? " namespace-aware " : " ") + - "features is not supported by the current JDK. Please update your JDK to one that " + - "supports these XML features." + "XML parsing with" + + (needsNamespaceAware ? " namespace-aware " : " ") + + "features is not supported by the current JDK. Please update your JDK to one that " + + "supports these XML features." 
); } } - + return factory; } } From 21a8f85b6041dfbe60eab66d52a01a0be7c1773a Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Wed, 10 Sep 2025 11:12:33 +0200 Subject: [PATCH 27/54] Remove strict_parsing option --- .../enrich-processor/xml-processor.md | 53 +------- .../ingest/common/XmlProcessor.java | 113 ++---------------- .../common/XmlProcessorFactoryTests.java | 9 -- .../ingest/common/XmlProcessorTests.java | 27 ----- 4 files changed, 13 insertions(+), 189 deletions(-) diff --git a/docs/reference/enrich-processor/xml-processor.md b/docs/reference/enrich-processor/xml-processor.md index b5eae3085bbbe..93850168830f7 100644 --- a/docs/reference/enrich-processor/xml-processor.md +++ b/docs/reference/enrich-processor/xml-processor.md @@ -8,7 +8,7 @@ mapped_pages: # XML processor [xml-processor] -Parses XML documents and converts them to JSON objects using a DOM parser. This processor efficiently handles XML data with a single-parse architecture that supports both structured output and XPath extraction for optimal performance. +Parses XML documents and converts them to JSON objects using a streaming SAX parser. This processor efficiently handles XML data with a single-pass architecture that supports both structured output and XPath extraction for optimal performance. $$$xml-options$$$ @@ -18,13 +18,12 @@ $$$xml-options$$$ | `target_field` | no | `field` | The field that the converted structured object will be written into. Any existing content in this field will be overwritten. | | `store_xml` | no | `true` | If `true`, stores the parsed XML structure in the target field. If `false`, only XPath extraction results are stored and `target_field` is ignored. | | `ignore_missing` | no | `false` | If `true` and `field` does not exist, the processor quietly exits without modifying the document. | -| `ignore_failure` | no | `false` | Ignore failures for the processor. When `true` and XML parsing fails, adds `_xmlparsefailure` tag to the document. 
See [Handling pipeline failures](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#handling-pipeline-failures). | +| `ignore_failure` | no | `false` | Ignore failures for the processor. See [Handling pipeline failures](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#handling-pipeline-failures). | | `to_lower` | no | `false` | Convert XML element names and attribute names to lowercase. | | `remove_empty_values` | no | `false` | If `true`, the processor will filter out null and empty values from the parsed XML structure, including empty elements, elements with null values, and elements with whitespace-only content. | | `remove_namespaces` | no | `false` | If `true`, removes namespace prefixes from element and attribute names. | | `force_content` | no | `false` | If `true`, forces text content and attributes to always parse to a hash value with `#text` key for content. | | `force_array` | no | `false` | If `true`, forces all parsed values to be arrays. Single elements are wrapped in arrays. | -| `strict_parsing` | no | `false` | If `true`, enables strict XML validation that fails fast on invalid content. | | `xpath` | no | - | Map of XPath expressions to target field names. Extracts values from the XML using XPath and stores them in the specified fields. | | `namespaces` | no | - | Map of namespace prefixes to URIs for use with XPath expressions. Required when XPath expressions contain namespace prefixes. | | `description` | no | - | Description of the processor. Useful for describing the purpose of the processor or its configuration. 
| @@ -321,54 +320,6 @@ Result: } ``` - - -### Strict parsing mode - -Use `strict_parsing: true` for strict XML validation: - -```console -POST _ingest/pipeline/_simulate -{ - "pipeline": { - "processors": [ - { - "xml": { - "field": "xml_content", - "strict_parsing": true, - "ignore_failure": true - } - } - ] - }, - "docs": [ - { - "_source": { - "xml_content": "Invalid XML with control character \u0000" - } - } - ] -} -``` - -Result (with parsing failure): - -```console-result -{ - "docs": [ - { - "doc": { - ... - "_source": { - "xml_content": "Invalid XML with control character \u0000", - "tags": ["_xmlparsefailure"] - } - } - } - ] -} -``` - ### Mixed content handling When XML contains mixed content (text interspersed with elements), text fragments are combined and stored under the special `#text` key: diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index c22d1ee757d78..4566bf3829302 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -50,7 +50,6 @@ *
  • XML to JSON conversion with configurable structure options *
  • XPath extraction with namespace support *
  • Configurable options: force_array, force_content, remove_namespaces, to_lower - *
  • Strict parsing mode for XML validation *
  • Empty value filtering with remove_empty_values option *
  • Logstash-compatible error handling and behavior * @@ -73,8 +72,6 @@ private static class XmlFactories { // Pre-configured secure XML parser factories using XmlUtils static final SAXParserFactory SAX_PARSER_FACTORY = createSecureSaxParserFactory(); static final SAXParserFactory SAX_PARSER_FACTORY_NS = createSecureSaxParserFactoryNamespaceAware(); - static final SAXParserFactory SAX_PARSER_FACTORY_STRICT = createSecureSaxParserFactoryStrict(); - static final SAXParserFactory SAX_PARSER_FACTORY_NS_STRICT = createSecureSaxParserFactoryNamespaceAwareStrict(); // Pre-configured secure document builder factory for DOM creation static final DocumentBuilderFactory DOM_FACTORY = createSecureDocumentBuilderFactory(); @@ -92,7 +89,6 @@ private static class XmlFactories { private final Map xpathExpressions; private final Map namespaces; private final Map compiledXPathExpressions; - private final boolean strictParsing; XmlProcessor( String tag, @@ -107,8 +103,7 @@ private static class XmlFactories { boolean forceContent, boolean forceArray, Map xpathExpressions, - Map namespaces, - boolean strictParsing + Map namespaces ) { super(tag, description); this.field = field; @@ -123,7 +118,6 @@ private static class XmlFactories { this.xpathExpressions = xpathExpressions != null ? Map.copyOf(xpathExpressions) : Map.of(); this.namespaces = namespaces != null ? 
Map.copyOf(namespaces) : Map.of(); this.compiledXPathExpressions = compileXPathExpressions(this.xpathExpressions, this.namespaces); - this.strictParsing = strictParsing; } public String getField() { @@ -154,10 +148,6 @@ public boolean isForceContent() { return forceContent; } - public boolean isStrict() { - return strictParsing; - } - public boolean isForceArray() { return forceArray; } @@ -170,10 +160,6 @@ public Map getNamespaces() { return namespaces; } - public boolean getStrictParsing() { - return strictParsing; - } - @Override public IngestDocument execute(IngestDocument document) { Object fieldValue = document.getFieldValue(field, Object.class, ignoreMissing); @@ -465,9 +451,6 @@ public XmlProcessor create( } } - // Parse strict_parsing parameter - boolean strictParsing = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "strict_parsing", false); - return new XmlProcessor( processorTag, description, @@ -481,8 +464,7 @@ public XmlProcessor create( forceContent, forceArray, xpathExpressions, - namespaces, - strictParsing + namespaces ); } } @@ -508,26 +490,6 @@ private void parseXmlAndXPath(IngestDocument document, String xmlString) throws SAXParser parser = factory.newSAXParser(); - // Configure error handler for strict mode - if (isStrict()) { - parser.getXMLReader().setErrorHandler(new org.xml.sax.ErrorHandler() { - @Override - public void warning(org.xml.sax.SAXParseException exception) throws org.xml.sax.SAXException { - throw exception; - } - - @Override - public void error(org.xml.sax.SAXParseException exception) throws org.xml.sax.SAXException { - throw exception; - } - - @Override - public void fatalError(org.xml.sax.SAXParseException exception) throws org.xml.sax.SAXException { - throw exception; - } - }); - } - // Use enhanced handler that can build DOM during streaming when needed XmlStreamingWithDomHandler handler = new XmlStreamingWithDomHandler(needsDom); @@ -843,44 +805,6 @@ private static SAXParserFactory 
createSecureSaxParserFactoryNamespaceAware() { } } - /** - * Creates a secure, pre-configured SAX parser factory for strict XML parsing using XmlUtils. - */ - private static SAXParserFactory createSecureSaxParserFactoryStrict() { - try { - SAXParserFactory factory = XmlUtils.getHardenedSaxParserFactory(); - factory.setValidating(false); - factory.setNamespaceAware(false); - - // Try to enable strict parsing features (may not be supported on all JDKs) - factory.setFeature("http://apache.org/xml/features/validation/check-full-element-content", true); - - return factory; - } catch (Exception e) { - logger.warn("Cannot configure secure strict XML parsing features - XML processor may not work correctly", e); - return null; - } - } - - /** - * Creates a secure, pre-configured namespace-aware SAX parser factory for strict XML parsing using XmlUtils. - */ - private static SAXParserFactory createSecureSaxParserFactoryNamespaceAwareStrict() { - try { - SAXParserFactory factory = XmlUtils.getHardenedSaxParserFactory(); - factory.setValidating(false); - factory.setNamespaceAware(true); - - // Try to enable strict parsing features (may not be supported on all JDKs) - factory.setFeature("http://apache.org/xml/features/validation/check-full-element-content", true); - - return factory; - } catch (Exception e) { - logger.warn("Cannot configure secure strict namespace-aware XML parsing features - XML processor may not work correctly", e); - return null; - } - } - /** * Creates a secure, pre-configured DocumentBuilderFactory for DOM creation using XmlUtils. * Since we only use this factory to create empty DOM documents programmatically @@ -904,8 +828,6 @@ private static DocumentBuilderFactory createSecureDocumentBuilderFactory() { * Factory selection matrix:
      *
    • Regular parsing, no namespaces: SAX_PARSER_FACTORY *
    • Regular parsing, with namespaces: SAX_PARSER_FACTORY_NS - *
    • Strict parsing, no namespaces: SAX_PARSER_FACTORY_STRICT - *
    • Strict parsing, with namespaces: SAX_PARSER_FACTORY_NS_STRICT *
    * * @return the appropriate SAX parser factory for the current configuration @@ -913,28 +835,15 @@ private static DocumentBuilderFactory createSecureDocumentBuilderFactory() { */ private SAXParserFactory selectSaxParserFactory() { boolean needsNamespaceAware = hasNamespaces() || removeNamespaces; - SAXParserFactory factory; - - if (isStrict()) { - factory = needsNamespaceAware ? XmlFactories.SAX_PARSER_FACTORY_NS_STRICT : XmlFactories.SAX_PARSER_FACTORY_STRICT; - if (factory == null) { - throw new UnsupportedOperationException( - "Strict XML parsing with" - + (needsNamespaceAware ? " namespace-aware " : " ") - + "features is not supported by the current JDK. Please try without strict_parsing=true or " - + "update your JDK to one that supports these XML features." - ); - } - } else { - factory = needsNamespaceAware ? XmlFactories.SAX_PARSER_FACTORY_NS : XmlFactories.SAX_PARSER_FACTORY; - if (factory == null) { - throw new UnsupportedOperationException( - "XML parsing with" - + (needsNamespaceAware ? " namespace-aware " : " ") - + "features is not supported by the current JDK. Please update your JDK to one that " - + "supports these XML features." - ); - } + SAXParserFactory factory = needsNamespaceAware ? XmlFactories.SAX_PARSER_FACTORY_NS : XmlFactories.SAX_PARSER_FACTORY; + + if (factory == null) { + throw new UnsupportedOperationException( + "XML parsing" + + (needsNamespaceAware ? " with namespace-aware features " : " ") + + "is not supported by the current JDK. Please update your JDK to one that " + + "supports these XML features." 
+ ); } return factory; diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java index 432426a805e1b..7106bbf306809 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java @@ -231,15 +231,6 @@ public void testCreateWithForceArray() throws Exception { assertThat(processor.isForceArray(), equalTo(true)); } - public void testCreateWithStrictParsing() throws Exception { - Map config = Map.of("field", DEFAULT_FIELD, "strict_parsing", true); - XmlProcessor processor = createProcessor(config); - - assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); - assertThat(processor.getStrictParsing(), equalTo(true)); - assertThat(processor.isStrict(), equalTo(true)); - } - public void testCreateWithMultipleOptions() throws Exception { Map config = Map.of( "field", diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java index 55ba4651a2464..f5e7e5d4e30ee 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java @@ -511,33 +511,6 @@ public void testRemoveEmptyValues() throws Exception { assertThat(result, equalTo(expectedData)); } - /** - * Test parsing with strict mode option. 
- */ - public void testStrictParsing() throws Exception { - String xml = "valid"; - - Map config = new HashMap<>(); - config.put("strict_parsing", true); - XmlProcessor processor = createTestProcessor(config); - IngestDocument ingestDocument = createTestIngestDocument(xml); - - processor.execute(ingestDocument); - - Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); - - Map expectedData = Map.of("foo", Map.of("bar", "valid")); - assertThat(data, equalTo(expectedData)); - - // Test with invalid XML in strict mode - String invalidXml = ""; - IngestDocument invalidDocument = createTestIngestDocument(invalidXml); - - IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(invalidDocument)); - - assertThat(exception.getMessage(), containsString("contains invalid XML")); - } - /** * Test parsing XML with remove_namespaces option. */ From f83b1eb38a2584b5ece1670b0e648995172a992a Mon Sep 17 00:00:00 2001 From: Sam Xiao Date: Thu, 11 Sep 2025 18:49:32 -0400 Subject: [PATCH 28/54] Joe's thread local and soft reference --- .../ingest/common/XmlProcessor.java | 54 ++++++++++++------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 4566bf3829302..5c4e95e8017e4 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -23,6 +23,7 @@ import org.w3c.dom.NodeList; import java.io.ByteArrayInputStream; +import java.lang.ref.SoftReference; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.HashMap; @@ -30,6 +31,7 @@ import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; import 
java.util.regex.Pattern; import javax.xml.namespace.NamespaceContext; @@ -88,7 +90,10 @@ private static class XmlFactories { private final boolean forceArray; private final Map xpathExpressions; private final Map namespaces; + private final Map compiledXPathExpressions; + private final boolean needsDom; + private final SAXParserFactory factory; XmlProcessor( String tag, @@ -115,9 +120,13 @@ private static class XmlFactories { this.removeNamespaces = removeNamespaces; this.forceContent = forceContent; this.forceArray = forceArray; + this.xpathExpressions = xpathExpressions != null ? Map.copyOf(xpathExpressions) : Map.of(); this.namespaces = namespaces != null ? Map.copyOf(namespaces) : Map.of(); + this.compiledXPathExpressions = compileXPathExpressions(this.xpathExpressions, this.namespaces); + this.needsDom = this.xpathExpressions.isEmpty() == false; + this.factory = selectSaxParserFactory(this.namespaces.isEmpty() == false || removeNamespaces); } public String getField() { @@ -152,10 +161,6 @@ public boolean isForceArray() { return forceArray; } - public boolean hasNamespaces() { - return namespaces.isEmpty() == false; - } - public Map getNamespaces() { return namespaces; } @@ -469,6 +474,12 @@ public XmlProcessor create( } } + private static final Map>> PARSERS = new ConcurrentHashMap<>(); + static { + PARSERS.put(XmlFactories.SAX_PARSER_FACTORY, new ThreadLocal<>()); + PARSERS.put(XmlFactories.SAX_PARSER_FACTORY_NS, new ThreadLocal<>()); + } + /** * Main XML parsing method that converts XML to JSON and optionally extracts XPath values. * Uses streaming SAX parser with optional DOM building for XPath processing. 
@@ -482,18 +493,28 @@ private void parseXmlAndXPath(IngestDocument document, String xmlString) throws return; } - // Determine if we need DOM for XPath processing - boolean needsDom = xpathExpressions.isEmpty() == false; - - // Use the appropriate pre-configured SAX parser factory - SAXParserFactory factory = selectSaxParserFactory(); - - SAXParser parser = factory.newSAXParser(); + final SAXParser parser; + { + SAXParser innerParser; + final ThreadLocal> threadLocal = PARSERS.get(factory); + final SoftReference parserReference = threadLocal.get(); + innerParser = parserReference != null ? parserReference.get() : null; + if (innerParser == null) { + innerParser = factory.newSAXParser(); + threadLocal.set(new SoftReference<>(innerParser)); + } + parser = innerParser; + } - // Use enhanced handler that can build DOM during streaming when needed - XmlStreamingWithDomHandler handler = new XmlStreamingWithDomHandler(needsDom); + final XmlStreamingWithDomHandler handler; + try { + // Use enhanced handler that can build DOM during streaming when needed + handler = new XmlStreamingWithDomHandler(needsDom); - parser.parse(new ByteArrayInputStream(xmlString.getBytes(StandardCharsets.UTF_8)), handler); + parser.parse(new ByteArrayInputStream(xmlString.getBytes(StandardCharsets.UTF_8)), handler); + } finally { + parser.reset(); + } // Store structured result if needed if (storeXml) { @@ -833,10 +854,8 @@ private static DocumentBuilderFactory createSecureDocumentBuilderFactory() { * @return the appropriate SAX parser factory for the current configuration * @throws UnsupportedOperationException if the required XML factory is not available */ - private SAXParserFactory selectSaxParserFactory() { - boolean needsNamespaceAware = hasNamespaces() || removeNamespaces; + private static SAXParserFactory selectSaxParserFactory(final boolean needsNamespaceAware) { SAXParserFactory factory = needsNamespaceAware ? 
XmlFactories.SAX_PARSER_FACTORY_NS : XmlFactories.SAX_PARSER_FACTORY; - if (factory == null) { throw new UnsupportedOperationException( "XML parsing" @@ -845,7 +864,6 @@ private SAXParserFactory selectSaxParserFactory() { + "supports these XML features." ); } - return factory; } } From 998cae6b4ba7dfc0246f947118a17dbabedbff1d Mon Sep 17 00:00:00 2001 From: Sam Xiao Date: Thu, 11 Sep 2025 21:17:20 -0400 Subject: [PATCH 29/54] address comments and some minor refactoring --- .../ingest/common/XmlProcessor.java | 94 ++++++++++++------- 1 file changed, 58 insertions(+), 36 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 5c4e95e8017e4..b1a9bbd490299 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -21,22 +21,27 @@ import org.w3c.dom.Document; import org.w3c.dom.Node; import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; import java.io.ByteArrayInputStream; import java.lang.ref.SoftReference; import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.regex.Pattern; import javax.xml.namespace.NamespaceContext; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import javax.xml.xpath.XPath; @@ -295,21 +300,30 @@ private void processXPathExpressionsFromDom(IngestDocument ingestDocument, Docum Object result = 
compiledExpression.evaluate(xmlDocument, XPathConstants.NODESET); if (result instanceof NodeList nodeList) { - List values = new ArrayList<>(); - - for (int i = 0; i < nodeList.getLength(); i++) { - Node node = nodeList.item(i); + // separate the case for 1 vs multiple nodeList elements to avoid unnecessary array allocation, this optimization is only + // done because this is a per-document hot code path + if (nodeList.getLength() == 1) { + Node node = nodeList.item(0); String value = getNodeValue(node); if (Strings.hasText(value)) { - values.add(value); + ingestDocument.setFieldValue(targetFieldName, value); + } + } else if (nodeList.getLength() > 1) { + List values = new ArrayList<>(); + for (int i = 0; i < nodeList.getLength(); i++) { + Node node = nodeList.item(i); + String value = getNodeValue(node); + if (Strings.hasText(value)) { + values.add(value); + } } - } - if (values.isEmpty() == false) { - if (values.size() == 1) { - ingestDocument.setFieldValue(targetFieldName, values.get(0)); - } else { - ingestDocument.setFieldValue(targetFieldName, values); + if (values.isEmpty() == false) { + if (values.size() == 1) { + ingestDocument.setFieldValue(targetFieldName, values.get(0)); + } else { + ingestDocument.setFieldValue(targetFieldName, values); + } } } } @@ -340,6 +354,13 @@ private static Map compileXPathExpressions( // Set namespace context if namespaces are defined boolean hasNamespaces = namespaces.isEmpty() == false; if (hasNamespaces) { + // build a read-only reverse map for quick look up + Map> uriToPrefixes = new HashMap<>(); + for (Map.Entry entry : namespaces.entrySet()) { + uriToPrefixes.computeIfAbsent(entry.getValue(), k -> new HashSet<>()).add(entry.getKey()); + } + uriToPrefixes.replaceAll((k, v) -> Collections.unmodifiableSet(v)); + xpath.setNamespaceContext(new NamespaceContext() { @Override public String getNamespaceURI(String prefix) { @@ -351,23 +372,22 @@ public String getNamespaceURI(String prefix) { @Override public String 
getPrefix(String namespaceURI) { - for (Map.Entry entry : namespaces.entrySet()) { - if (entry.getValue().equals(namespaceURI)) { - return entry.getKey(); - } + if (namespaceURI == null) { + throw new IllegalArgumentException("namespaceURI cannot be null"); + } + if (uriToPrefixes.containsKey(namespaceURI)) { + return uriToPrefixes.get(namespaceURI).iterator().next(); + } else { + return null; } - return null; } @Override public Iterator getPrefixes(String namespaceURI) { - List prefixes = new ArrayList<>(); - for (Map.Entry entry : namespaces.entrySet()) { - if (entry.getValue().equals(namespaceURI)) { - prefixes.add(entry.getKey()); - } + if (namespaceURI == null) { + throw new IllegalArgumentException("namespaceURI cannot be null"); } - return prefixes.iterator(); + return uriToPrefixes.getOrDefault(namespaceURI, Set.of()).iterator(); } }); } @@ -493,24 +513,11 @@ private void parseXmlAndXPath(IngestDocument document, String xmlString) throws return; } - final SAXParser parser; - { - SAXParser innerParser; - final ThreadLocal> threadLocal = PARSERS.get(factory); - final SoftReference parserReference = threadLocal.get(); - innerParser = parserReference != null ? parserReference.get() : null; - if (innerParser == null) { - innerParser = factory.newSAXParser(); - threadLocal.set(new SoftReference<>(innerParser)); - } - parser = innerParser; - } - + final SAXParser parser = getParser(factory); final XmlStreamingWithDomHandler handler; try { // Use enhanced handler that can build DOM during streaming when needed handler = new XmlStreamingWithDomHandler(needsDom); - parser.parse(new ByteArrayInputStream(xmlString.getBytes(StandardCharsets.UTF_8)), handler); } finally { parser.reset(); @@ -532,6 +539,21 @@ private void parseXmlAndXPath(IngestDocument document, String xmlString) throws } } + /** + * Gets the parser reference or creates a new one if the soft reference has been cleared. 
+ */ + private SAXParser getParser(SAXParserFactory factory) throws SAXException, ParserConfigurationException { + final ThreadLocal> threadLocal = PARSERS.getOrDefault(factory, new ThreadLocal<>()); + final SoftReference parserReference = threadLocal.get(); + + SAXParser parser = parserReference != null ? parserReference.get() : null; + if (parser == null) { + parser = factory.newSAXParser(); + threadLocal.set(new SoftReference<>(parser)); + } + return parser; + } + /** * SAX ContentHandler that builds structured JSON output and optionally constructs a DOM tree during parsing. * Handles XML-to-JSON conversion with support for all processor configuration options. From 839d316fa01e8df8861f98b66accf31fda968e4e Mon Sep 17 00:00:00 2001 From: Joe Gallo Date: Mon, 15 Sep 2025 15:59:35 -0400 Subject: [PATCH 30/54] Silence some warnings from IntelliJ --- .../org/elasticsearch/ingest/common/XmlProcessor.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index b1a9bbd490299..83a40626b5393 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -52,7 +52,7 @@ /** * Processor that parses XML documents and converts them to JSON objects using a single-pass streaming approach. - * + *

    * Features:

      *
    • XML to JSON conversion with configurable structure options *
    • XPath extraction with namespace support @@ -206,7 +206,7 @@ public String getType() { /** * Determines if a value should be considered empty for filtering purposes. * Used by the remove_empty_values feature to filter out empty content. - * + *

      * Considers empty:

        *
      • null values *
      • empty or whitespace-only strings @@ -258,7 +258,7 @@ private String getNodeValue(Node node) { /** * Applies force_array logic to ensure all fields are arrays when enabled. - * + *

        * Behavior:

          *
        • If force_array is false: returns content unchanged *
        • If force_array is true and content is already a List: returns content unchanged @@ -281,7 +281,7 @@ private Object applyForceArray(String elementName, Object content) { /** * Evaluates precompiled XPath expressions against a DOM document and adds results to the ingest document. - * + *

          * Features:

            *
          • Uses precompiled XPath expressions for optimal performance *
          • Extracts text values from matched nodes (elements, attributes, text nodes) @@ -867,7 +867,7 @@ private static DocumentBuilderFactory createSecureDocumentBuilderFactory() { /** * Selects the appropriate pre-configured SAX parser factory based on processor configuration. - * + *

            * Factory selection matrix:

              *
            • Regular parsing, no namespaces: SAX_PARSER_FACTORY *
            • Regular parsing, with namespaces: SAX_PARSER_FACTORY_NS From 4cef5c8b37c31528d218345c98b6aa8cff6b97fb Mon Sep 17 00:00:00 2001 From: Joe Gallo Date: Mon, 15 Sep 2025 16:00:29 -0400 Subject: [PATCH 31/54] Silence some warnings from IntelliJ --- .../java/org/elasticsearch/ingest/common/XmlProcessor.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 83a40626b5393..4d63585cc0531 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -320,7 +320,7 @@ private void processXPathExpressionsFromDom(IngestDocument ingestDocument, Docum if (values.isEmpty() == false) { if (values.size() == 1) { - ingestDocument.setFieldValue(targetFieldName, values.get(0)); + ingestDocument.setFieldValue(targetFieldName, values.getFirst()); } else { ingestDocument.setFieldValue(targetFieldName, values); } @@ -698,7 +698,7 @@ public void endElement(String uri, String localName, String qName) throws org.xm String textContentString = textContent.toString(); String trimmedText = textContentString.trim(); boolean hasText = textContentString.isBlank() == false; - boolean hasChildren = element.size() > 0; + boolean hasChildren = element.isEmpty() == false; Object elementValue; if (hasText == false && hasChildren == false) { From 4331abd010a129f0018b06942f58cd5da8044759 Mon Sep 17 00:00:00 2001 From: Joe Gallo Date: Mon, 15 Sep 2025 16:01:19 -0400 Subject: [PATCH 32/54] Silence some warnings from IntelliJ --- .../org/elasticsearch/ingest/common/XmlProcessor.java | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java 
b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 4d63585cc0531..b596182870adc 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -601,8 +601,7 @@ public void startDocument() throws org.xml.sax.SAXException { } @Override - public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes attributes) - throws org.xml.sax.SAXException { + public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes attributes) { String elementName = getElementName(uri, localName, qName); // Build structured representation (always) @@ -657,7 +656,7 @@ public void startElement(String uri, String localName, String qName, org.xml.sax } @Override - public void characters(char[] ch, int start, int length) throws org.xml.sax.SAXException { + public void characters(char[] ch, int start, int length) { // Add to structured output text accumulator if (elementStack.isEmpty() == false) { elementStack.peek().textContent().append(ch, start, length); @@ -674,7 +673,7 @@ public void characters(char[] ch, int start, int length) throws org.xml.sax.SAXE } @Override - public void endElement(String uri, String localName, String qName) throws org.xml.sax.SAXException { + public void endElement(String uri, String localName, String qName) { // Complete structured output element processing if (elementStack.isEmpty()) { return; @@ -773,7 +772,7 @@ public void endElement(String uri, String localName, String qName) throws org.xm } @Override - public void endDocument() throws org.xml.sax.SAXException { + public void endDocument() { // Document parsing complete } From 9da01c4dcf8f59b57e8d85bfc19c367a961ad915 Mon Sep 17 00:00:00 2001 From: Joe Gallo Date: Mon, 15 Sep 2025 16:02:20 -0400 Subject: [PATCH 33/54] Silence some warnings from IntelliJ --- 
.../org/elasticsearch/ingest/common/XmlProcessor.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index b596182870adc..047800147586a 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -266,11 +266,11 @@ private String getNodeValue(Node node) { *
            • Handles null content appropriately (wraps null in array if force_array is true) *
            * - * @param elementName the name of the element (for context, not used in current implementation) + * @param ignoredElementName the name of the element (for context, not used in current implementation) * @param content the content to potentially wrap in an array * @return the content, optionally wrapped in an array based on force_array setting */ - private Object applyForceArray(String elementName, Object content) { + private Object applyForceArray(String ignoredElementName, Object content) { if (forceArray && (content instanceof List) == false) { List arrayContent = new ArrayList<>(); arrayContent.add(content); // Add content even if it's null (for empty elements) @@ -784,7 +784,7 @@ public Document getDomDocument() { return domDocument; } - private String getElementName(String uri, String localName, String qName) { + private String getElementName(String ignoredUri, String localName, String qName) { String elementName; if (removeNamespaces) { elementName = localName != null && localName.isEmpty() == false ? localName : qName; @@ -800,7 +800,7 @@ private String getElementName(String uri, String localName, String qName) { return elementName; } - private String getAttributeName(String uri, String localName, String qName) { + private String getAttributeName(String ignoredUri, String localName, String qName) { String attrName; if (removeNamespaces) { attrName = localName != null && localName.isEmpty() == false ? 
localName : qName; From 62764f056305c5575a1b72454a612b144943df64 Mon Sep 17 00:00:00 2001 From: Joe Gallo Date: Mon, 15 Sep 2025 16:04:48 -0400 Subject: [PATCH 34/54] Prefer imports over qualified names --- .../ingest/common/XmlProcessor.java | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 047800147586a..3fe85395da563 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -19,15 +19,21 @@ import org.elasticsearch.ingest.IngestDocument; import org.elasticsearch.ingest.Processor; import org.w3c.dom.Document; +import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; +import org.w3c.dom.Text; +import org.xml.sax.Attributes; import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; import java.io.ByteArrayInputStream; import java.lang.ref.SoftReference; import java.nio.charset.StandardCharsets; +import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Collections; +import java.util.Deque; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -558,7 +564,7 @@ private SAXParser getParser(SAXParserFactory factory) throws SAXException, Parse * SAX ContentHandler that builds structured JSON output and optionally constructs a DOM tree during parsing. * Handles XML-to-JSON conversion with support for all processor configuration options. */ - private class XmlStreamingWithDomHandler extends org.xml.sax.helpers.DefaultHandler { + private class XmlStreamingWithDomHandler extends DefaultHandler { /** * Record to encapsulate the parsing state for each XML element level. 
@@ -572,20 +578,20 @@ private record ElementParsingState( ) {} // Streaming parser state (for structured output) - private final java.util.Deque elementStack = new java.util.ArrayDeque<>(); + private final Deque elementStack = new ArrayDeque<>(); private Object rootResult = null; // DOM building state (for XPath processing when needed) private final boolean buildDom; private Document domDocument = null; - private final java.util.Deque domElementStack = new java.util.ArrayDeque<>(); + private final Deque domElementStack = new ArrayDeque<>(); XmlStreamingWithDomHandler(boolean buildDom) { this.buildDom = buildDom; } @Override - public void startDocument() throws org.xml.sax.SAXException { + public void startDocument() throws SAXException { // Initialize DOM document if needed if (buildDom) { try { @@ -595,13 +601,13 @@ public void startDocument() throws org.xml.sax.SAXException { DocumentBuilder builder = XmlFactories.DOM_FACTORY.newDocumentBuilder(); domDocument = builder.newDocument(); } catch (Exception e) { - throw new org.xml.sax.SAXException("Failed to create DOM document", e); + throw new SAXException("Failed to create DOM document", e); } } } @Override - public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes attributes) { + public void startElement(String uri, String localName, String qName, Attributes attributes) { String elementName = getElementName(uri, localName, qName); // Build structured representation (always) @@ -623,7 +629,7 @@ public void startElement(String uri, String localName, String qName, org.xml.sax // Build DOM element simultaneously if needed if (buildDom && domDocument != null) { - org.w3c.dom.Element domElement; + Element domElement; if (uri != null && uri.isEmpty() == false && removeNamespaces == false) { domElement = domDocument.createElementNS(uri, qName); } else { @@ -666,7 +672,7 @@ public void characters(char[] ch, int start, int length) { if (buildDom && domElementStack.isEmpty() == false) { 
String text = new String(ch, start, length); if (text.isBlank() == false || removeEmptyValues == false) { - org.w3c.dom.Text textNode = domDocument.createTextNode(text); + Text textNode = domDocument.createTextNode(text); domElementStack.peek().appendChild(textNode); } } From feabad49a2f5d223ba74db6f5c7563ec8121f542 Mon Sep 17 00:00:00 2001 From: Joe Gallo Date: Mon, 15 Sep 2025 16:06:19 -0400 Subject: [PATCH 35/54] This can be static --- .../main/java/org/elasticsearch/ingest/common/XmlProcessor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 3fe85395da563..5e9095fb60f74 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -548,7 +548,7 @@ private void parseXmlAndXPath(IngestDocument document, String xmlString) throws /** * Gets the parser reference or creates a new one if the soft reference has been cleared. 
*/ - private SAXParser getParser(SAXParserFactory factory) throws SAXException, ParserConfigurationException { + private static SAXParser getParser(SAXParserFactory factory) throws SAXException, ParserConfigurationException { final ThreadLocal> threadLocal = PARSERS.getOrDefault(factory, new ThreadLocal<>()); final SoftReference parserReference = threadLocal.get(); From 33a529b40e968478fdb275fd33e8af1b309ed806 Mon Sep 17 00:00:00 2001 From: Joe Gallo Date: Mon, 15 Sep 2025 16:10:22 -0400 Subject: [PATCH 36/54] Read from the String directly --- .../java/org/elasticsearch/ingest/common/XmlProcessor.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 5e9095fb60f74..e2ba8f4c7754c 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -24,12 +24,12 @@ import org.w3c.dom.NodeList; import org.w3c.dom.Text; import org.xml.sax.Attributes; +import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -import java.io.ByteArrayInputStream; +import java.io.StringReader; import java.lang.ref.SoftReference; -import java.nio.charset.StandardCharsets; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Collections; @@ -524,7 +524,7 @@ private void parseXmlAndXPath(IngestDocument document, String xmlString) throws try { // Use enhanced handler that can build DOM during streaming when needed handler = new XmlStreamingWithDomHandler(needsDom); - parser.parse(new ByteArrayInputStream(xmlString.getBytes(StandardCharsets.UTF_8)), handler); + parser.parse(new InputSource(new StringReader(xmlString)), handler); } finally { parser.reset(); } From 
d2e1f00792069410d2ff559ee5548e738fa17946 Mon Sep 17 00:00:00 2001 From: Joe Gallo Date: Mon, 15 Sep 2025 16:13:17 -0400 Subject: [PATCH 37/54] Don't use Hungarian notation --- .../ingest/common/XmlProcessor.java | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index e2ba8f4c7754c..4a9380d5a4f70 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -191,11 +191,11 @@ public IngestDocument execute(IngestDocument document) { throw new IllegalArgumentException("field [" + field + "] is not a string, cannot parse XML"); } - String xmlString = (String) fieldValue; + String xml = (String) fieldValue; try { // Always use streaming parser for optimal performance and memory usage if (storeXml || xpathExpressions.isEmpty() == false) { - parseXmlAndXPath(document, xmlString.trim()); + parseXmlAndXPath(document, xml.trim()); } } catch (Exception e) { throw new IllegalArgumentException("field [" + field + "] contains invalid XML", e); @@ -294,11 +294,11 @@ private Object applyForceArray(String ignoredElementName, Object content) { *
          • Single matches stored as strings, multiple matches as string arrays * * - * @param ingestDocument the ingest document to add XPath results to + * @param document the ingest document to add XPath results to * @param xmlDocument the DOM document to evaluate XPath expressions against * @throws XPathExpressionException if XPath processing fails */ - private void processXPathExpressionsFromDom(IngestDocument ingestDocument, Document xmlDocument) throws XPathExpressionException { + private void processXPathExpressionsFromDom(IngestDocument document, Document xmlDocument) throws XPathExpressionException { // Use precompiled XPath expressions for optimal performance for (Map.Entry entry : compiledXPathExpressions.entrySet()) { String targetFieldName = entry.getKey(); @@ -312,7 +312,7 @@ private void processXPathExpressionsFromDom(IngestDocument ingestDocument, Docum Node node = nodeList.item(0); String value = getNodeValue(node); if (Strings.hasText(value)) { - ingestDocument.setFieldValue(targetFieldName, value); + document.setFieldValue(targetFieldName, value); } } else if (nodeList.getLength() > 1) { List values = new ArrayList<>(); @@ -326,9 +326,9 @@ private void processXPathExpressionsFromDom(IngestDocument ingestDocument, Docum if (values.isEmpty() == false) { if (values.size() == 1) { - ingestDocument.setFieldValue(targetFieldName, values.getFirst()); + document.setFieldValue(targetFieldName, values.getFirst()); } else { - ingestDocument.setFieldValue(targetFieldName, values); + document.setFieldValue(targetFieldName, values); } } } @@ -511,11 +511,11 @@ public XmlProcessor create( * Uses streaming SAX parser with optional DOM building for XPath processing. 
* * @param document the ingest document to modify with parsed results - * @param xmlString the XML string to parse (should be trimmed) + * @param xml the XML string to parse (should be trimmed) * @throws Exception if XML parsing fails */ - private void parseXmlAndXPath(IngestDocument document, String xmlString) throws Exception { - if (Strings.hasText(xmlString) == false) { + private void parseXmlAndXPath(IngestDocument document, String xml) throws Exception { + if (Strings.hasText(xml) == false) { return; } @@ -524,7 +524,7 @@ private void parseXmlAndXPath(IngestDocument document, String xmlString) throws try { // Use enhanced handler that can build DOM during streaming when needed handler = new XmlStreamingWithDomHandler(needsDom); - parser.parse(new InputSource(new StringReader(xmlString)), handler); + parser.parse(new InputSource(new StringReader(xml)), handler); } finally { parser.reset(); } From 02b40a88953441063893981f3112f927811705aa Mon Sep 17 00:00:00 2001 From: Joe Gallo Date: Mon, 15 Sep 2025 16:19:14 -0400 Subject: [PATCH 38/54] Clarify logic and add a TODO --- .../org/elasticsearch/ingest/common/XmlProcessor.java | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 4a9380d5a4f70..b7c85c8e3f223 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -191,12 +191,14 @@ public IngestDocument execute(IngestDocument document) { throw new IllegalArgumentException("field [" + field + "] is not a string, cannot parse XML"); } + // TODO this just seems like misconfiguration to me. Why should we all this at all? 
+ if (storeXml == false && xpathExpressions.isEmpty()) { + return document; + } + String xml = (String) fieldValue; try { - // Always use streaming parser for optimal performance and memory usage - if (storeXml || xpathExpressions.isEmpty() == false) { - parseXmlAndXPath(document, xml.trim()); - } + parseXmlAndXPath(document, xml.trim()); } catch (Exception e) { throw new IllegalArgumentException("field [" + field + "] contains invalid XML", e); } From 97b4abfcc13577efa095bca8b07d5b28886714ca Mon Sep 17 00:00:00 2001 From: Joe Gallo Date: Mon, 15 Sep 2025 16:21:45 -0400 Subject: [PATCH 39/54] Handle input more conventionally --- .../ingest/common/XmlProcessor.java | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index b7c85c8e3f223..1fa6eac372dac 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -178,27 +178,20 @@ public Map getNamespaces() { @Override public IngestDocument execute(IngestDocument document) { - Object fieldValue = document.getFieldValue(field, Object.class, ignoreMissing); - - if (fieldValue == null) { - if (ignoreMissing) { - return document; - } + String input = document.getFieldValue(field, String.class, ignoreMissing); + if (input == null && ignoreMissing) { + return document; + } else if (input == null) { throw new IllegalArgumentException("field [" + field + "] is null, cannot parse XML"); } - if (fieldValue instanceof String == false) { - throw new IllegalArgumentException("field [" + field + "] is not a string, cannot parse XML"); - } - // TODO this just seems like misconfiguration to me. Why should we all this at all? 
if (storeXml == false && xpathExpressions.isEmpty()) { return document; } - String xml = (String) fieldValue; try { - parseXmlAndXPath(document, xml.trim()); + parseXmlAndXPath(document, input.trim()); } catch (Exception e) { throw new IllegalArgumentException("field [" + field + "] contains invalid XML", e); } From 35eae91af4790e27a473d6a83644ed3b6500ae39 Mon Sep 17 00:00:00 2001 From: Joe Gallo Date: Mon, 15 Sep 2025 16:51:35 -0400 Subject: [PATCH 40/54] Rewrite all these conditionals There was a warning here about hasChildren always being true at one point, and it just seemed like the overall logic was easier to follow as nested ifs (to me). --- .../ingest/common/XmlProcessor.java | 54 ++++++++++--------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 1fa6eac372dac..abf883dbbf958 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -701,38 +701,42 @@ public void endElement(String uri, String localName, String qName) { boolean hasChildren = element.isEmpty() == false; Object elementValue; - if (hasText == false && hasChildren == false) { - // Empty element - if (removeEmptyValues == false) { - elementValue = applyForceArray(elementName, null); + if (hasChildren) { + if (hasText) { + // Both text and children/attributes + if (removeEmptyValues == false || isEmptyValue(trimmedText) == false) { + element.put("#text", trimmedText); + } + elementValue = (forceArray && forceContent) ? applyForceArray(elementName, element) : element; } else { - elementValue = null; + // Only child elements/attributes + elementValue = (forceArray && forceContent) ? 
applyForceArray(elementName, element) : element; } - } else if (hasText && hasChildren == false) { - // Only text content - if (forceContent) { - Map contentMap = new HashMap<>(); - if (removeEmptyValues == false || isEmptyValue(trimmedText) == false) { - contentMap.put("#text", trimmedText); + } else { + if (hasText) { + // Only text content + if (forceContent) { + Map contentMap = new HashMap<>(); + if (removeEmptyValues == false || isEmptyValue(trimmedText) == false) { + contentMap.put("#text", trimmedText); + } + elementValue = contentMap; + } else { + if (removeEmptyValues && isEmptyValue(trimmedText)) { + elementValue = null; + } else { + elementValue = trimmedText; + } } - elementValue = contentMap; + elementValue = applyForceArray(elementName, elementValue); } else { - if (removeEmptyValues && isEmptyValue(trimmedText)) { - elementValue = null; + // Empty element + if (removeEmptyValues == false) { + elementValue = applyForceArray(elementName, null); } else { - elementValue = trimmedText; + elementValue = null; } } - elementValue = applyForceArray(elementName, elementValue); - } else if (hasText == false && hasChildren) { - // Only child elements/attributes - elementValue = (forceArray && forceContent) ? applyForceArray(elementName, element) : element; - } else { - // Both text and children/attributes - if (removeEmptyValues == false || isEmptyValue(trimmedText) == false) { - element.put("#text", trimmedText); - } - elementValue = (forceArray && forceContent) ? 
applyForceArray(elementName, element) : element; } // If this is the root element, store the result From 98ee201b172cd11602c602186cd058dcb083e1c2 Mon Sep 17 00:00:00 2001 From: Joe Gallo Date: Mon, 15 Sep 2025 16:55:05 -0400 Subject: [PATCH 41/54] Avoid an unnecessary containsKey --- .../java/org/elasticsearch/ingest/common/XmlProcessor.java | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index abf883dbbf958..3e2b5d829db2f 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -376,11 +376,8 @@ public String getPrefix(String namespaceURI) { if (namespaceURI == null) { throw new IllegalArgumentException("namespaceURI cannot be null"); } - if (uriToPrefixes.containsKey(namespaceURI)) { - return uriToPrefixes.get(namespaceURI).iterator().next(); - } else { - return null; - } + Set prefixes = uriToPrefixes.get(namespaceURI); + return (prefixes == null || prefixes.isEmpty()) ? null : prefixes.iterator().next(); } @Override From 01c7ea594f2858d247cc2d4edaf0815ea1f1971f Mon Sep 17 00:00:00 2001 From: Joe Gallo Date: Mon, 15 Sep 2025 16:55:19 -0400 Subject: [PATCH 42/54] Tighten up the collections handling here This now results in an immutable map around immutable sets, with no unmodifiable wrappers. 
--- .../elasticsearch/ingest/common/XmlProcessor.java | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 3e2b5d829db2f..1bbd464adc525 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -32,7 +32,6 @@ import java.lang.ref.SoftReference; import java.util.ArrayDeque; import java.util.ArrayList; -import java.util.Collections; import java.util.Deque; import java.util.HashMap; import java.util.HashSet; @@ -356,11 +355,15 @@ private static Map compileXPathExpressions( boolean hasNamespaces = namespaces.isEmpty() == false; if (hasNamespaces) { // build a read-only reverse map for quick look up - Map> uriToPrefixes = new HashMap<>(); - for (Map.Entry entry : namespaces.entrySet()) { - uriToPrefixes.computeIfAbsent(entry.getValue(), k -> new HashSet<>()).add(entry.getKey()); + final Map> uriToPrefixes; + { + Map> innerUriToPrefixes = new HashMap<>(); + for (Map.Entry entry : namespaces.entrySet()) { + innerUriToPrefixes.computeIfAbsent(entry.getValue(), k -> new HashSet<>()).add(entry.getKey()); + } + innerUriToPrefixes.replaceAll((k, v) -> Set.copyOf(v)); + uriToPrefixes = Map.copyOf(innerUriToPrefixes); } - uriToPrefixes.replaceAll((k, v) -> Collections.unmodifiableSet(v)); xpath.setNamespaceContext(new NamespaceContext() { @Override From 43cd0c8c9381b1b244ef9b34484b91ae70a044a2 Mon Sep 17 00:00:00 2001 From: Joe Gallo Date: Tue, 16 Sep 2025 13:53:26 -0400 Subject: [PATCH 43/54] Invert these conditionals for consistency reasons --- .../org/elasticsearch/ingest/common/XmlProcessor.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 1bbd464adc525..9bacc95b86865 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -666,7 +666,7 @@ public void characters(char[] ch, int start, int length) { // Add to DOM text node if needed if (buildDom && domElementStack.isEmpty() == false) { String text = new String(ch, start, length); - if (text.isBlank() == false || removeEmptyValues == false) { + if (removeEmptyValues == false || text.isBlank() == false) { Text textNode = domDocument.createTextNode(text); domElementStack.peek().appendChild(textNode); } @@ -722,10 +722,10 @@ public void endElement(String uri, String localName, String qName) { } elementValue = contentMap; } else { - if (removeEmptyValues && isEmptyValue(trimmedText)) { - elementValue = null; - } else { + if (removeEmptyValues == false || isEmptyValue(trimmedText) == false) { elementValue = trimmedText; + } else { + elementValue = null; } } elementValue = applyForceArray(elementName, elementValue); From babffdb4607c59041a057fcc8a256b242524e162 Mon Sep 17 00:00:00 2001 From: Joe Gallo Date: Tue, 16 Sep 2025 13:55:35 -0400 Subject: [PATCH 44/54] Always use the utility function --- .../java/org/elasticsearch/ingest/common/XmlProcessor.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 9bacc95b86865..6d51c007d5c51 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -666,7 +666,7 @@ public 
void characters(char[] ch, int start, int length) { // Add to DOM text node if needed if (buildDom && domElementStack.isEmpty() == false) { String text = new String(ch, start, length); - if (removeEmptyValues == false || text.isBlank() == false) { + if (removeEmptyValues == false || isEmptyValue(text) == false) { Text textNode = domDocument.createTextNode(text); domElementStack.peek().appendChild(textNode); } @@ -689,7 +689,7 @@ public void endElement(String uri, String localName, String qName) { // Add repeated elements as arrays for (Map.Entry> entry : repeatedElements.entrySet()) { List values = entry.getValue(); - if (removeEmptyValues == false || values.isEmpty() == false) { + if (removeEmptyValues == false || isEmptyValue(values) == false) { element.put(entry.getKey(), values); } } From bbe8d6c05b712ec01f256f24ea0cc7ed2d1192dc Mon Sep 17 00:00:00 2001 From: Joe Gallo Date: Tue, 16 Sep 2025 13:58:40 -0400 Subject: [PATCH 45/54] This can be static --- .../main/java/org/elasticsearch/ingest/common/XmlProcessor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 6d51c007d5c51..29a27cad0cce2 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -217,7 +217,7 @@ public String getType() { * @param value the value to check * @return true if the value should be considered empty */ - private boolean isEmptyValue(Object value) { + private static boolean isEmptyValue(Object value) { if (value == null) { return true; } From 071f60ac1cb5b4bd7ebb086711274a5330349261 Mon Sep 17 00:00:00 2001 From: Joe Gallo Date: Tue, 16 Sep 2025 14:00:48 -0400 Subject: [PATCH 46/54] This doesn't need to handle the general case The callers are always either 
static-ly a String or a List, so let's just have static arities for exactly and only those versions. --- .../ingest/common/XmlProcessor.java | 36 +++++-------------- 1 file changed, 8 insertions(+), 28 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 29a27cad0cce2..afca47de959d7 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -203,34 +203,14 @@ public String getType() { return TYPE; } - /** - * Determines if a value should be considered empty for filtering purposes. - * Used by the remove_empty_values feature to filter out empty content. - *

            - * Considers empty:

              - *
            • null values - *
            • empty or whitespace-only strings - *
            • empty Maps - *
            • empty Lists - *
            - * - * @param value the value to check - * @return true if the value should be considered empty - */ - private static boolean isEmptyValue(Object value) { - if (value == null) { - return true; - } - if (value instanceof String string) { - return string.isBlank(); - } - if (value instanceof Map map) { - return map.isEmpty(); - } - if (value instanceof List list) { - return list.isEmpty(); - } - return false; + // for use with remove_empty_values to filter out empty content + private static boolean isEmptyValue(List value) { + return value == null || value.isEmpty(); + } + + // for use with remove_empty_values to filter out empty content + private static boolean isEmptyValue(String value) { + return value == null || value.isBlank(); } /** From da00dab07c10cce4b19384c8d414df56e6fbf205 Mon Sep 17 00:00:00 2001 From: Joe Gallo Date: Tue, 16 Sep 2025 14:01:23 -0400 Subject: [PATCH 47/54] Whitespace They don't make a drug for what's wrong with me. --- .../java/org/elasticsearch/ingest/common/XmlProcessor.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index afca47de959d7..8e37182606a74 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -253,7 +253,7 @@ private String getNodeValue(Node node) { private Object applyForceArray(String ignoredElementName, Object content) { if (forceArray && (content instanceof List) == false) { List arrayContent = new ArrayList<>(); - arrayContent.add(content); // Add content even if it's null (for empty elements) + arrayContent.add(content); // Add content even if it's null (for empty elements) return arrayContent; } return content; @@ -841,7 +841,7 @@ private static SAXParserFactory 
createSecureSaxParserFactoryNamespaceAware() { private static DocumentBuilderFactory createSecureDocumentBuilderFactory() { try { DocumentBuilderFactory factory = XmlUtils.getHardenedBuilderFactory(); - factory.setValidating(false); // Override validation for DOM creation + factory.setValidating(false); // Override validation for DOM creation return factory; } catch (Exception e) { logger.warn("Cannot configure secure DOM builder factory - XML processor may not work correctly", e); From a4e6a93338d2a035a6c5cc57a49b2efa9ade39e0 Mon Sep 17 00:00:00 2001 From: Joe Gallo Date: Tue, 16 Sep 2025 14:06:01 -0400 Subject: [PATCH 48/54] Only trim the text if it's not blank --- .../elasticsearch/ingest/common/XmlProcessor.java | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 8e37182606a74..8a4d4a02d356f 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -675,10 +675,14 @@ public void endElement(String uri, String localName, String qName) { } // Process text content and determine final element structure - String textContentString = textContent.toString(); - String trimmedText = textContentString.trim(); - boolean hasText = textContentString.isBlank() == false; - boolean hasChildren = element.isEmpty() == false; + final boolean hasText; + final String trimmedText; + { + String textContentString = textContent.toString(); + hasText = textContentString.isBlank() == false; + trimmedText = hasText ? 
textContentString.trim() : null; // only trim the text if it's not blank + } + final boolean hasChildren = element.isEmpty() == false; Object elementValue; if (hasChildren) { From 4336cfc4d1cec5e2346adf4f8b75bdbe06c622bf Mon Sep 17 00:00:00 2001 From: Joe Gallo Date: Thu, 18 Sep 2025 10:10:19 -0400 Subject: [PATCH 49/54] Handle these options in the opposite order --- .../ingest/common/XmlProcessor.java | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 431945641811a..bfec9cf17c01e 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -417,18 +417,18 @@ public XmlProcessor create( boolean forceContent = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "force_content", false); boolean forceArray = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "force_array", false); - // Parse XPath expressions map - Map xpathExpressions = new HashMap<>(); - Map xpathConfig = ConfigurationUtils.readOptionalMap(TYPE, processorTag, config, "xpath"); - if (xpathConfig != null) { - for (Map.Entry entry : xpathConfig.entrySet()) { + // Parse namespaces map + Map namespaces = new HashMap<>(); + Map namespaceConfig = ConfigurationUtils.readOptionalMap(TYPE, processorTag, config, "namespaces"); + if (namespaceConfig != null) { + for (Map.Entry entry : namespaceConfig.entrySet()) { if (entry.getValue() instanceof String str) { - xpathExpressions.put(entry.getKey(), str); + namespaces.put(entry.getKey(), str); } else { throw new IllegalArgumentException( - "XPath target field [" + "Namespace prefix [" + entry.getKey() - + "] must be a string, got [" + + "] must have a string URI, got [" + 
entry.getValue().getClass().getSimpleName() + "]" ); @@ -436,18 +436,18 @@ public XmlProcessor create( } } - // Parse namespaces map - Map namespaces = new HashMap<>(); - Map namespaceConfig = ConfigurationUtils.readOptionalMap(TYPE, processorTag, config, "namespaces"); - if (namespaceConfig != null) { - for (Map.Entry entry : namespaceConfig.entrySet()) { + // Parse XPath expressions map + Map xpathExpressions = new HashMap<>(); + Map xpathConfig = ConfigurationUtils.readOptionalMap(TYPE, processorTag, config, "xpath"); + if (xpathConfig != null) { + for (Map.Entry entry : xpathConfig.entrySet()) { if (entry.getValue() instanceof String str) { - namespaces.put(entry.getKey(), str); + xpathExpressions.put(entry.getKey(), str); } else { throw new IllegalArgumentException( - "Namespace prefix [" + "XPath target field [" + entry.getKey() - + "] must have a string URI, got [" + + "] must be a string, got [" + entry.getValue().getClass().getSimpleName() + "]" ); From 9218fd92cc157f748c1fc2a334b8c8757c4bd272 Mon Sep 17 00:00:00 2001 From: Joe Gallo Date: Thu, 18 Sep 2025 10:11:16 -0400 Subject: [PATCH 50/54] Use better variable names here --- .../ingest/common/XmlProcessor.java | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index bfec9cf17c01e..0f0789ede68ac 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -422,12 +422,13 @@ public XmlProcessor create( Map namespaceConfig = ConfigurationUtils.readOptionalMap(TYPE, processorTag, config, "namespaces"); if (namespaceConfig != null) { for (Map.Entry entry : namespaceConfig.entrySet()) { - if (entry.getValue() instanceof String str) { - namespaces.put(entry.getKey(), str); + 
String prefix = entry.getKey(); + if (entry.getValue() instanceof String namespace) { + namespaces.put(prefix, namespace); } else { throw new IllegalArgumentException( "Namespace prefix [" - + entry.getKey() + + prefix + "] must have a string URI, got [" + entry.getValue().getClass().getSimpleName() + "]" @@ -441,13 +442,14 @@ public XmlProcessor create( Map xpathConfig = ConfigurationUtils.readOptionalMap(TYPE, processorTag, config, "xpath"); if (xpathConfig != null) { for (Map.Entry entry : xpathConfig.entrySet()) { - if (entry.getValue() instanceof String str) { - xpathExpressions.put(entry.getKey(), str); + String xpathExpression = entry.getKey(); + if (entry.getValue() instanceof String xpathTargetField) { + xpathExpressions.put(xpathExpression, xpathTargetField); } else { throw new IllegalArgumentException( - "XPath target field [" - + entry.getKey() - + "] must be a string, got [" + "XPath [" + + xpathExpression + + "] target field must be a string, got [" + entry.getValue().getClass().getSimpleName() + "]" ); From 980a7d40d99ee7a7a12fc46d665ceac8c6c007a0 Mon Sep 17 00:00:00 2001 From: Joe Gallo Date: Thu, 18 Sep 2025 10:27:51 -0400 Subject: [PATCH 51/54] Handle this validation in the factory --- .../ingest/common/XmlProcessor.java | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 0f0789ede68ac..3fbc3a32fd48a 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -371,21 +371,10 @@ public Iterator getPrefixes(String namespaceURI) { }); } - // Use pre-compiled pattern to detect namespace prefixes - for (Map.Entry entry : xpathExpressions.entrySet()) { String xpathExpression = entry.getKey(); String 
targetFieldName = entry.getValue(); - // Validate namespace prefixes if no namespaces are configured - if (hasNamespaces == false && NAMESPACE_PATTERN.matcher(xpathExpression).find()) { - throw new IllegalArgumentException( - "Invalid XPath expression [" - + xpathExpression - + "]: contains namespace prefixes but no namespace configuration provided" - ); - } - try { XPathExpression compiledExpression = xpath.compile(xpathExpression); compiled.put(targetFieldName, compiledExpression); @@ -444,6 +433,15 @@ public XmlProcessor create( for (Map.Entry entry : xpathConfig.entrySet()) { String xpathExpression = entry.getKey(); if (entry.getValue() instanceof String xpathTargetField) { + + // If no namespaces are configured, then reject xpath expressions that contain namespaces + if (namespaces.isEmpty() && NAMESPACE_PATTERN.matcher(xpathExpression).find()) { + throw new IllegalArgumentException( + "Invalid XPath expression [" + + xpathExpression + + "]: contains namespace prefixes but no namespace configuration provided" + ); + } xpathExpressions.put(xpathExpression, xpathTargetField); } else { throw new IllegalArgumentException( From 68920c4344ae32b31c968d0d56f4f29cbc779872 Mon Sep 17 00:00:00 2001 From: Joe Gallo Date: Thu, 18 Sep 2025 10:40:47 -0400 Subject: [PATCH 52/54] Hoist the namespace context into a field --- .../ingest/common/XmlProcessor.java | 102 ++++++++++-------- 1 file changed, 55 insertions(+), 47 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 3fbc3a32fd48a..96bf6ee2ca069 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -13,6 +13,7 @@ import org.apache.logging.log4j.Logger; import org.elasticsearch.cluster.metadata.ProjectId; import 
org.elasticsearch.common.Strings; +import org.elasticsearch.core.Nullable; import org.elasticsearch.core.XmlUtils; import org.elasticsearch.ingest.AbstractProcessor; import org.elasticsearch.ingest.ConfigurationUtils; @@ -99,6 +100,8 @@ private static class XmlFactories { private final Map xpathExpressions; private final Map namespaces; + @Nullable + private final NamespaceContext namespaceContext; private final Map compiledXPathExpressions; private final boolean needsDom; private final SAXParserFactory factory; @@ -128,11 +131,11 @@ private static class XmlFactories { this.removeNamespaces = removeNamespaces; this.forceContent = forceContent; this.forceArray = forceArray; - this.xpathExpressions = xpathExpressions != null ? Map.copyOf(xpathExpressions) : Map.of(); this.namespaces = namespaces != null ? Map.copyOf(namespaces) : Map.of(); - this.compiledXPathExpressions = compileXPathExpressions(this.xpathExpressions, this.namespaces); + this.namespaceContext = buildNamespaceContext(this.namespaces); + this.compiledXPathExpressions = compileXPathExpressions(this.namespaceContext, this.xpathExpressions); this.needsDom = this.xpathExpressions.isEmpty() == false; this.factory = selectSaxParserFactory(this.namespaces.isEmpty() == false || removeNamespaces); } @@ -308,67 +311,72 @@ private void processXPathExpressionsFromDom(IngestDocument document, Document xm } } + private static NamespaceContext buildNamespaceContext(Map namespaces) { + if (namespaces == null || namespaces.isEmpty()) { + return null; + } + + // build a read-only reverse map for quick look up + final Map> uriToPrefixes; + { + Map> innerUriToPrefixes = new HashMap<>(); + for (Map.Entry entry : namespaces.entrySet()) { + innerUriToPrefixes.computeIfAbsent(entry.getValue(), k -> new HashSet<>()).add(entry.getKey()); + } + innerUriToPrefixes.replaceAll((k, v) -> Set.copyOf(v)); + uriToPrefixes = Map.copyOf(innerUriToPrefixes); + } + + return new NamespaceContext() { + @Override + public String 
getNamespaceURI(String prefix) { + if (prefix == null) { + throw new IllegalArgumentException("Prefix cannot be null"); + } + return namespaces.getOrDefault(prefix, ""); + } + + @Override + public String getPrefix(String namespaceURI) { + if (namespaceURI == null) { + throw new IllegalArgumentException("namespaceURI cannot be null"); + } + Set prefixes = uriToPrefixes.get(namespaceURI); + return (prefixes == null || prefixes.isEmpty()) ? null : prefixes.iterator().next(); + } + + @Override + public Iterator getPrefixes(String namespaceURI) { + if (namespaceURI == null) { + throw new IllegalArgumentException("namespaceURI cannot be null"); + } + return uriToPrefixes.getOrDefault(namespaceURI, Set.of()).iterator(); + } + }; + } + /** * Compiles XPath expressions at processor creation time for optimal runtime performance. * This method pre-compiles all configured XPath expressions with appropriate namespace context, * eliminating the compilation overhead during document processing. * + * @param namespaceContext the namespace context for the XPath expressions, or null * @param xpathExpressions map of XPath expressions to target field names - * @param namespaces map of namespace prefixes to URIs * @return map of compiled XPath expressions keyed by target field name * @throws IllegalArgumentException if XPath compilation fails or namespace validation fails */ private static Map compileXPathExpressions( - Map xpathExpressions, - Map namespaces + NamespaceContext namespaceContext, + Map xpathExpressions ) { - if (xpathExpressions.isEmpty()) { + if (xpathExpressions == null || xpathExpressions.isEmpty()) { return Map.of(); } Map compiled = new HashMap<>(); XPath xpath = createSecureXPath(); - - // Set namespace context if namespaces are defined - boolean hasNamespaces = namespaces.isEmpty() == false; - if (hasNamespaces) { - // build a read-only reverse map for quick look up - final Map> uriToPrefixes; - { - Map> innerUriToPrefixes = new HashMap<>(); - for (Map.Entry entry 
: namespaces.entrySet()) { - innerUriToPrefixes.computeIfAbsent(entry.getValue(), k -> new HashSet<>()).add(entry.getKey()); - } - innerUriToPrefixes.replaceAll((k, v) -> Set.copyOf(v)); - uriToPrefixes = Map.copyOf(innerUriToPrefixes); - } - - xpath.setNamespaceContext(new NamespaceContext() { - @Override - public String getNamespaceURI(String prefix) { - if (prefix == null) { - throw new IllegalArgumentException("Prefix cannot be null"); - } - return namespaces.getOrDefault(prefix, ""); - } - - @Override - public String getPrefix(String namespaceURI) { - if (namespaceURI == null) { - throw new IllegalArgumentException("namespaceURI cannot be null"); - } - Set prefixes = uriToPrefixes.get(namespaceURI); - return (prefixes == null || prefixes.isEmpty()) ? null : prefixes.iterator().next(); - } - - @Override - public Iterator getPrefixes(String namespaceURI) { - if (namespaceURI == null) { - throw new IllegalArgumentException("namespaceURI cannot be null"); - } - return uriToPrefixes.getOrDefault(namespaceURI, Set.of()).iterator(); - } - }); + if (namespaceContext != null) { + xpath.setNamespaceContext(namespaceContext); } for (Map.Entry entry : xpathExpressions.entrySet()) { From 57ad518437df862ecdaf2120a36d13ce174419e1 Mon Sep 17 00:00:00 2001 From: Joe Gallo Date: Thu, 18 Sep 2025 11:11:59 -0400 Subject: [PATCH 53/54] Update this test string --- .../elasticsearch/ingest/common/XmlProcessorFactoryTests.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java index 7106bbf306809..cf75772f0fccf 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java @@ -144,7 +144,7 @@ public void 
testCreateWithInvalidXPathTargetField() throws Exception { expectCreationFailure( config, IllegalArgumentException.class, - "XPath target field [//author/text()] must be a string, got [Integer]" + "XPath [//author/text()] target field must be a string, got [Integer]" ); } From 88a03a34190c148a7a0e3e7094368be5bb0036e2 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Thu, 2 Oct 2025 07:33:33 +0000 Subject: [PATCH 54/54] [CI] Update transport version definitions --- server/src/main/resources/transport/upper_bounds/8.18.csv | 2 +- server/src/main/resources/transport/upper_bounds/8.19.csv | 2 +- server/src/main/resources/transport/upper_bounds/9.0.csv | 2 +- server/src/main/resources/transport/upper_bounds/9.1.csv | 2 +- server/src/main/resources/transport/upper_bounds/9.2.csv | 2 +- server/src/main/resources/transport/upper_bounds/9.3.csv | 1 + 6 files changed, 6 insertions(+), 5 deletions(-) create mode 100644 server/src/main/resources/transport/upper_bounds/9.3.csv diff --git a/server/src/main/resources/transport/upper_bounds/8.18.csv b/server/src/main/resources/transport/upper_bounds/8.18.csv index 4eb5140004ea6..266bfbbd3bf78 100644 --- a/server/src/main/resources/transport/upper_bounds/8.18.csv +++ b/server/src/main/resources/transport/upper_bounds/8.18.csv @@ -1 +1 @@ -initial_elasticsearch_8_18_6,8840008 +transform_check_for_dangling_tasks,8840011 diff --git a/server/src/main/resources/transport/upper_bounds/8.19.csv b/server/src/main/resources/transport/upper_bounds/8.19.csv index 476468b203875..3600b3f8c633a 100644 --- a/server/src/main/resources/transport/upper_bounds/8.19.csv +++ b/server/src/main/resources/transport/upper_bounds/8.19.csv @@ -1 +1 @@ -initial_elasticsearch_8_19_3,8841067 +transform_check_for_dangling_tasks,8841070 diff --git a/server/src/main/resources/transport/upper_bounds/9.0.csv b/server/src/main/resources/transport/upper_bounds/9.0.csv index f8f50cc6d7839..c11e6837bb813 100644 --- 
a/server/src/main/resources/transport/upper_bounds/9.0.csv +++ b/server/src/main/resources/transport/upper_bounds/9.0.csv @@ -1 +1 @@ -initial_elasticsearch_9_0_6,9000015 +transform_check_for_dangling_tasks,9000018 diff --git a/server/src/main/resources/transport/upper_bounds/9.1.csv b/server/src/main/resources/transport/upper_bounds/9.1.csv index 5a65f2e578156..80b97d85f7511 100644 --- a/server/src/main/resources/transport/upper_bounds/9.1.csv +++ b/server/src/main/resources/transport/upper_bounds/9.1.csv @@ -1 +1 @@ -initial_elasticsearch_9_1_4,9112007 +transform_check_for_dangling_tasks,9112009 diff --git a/server/src/main/resources/transport/upper_bounds/9.2.csv b/server/src/main/resources/transport/upper_bounds/9.2.csv index e24f914a1d1ca..2147eab66c207 100644 --- a/server/src/main/resources/transport/upper_bounds/9.2.csv +++ b/server/src/main/resources/transport/upper_bounds/9.2.csv @@ -1 +1 @@ -ml_inference_endpoint_cache,9157000 +initial_9.2.0,9185000 diff --git a/server/src/main/resources/transport/upper_bounds/9.3.csv b/server/src/main/resources/transport/upper_bounds/9.3.csv new file mode 100644 index 0000000000000..2147eab66c207 --- /dev/null +++ b/server/src/main/resources/transport/upper_bounds/9.3.csv @@ -0,0 +1 @@ +initial_9.2.0,9185000