diff --git a/docs/changelog/130337.yaml b/docs/changelog/130337.yaml new file mode 100644 index 0000000000000..710c48f6367c7 --- /dev/null +++ b/docs/changelog/130337.yaml @@ -0,0 +1,6 @@ +pr: 130337 +summary: Add `xml` ingest processor for parsing XML +area: Ingest Node +type: enhancement +issues: + - 97364 diff --git a/docs/reference/enrich-processor/index.md b/docs/reference/enrich-processor/index.md index 55cb707006f54..fdf15602289a9 100644 --- a/docs/reference/enrich-processor/index.md +++ b/docs/reference/enrich-processor/index.md @@ -165,6 +165,9 @@ Refer to [Enrich your data](docs-content://manage-data/ingest/transform-enrich/d [`trim` processor](/reference/enrich-processor/trim-processor.md) : Trims whitespace from field. +[`xml` processor](/reference/enrich-processor/xml-processor.md) {applies_to}`stack: ga 9.2` +: Parses XML documents and converts them to JSON objects. + ## Data filtering processors [ingest-process-category-data-filtering] diff --git a/docs/reference/enrich-processor/toc.yml b/docs/reference/enrich-processor/toc.yml index 2a2d30ee86020..623c521b3832f 100644 --- a/docs/reference/enrich-processor/toc.yml +++ b/docs/reference/enrich-processor/toc.yml @@ -47,3 +47,4 @@ toc: - file: urldecode-processor.md - file: uri-parts-processor.md - file: user-agent-processor.md + - file: xml-processor.md diff --git a/docs/reference/enrich-processor/xml-processor.md b/docs/reference/enrich-processor/xml-processor.md new file mode 100644 index 0000000000000..93850168830f7 --- /dev/null +++ b/docs/reference/enrich-processor/xml-processor.md @@ -0,0 +1,485 @@ +--- +navigation_title: "XML" +applies_to: + stack: ga 9.2 +mapped_pages: + - https://www.elastic.co/guide/en/elasticsearch/reference/current/xml-processor.html +--- + +# XML processor [xml-processor] + +Parses XML documents and converts them to JSON objects using a streaming SAX parser. 
This processor efficiently handles XML data with a single-pass architecture that supports both structured output and XPath extraction for optimal performance. + +$$$xml-options$$$ + +| Name | Required | Default | Description | +| --- | --- | --- | --- | +| `field` | yes | - | The field containing the XML string to be parsed. | +| `target_field` | no | `field` | The field that the converted structured object will be written into. Any existing content in this field will be overwritten. | +| `store_xml` | no | `true` | If `true`, stores the parsed XML structure in the target field. If `false`, only XPath extraction results are stored and `target_field` is ignored. | +| `ignore_missing` | no | `false` | If `true` and `field` does not exist, the processor quietly exits without modifying the document. | +| `ignore_failure` | no | `false` | Ignore failures for the processor. See [Handling pipeline failures](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#handling-pipeline-failures). | +| `to_lower` | no | `false` | Convert XML element names and attribute names to lowercase. | +| `remove_empty_values` | no | `false` | If `true`, the processor will filter out null and empty values from the parsed XML structure, including empty elements, elements with null values, and elements with whitespace-only content. | +| `remove_namespaces` | no | `false` | If `true`, removes namespace prefixes from element and attribute names. | +| `force_content` | no | `false` | If `true`, forces text content and attributes to always parse to a hash value with `#text` key for content. | +| `force_array` | no | `false` | If `true`, forces all parsed values to be arrays. Single elements are wrapped in arrays. | +| `xpath` | no | - | Map of XPath expressions to target field names. Extracts values from the XML using XPath and stores them in the specified fields. | +| `namespaces` | no | - | Map of namespace prefixes to URIs for use with XPath expressions. 
Required when XPath expressions contain namespace prefixes. | +| `description` | no | - | Description of the processor. Useful for describing the purpose of the processor or its configuration. | +| `if` | no | - | Conditionally execute the processor. See [Conditionally run a processor](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#conditionally-run-processor). | +| `on_failure` | no | - | Handle failures for the processor. See [Handling pipeline failures](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#handling-pipeline-failures). | +| `tag` | no | - | Identifier for the processor. Useful for debugging and metrics. | + +## Configuration + +```js +{ + "xml": { + "field": "xml_field", + "target_field": "parsed_xml", + "remove_empty_values": true + } +} +``` + +## Examples + +### Basic XML parsing + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content" + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "William H. GaddisThe RecognitionsOne of the great seminal American novels." + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "William H. GaddisThe RecognitionsOne of the great seminal American novels.", + "catalog": { + "book": { + "author": "William H. Gaddis", + "title": "The Recognitions", + "review": "One of the great seminal American novels." + } + } + } + } + } + ] +} +``` + +### Filtering empty values + +When `remove_empty_values` is set to `true`, the processor will remove empty elements from the parsed XML: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "target_field": "parsed_xml", + "remove_empty_values": true + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "William H. GaddisOne of the great seminal American novels. 
Some content" + } + } + ] +} +``` + +Result with empty elements filtered out: + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "William H. GaddisOne of the great seminal American novels. Some content", + "parsed_xml": { + "catalog": { + "book": { + "author": "William H. Gaddis", + "review": "One of the great seminal American novels.", + "nested": { + "valid_content": "Some content" + } + } + } + } + } + } + } + ] +} +``` + +### Handling XML attributes + +XML attributes are included as properties in the resulting JSON object alongside element content: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content" + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "The RecognitionsWilliam H. Gaddis" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "The RecognitionsWilliam H. Gaddis", + "catalog": { + "version": "1.0", + "book": { + "id": "123", + "isbn": "978-0-684-80335-9", + "title": { + "lang": "en", + "#text": "The Recognitions" + }, + "author": { + "nationality": "American", + "#text": "William H. Gaddis" + } + } + } + } + } + } + ] +} +``` + +### XPath extraction + +The XML processor can extract specific values using XPath expressions and store them in designated fields: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "store_xml": false, + "xpath": { + "//book/title/text()": "book_title", + "//book/author/text()": "book_author", + "//book/@id": "book_id" + } + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "The RecognitionsWilliam H. Gaddis1984George Orwell" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "The RecognitionsWilliam H. 
Gaddis1984George Orwell", + "book_title": ["The Recognitions", "1984"], + "book_author": ["William H. Gaddis", "George Orwell"], + "book_id": ["123", "456"] + } + } + } + ] +} +``` + +### XPath with namespaces + +When working with XML that uses namespaces, you need to configure namespace mappings: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "namespaces": { + "book": "http://example.com/book", + "author": "http://example.com/author" + }, + "xpath": { + "//book:catalog/book:item/book:title/text()": "titles", + "//author:info/@name": "author_names" + } + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "The Recognitions" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "The Recognitions", + "titles": "The Recognitions", + "author_names": "William H. Gaddis", + "book:catalog": { + "book:item": { + "book:title": "The Recognitions", + "author:info": { + "name": "William H. Gaddis" + } + } + } + } + } + } + ] +} +``` + +### Mixed content handling + +When XML contains mixed content (text interspersed with elements), text fragments are combined and stored under the special `#text` key: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content" + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "This text is bold and this is italic!" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "This text is bold and this is italic!", + "foo": { + "b": "bold", + "i": "italic", + "#text": "This text is and this is !" + } + } + } + } + ] +} +``` + +### Force content mode + +When `force_content` is `true`, all element text content is stored under the special `#text` key, even for simple elements without attributes. 
This provides a consistent structure when elements may have varying complexity. + +**Without force_content (default behavior):** + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "force_content": false + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "The RecognitionsWilliam H. Gaddis" + } + } + ] +} +``` + +Result (simple elements as string values, complex elements with #text): + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "The RecognitionsWilliam H. Gaddis", + "book": { + "title": "The Recognitions", + "author": { + "nationality": "American", + "#text": "William H. Gaddis" + } + } + } + } + } + ] +} +``` + +**With force_content enabled:** + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "force_content": true + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "The RecognitionsWilliam H. Gaddis" + } + } + ] +} +``` + +Result (all text content under #text key): + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "The RecognitionsWilliam H. Gaddis", + "book": { + "title": { + "#text": "The Recognitions" + }, + "author": { + "nationality": "American", + "#text": "William H. Gaddis" + } + } + } + } + } + ] +} +``` + +## XML features + +The XML processor supports: + +- **Elements with text content**: Converted to key-value pairs where the element name is the key and text content is the value +- **Nested elements**: Converted to nested JSON objects +- **Empty elements**: Converted to `null` values (can be filtered with `remove_empty_values`) +- **Repeated elements**: Converted to arrays when multiple elements with the same name exist at the same level +- **XML attributes**: Included as properties in the JSON object alongside element content. 
When an element has both attributes and text content, the text is stored under a special `#text` key +- **Mixed content**: Elements with both text and child elements include text under a special `#text` key while attributes and child elements become object properties +- **Namespaces**: Namespace prefixes are preserved by default and can be used in XPath expressions with the `namespaces` configuration. Use `remove_namespaces: true` to strip namespace prefixes from element names diff --git a/modules/ingest-common/src/main/java/module-info.java b/modules/ingest-common/src/main/java/module-info.java index c3b3ab90892d9..84e30519d2d1b 100644 --- a/modules/ingest-common/src/main/java/module-info.java +++ b/modules/ingest-common/src/main/java/module-info.java @@ -20,6 +20,8 @@ requires org.apache.lucene.analysis.common; requires org.jruby.joni; + requires java.xml; + exports org.elasticsearch.ingest.common; // for painless opens org.elasticsearch.ingest.common to org.elasticsearch.painless.spi; // whitelist resource access diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java index 31f7034c2fd88..7ebf357a6f56e 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java @@ -76,7 +76,8 @@ public Map getProcessors(Processor.Parameters paramet entry(TrimProcessor.TYPE, new TrimProcessor.Factory()), entry(URLDecodeProcessor.TYPE, new URLDecodeProcessor.Factory()), entry(UppercaseProcessor.TYPE, new UppercaseProcessor.Factory()), - entry(UriPartsProcessor.TYPE, new UriPartsProcessor.Factory()) + entry(UriPartsProcessor.TYPE, new UriPartsProcessor.Factory()), + entry(XmlProcessor.TYPE, new XmlProcessor.Factory()) ); } diff --git 
a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java new file mode 100644 index 0000000000000..96bf6ee2ca069 --- /dev/null +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -0,0 +1,897 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.ingest.common; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.elasticsearch.cluster.metadata.ProjectId; +import org.elasticsearch.common.Strings; +import org.elasticsearch.core.Nullable; +import org.elasticsearch.core.XmlUtils; +import org.elasticsearch.ingest.AbstractProcessor; +import org.elasticsearch.ingest.ConfigurationUtils; +import org.elasticsearch.ingest.IngestDocument; +import org.elasticsearch.ingest.Processor; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.w3c.dom.Text; +import org.xml.sax.Attributes; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import java.io.StringReader; +import java.lang.ref.SoftReference; +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.Deque; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import 
java.util.concurrent.ConcurrentHashMap; +import java.util.regex.Pattern; + +import javax.xml.namespace.NamespaceContext; +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; +import javax.xml.xpath.XPath; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathExpression; +import javax.xml.xpath.XPathExpressionException; + +/** + * Processor that parses XML documents and converts them to JSON objects using a single-pass streaming approach. + *

+ * Features: + *

+ */ +public final class XmlProcessor extends AbstractProcessor { + + public static final String TYPE = "xml"; + private static final Logger logger = LogManager.getLogger(XmlProcessor.class); + + // Pre-compiled pattern to detect namespace prefixes + private static final Pattern NAMESPACE_PATTERN = Pattern.compile("\\b[a-zA-Z][a-zA-Z0-9_-]*:[a-zA-Z][a-zA-Z0-9_-]*"); + + /** + * Lazily-initialized XML factories to avoid node startup failures if the JDK doesn't support required functionality. + * This inner class will only be loaded when XML processing is actually used. + */ + private static class XmlFactories { + // Pre-configured secure XML parser factories using XmlUtils + static final SAXParserFactory SAX_PARSER_FACTORY = createSecureSaxParserFactory(); + static final SAXParserFactory SAX_PARSER_FACTORY_NS = createSecureSaxParserFactoryNamespaceAware(); + + // Pre-configured secure document builder factory for DOM creation + static final DocumentBuilderFactory DOM_FACTORY = createSecureDocumentBuilderFactory(); + } + + private final String field; + private final String targetField; + private final boolean ignoreMissing; + private final boolean toLower; + private final boolean removeEmptyValues; + private final boolean storeXml; + private final boolean removeNamespaces; + private final boolean forceContent; + private final boolean forceArray; + private final Map xpathExpressions; + private final Map namespaces; + + @Nullable + private final NamespaceContext namespaceContext; + private final Map compiledXPathExpressions; + private final boolean needsDom; + private final SAXParserFactory factory; + + XmlProcessor( + String tag, + String description, + String field, + String targetField, + boolean ignoreMissing, + boolean toLower, + boolean removeEmptyValues, + boolean storeXml, + boolean removeNamespaces, + boolean forceContent, + boolean forceArray, + Map xpathExpressions, + Map namespaces + ) { + super(tag, description); + this.field = field; + this.targetField = 
targetField; + this.ignoreMissing = ignoreMissing; + this.toLower = toLower; + this.removeEmptyValues = removeEmptyValues; + this.storeXml = storeXml; + this.removeNamespaces = removeNamespaces; + this.forceContent = forceContent; + this.forceArray = forceArray; + this.xpathExpressions = xpathExpressions != null ? Map.copyOf(xpathExpressions) : Map.of(); + this.namespaces = namespaces != null ? Map.copyOf(namespaces) : Map.of(); + + this.namespaceContext = buildNamespaceContext(this.namespaces); + this.compiledXPathExpressions = compileXPathExpressions(this.namespaceContext, this.xpathExpressions); + this.needsDom = this.xpathExpressions.isEmpty() == false; + this.factory = selectSaxParserFactory(this.namespaces.isEmpty() == false || removeNamespaces); + } + + public String getField() { + return field; + } + + public String getTargetField() { + return targetField; + } + + public boolean isIgnoreMissing() { + return ignoreMissing; + } + + public boolean isRemoveEmptyValues() { + return removeEmptyValues; + } + + public boolean isStoreXml() { + return storeXml; + } + + public boolean isRemoveNamespaces() { + return removeNamespaces; + } + + public boolean isForceContent() { + return forceContent; + } + + public boolean isForceArray() { + return forceArray; + } + + public Map getNamespaces() { + return namespaces; + } + + @Override + public IngestDocument execute(IngestDocument document) { + String input = document.getFieldValue(field, String.class, ignoreMissing); + if (input == null && ignoreMissing) { + return document; + } else if (input == null) { + throw new IllegalArgumentException("field [" + field + "] is null, cannot parse XML"); + } + + // TODO this just seems like misconfiguration to me. Why should we allow this at all?
+ if (storeXml == false && xpathExpressions.isEmpty()) { + return document; + } + + try { + parseXmlAndXPath(document, input.trim()); + } catch (Exception e) { + throw new IllegalArgumentException("field [" + field + "] contains invalid XML", e); + } + + return document; + } + + @Override + public String getType() { + return TYPE; + } + + // for use with remove_empty_values to filter out empty content + private static boolean isEmptyValue(List value) { + return value == null || value.isEmpty(); + } + + // for use with remove_empty_values to filter out empty content + private static boolean isEmptyValue(String value) { + return value == null || value.isBlank(); + } + + /** + * Extract the text value from a DOM node for XPath result processing. + * Handles different node types appropriately:
    + *
+     * <ul>
+     * <li>TEXT_NODE and CDATA_SECTION_NODE: returns node value directly</li>
+     * <li>ATTRIBUTE_NODE: returns attribute value</li>
+     * <li>ELEMENT_NODE: returns text content (concatenated text of all descendants)</li>
+     * <li>Other node types: returns text content as fallback</li>
+     * </ul>
+ * + * @param node the DOM node to extract text from + * @return the text content of the node, or null if node is null + */ + private String getNodeValue(Node node) { + if (node == null) { + return null; + } + + return switch (node.getNodeType()) { + case Node.ATTRIBUTE_NODE, Node.CDATA_SECTION_NODE, Node.TEXT_NODE -> node.getNodeValue(); + default -> node.getTextContent(); + }; + } + + /** + * Applies force_array logic to ensure all fields are arrays when enabled. + *

+     * <p>
+     * Behavior:
+     * <ul>
+     * <li>If force_array is false: returns content unchanged</li>
+     * <li>If force_array is true and content is already a List: returns content unchanged</li>
+     * <li>If force_array is true and content is not a List: wraps content in a new ArrayList</li>
+     * <li>Null content is not special-cased: if force_array is true, null is wrapped in a single-element list</li>
+     * </ul>
+ * + * @param ignoredElementName the name of the element (for context, not used in current implementation) + * @param content the content to potentially wrap in an array + * @return the content, optionally wrapped in an array based on force_array setting + */ + private Object applyForceArray(String ignoredElementName, Object content) { + if (forceArray && (content instanceof List) == false) { + List arrayContent = new ArrayList<>(); + arrayContent.add(content); // Add content even if it's null (for empty elements) + return arrayContent; + } + return content; + } + + /** + * Evaluates precompiled XPath expressions against a DOM document and adds results to the ingest document. + *

+     * <p>
+     * Features:
+     * <ul>
+     * <li>Uses precompiled XPath expressions for optimal performance</li>
+     * <li>Extracts text values from matched nodes (elements, attributes, text nodes)</li>
+     * <li>Single matches stored as strings, multiple matches as string arrays</li>
+     * </ul>
+ * + * @param document the ingest document to add XPath results to + * @param xmlDocument the DOM document to evaluate XPath expressions against + * @throws XPathExpressionException if XPath processing fails + */ + private void processXPathExpressionsFromDom(IngestDocument document, Document xmlDocument) throws XPathExpressionException { + // Use precompiled XPath expressions for optimal performance + for (Map.Entry entry : compiledXPathExpressions.entrySet()) { + String targetFieldName = entry.getKey(); + XPathExpression compiledExpression = entry.getValue(); + Object result = compiledExpression.evaluate(xmlDocument, XPathConstants.NODESET); + + if (result instanceof NodeList nodeList) { + // separate the case for 1 vs multiple nodeList elements to avoid unnecessary array allocation, this optimization is only + // done because this is a per-document hot code path + if (nodeList.getLength() == 1) { + Node node = nodeList.item(0); + String value = getNodeValue(node); + if (Strings.hasText(value)) { + document.setFieldValue(targetFieldName, value); + } + } else if (nodeList.getLength() > 1) { + List values = new ArrayList<>(); + for (int i = 0; i < nodeList.getLength(); i++) { + Node node = nodeList.item(i); + String value = getNodeValue(node); + if (Strings.hasText(value)) { + values.add(value); + } + } + + if (values.isEmpty() == false) { + if (values.size() == 1) { + document.setFieldValue(targetFieldName, values.getFirst()); + } else { + document.setFieldValue(targetFieldName, values); + } + } + } + } + } + } + + private static NamespaceContext buildNamespaceContext(Map namespaces) { + if (namespaces == null || namespaces.isEmpty()) { + return null; + } + + // build a read-only reverse map for quick look up + final Map> uriToPrefixes; + { + Map> innerUriToPrefixes = new HashMap<>(); + for (Map.Entry entry : namespaces.entrySet()) { + innerUriToPrefixes.computeIfAbsent(entry.getValue(), k -> new HashSet<>()).add(entry.getKey()); + } + 
innerUriToPrefixes.replaceAll((k, v) -> Set.copyOf(v)); + uriToPrefixes = Map.copyOf(innerUriToPrefixes); + } + + return new NamespaceContext() { + @Override + public String getNamespaceURI(String prefix) { + if (prefix == null) { + throw new IllegalArgumentException("Prefix cannot be null"); + } + return namespaces.getOrDefault(prefix, ""); + } + + @Override + public String getPrefix(String namespaceURI) { + if (namespaceURI == null) { + throw new IllegalArgumentException("namespaceURI cannot be null"); + } + Set prefixes = uriToPrefixes.get(namespaceURI); + return (prefixes == null || prefixes.isEmpty()) ? null : prefixes.iterator().next(); + } + + @Override + public Iterator getPrefixes(String namespaceURI) { + if (namespaceURI == null) { + throw new IllegalArgumentException("namespaceURI cannot be null"); + } + return uriToPrefixes.getOrDefault(namespaceURI, Set.of()).iterator(); + } + }; + } + + /** + * Compiles XPath expressions at processor creation time for optimal runtime performance. + * This method pre-compiles all configured XPath expressions with appropriate namespace context, + * eliminating the compilation overhead during document processing. 
+ * + * @param namespaceContext the namespace context for the XPath expressions, or null + * @param xpathExpressions map of XPath expressions to target field names + * @return map of compiled XPath expressions keyed by target field name + * @throws IllegalArgumentException if XPath compilation fails or namespace validation fails + */ + private static Map compileXPathExpressions( + NamespaceContext namespaceContext, + Map xpathExpressions + ) { + if (xpathExpressions == null || xpathExpressions.isEmpty()) { + return Map.of(); + } + + Map compiled = new HashMap<>(); + XPath xpath = createSecureXPath(); + if (namespaceContext != null) { + xpath.setNamespaceContext(namespaceContext); + } + + for (Map.Entry entry : xpathExpressions.entrySet()) { + String xpathExpression = entry.getKey(); + String targetFieldName = entry.getValue(); + + try { + XPathExpression compiledExpression = xpath.compile(xpathExpression); + compiled.put(targetFieldName, compiledExpression); + } catch (XPathExpressionException e) { + throw new IllegalArgumentException("Invalid XPath expression [" + xpathExpression + "]", e); + } + } + + return Map.copyOf(compiled); + } + + public static final class Factory implements Processor.Factory { + + @Override + public XmlProcessor create( + Map registry, + String processorTag, + String description, + Map config, + ProjectId projectId + ) throws Exception { + String field = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "field"); + String targetField = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "target_field", field); + boolean ignoreMissing = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_missing", false); + boolean toLower = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "to_lower", false); + boolean removeEmptyValues = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "remove_empty_values", false); + boolean storeXml = 
ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "store_xml", true); + boolean removeNamespaces = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "remove_namespaces", false); + boolean forceContent = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "force_content", false); + boolean forceArray = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "force_array", false); + + // Parse namespaces map + Map namespaces = new HashMap<>(); + Map namespaceConfig = ConfigurationUtils.readOptionalMap(TYPE, processorTag, config, "namespaces"); + if (namespaceConfig != null) { + for (Map.Entry entry : namespaceConfig.entrySet()) { + String prefix = entry.getKey(); + if (entry.getValue() instanceof String namespace) { + namespaces.put(prefix, namespace); + } else { + throw new IllegalArgumentException( + "Namespace prefix [" + + prefix + + "] must have a string URI, got [" + + entry.getValue().getClass().getSimpleName() + + "]" + ); + } + } + } + + // Parse XPath expressions map + Map xpathExpressions = new HashMap<>(); + Map xpathConfig = ConfigurationUtils.readOptionalMap(TYPE, processorTag, config, "xpath"); + if (xpathConfig != null) { + for (Map.Entry entry : xpathConfig.entrySet()) { + String xpathExpression = entry.getKey(); + if (entry.getValue() instanceof String xpathTargetField) { + + // If no namespaces are configured, then reject xpath expressions that contain namespaces + if (namespaces.isEmpty() && NAMESPACE_PATTERN.matcher(xpathExpression).find()) { + throw new IllegalArgumentException( + "Invalid XPath expression [" + + xpathExpression + + "]: contains namespace prefixes but no namespace configuration provided" + ); + } + xpathExpressions.put(xpathExpression, xpathTargetField); + } else { + throw new IllegalArgumentException( + "XPath [" + + xpathExpression + + "] target field must be a string, got [" + + entry.getValue().getClass().getSimpleName() + + "]" + ); + } + } + } + + return new 
XmlProcessor( + processorTag, + description, + field, + targetField, + ignoreMissing, + toLower, + removeEmptyValues, + storeXml, + removeNamespaces, + forceContent, + forceArray, + xpathExpressions, + namespaces + ); + } + } + + private static final Map>> PARSERS = new ConcurrentHashMap<>(); + static { + PARSERS.put(XmlFactories.SAX_PARSER_FACTORY, new ThreadLocal<>()); + PARSERS.put(XmlFactories.SAX_PARSER_FACTORY_NS, new ThreadLocal<>()); + } + + /** + * Main XML parsing method that converts XML to JSON and optionally extracts XPath values. + * Uses streaming SAX parser with optional DOM building for XPath processing. + * + * @param document the ingest document to modify with parsed results + * @param xml the XML string to parse (should be trimmed) + * @throws Exception if XML parsing fails + */ + private void parseXmlAndXPath(IngestDocument document, String xml) throws Exception { + if (Strings.hasText(xml) == false) { + return; + } + + final SAXParser parser = getParser(factory); + final XmlStreamingWithDomHandler handler; + try { + // Use enhanced handler that can build DOM during streaming when needed + handler = new XmlStreamingWithDomHandler(needsDom); + parser.parse(new InputSource(new StringReader(xml)), handler); + } finally { + parser.reset(); + } + + // Store structured result if needed + if (storeXml) { + Object streamingResult = handler.getStructuredResult(); + if (streamingResult != null) { + document.setFieldValue(targetField, streamingResult); + } + } + + // Process XPath expressions if DOM was built during streaming + if (needsDom) { + Document domDocument = handler.getDomDocument(); + assert domDocument != null : "DOM document should not be null when XPath processing is needed"; + processXPathExpressionsFromDom(document, domDocument); + } + } + + /** + * Gets the parser reference or creates a new one if the soft reference has been cleared. 
+ */ + private static SAXParser getParser(SAXParserFactory factory) throws SAXException, ParserConfigurationException { + final ThreadLocal> threadLocal = PARSERS.getOrDefault(factory, new ThreadLocal<>()); + final SoftReference parserReference = threadLocal.get(); + + SAXParser parser = parserReference != null ? parserReference.get() : null; + if (parser == null) { + parser = factory.newSAXParser(); + threadLocal.set(new SoftReference<>(parser)); + } + return parser; + } + + /** + * SAX ContentHandler that builds structured JSON output and optionally constructs a DOM tree during parsing. + * Handles XML-to-JSON conversion with support for all processor configuration options. + */ + private class XmlStreamingWithDomHandler extends DefaultHandler { + + /** + * Record to encapsulate the parsing state for each XML element level. + * Maintains the 1:1:1:1 relationship between element data, name, text content, and repeated elements. + */ + private record ElementParsingState( + Map element, + String elementName, + StringBuilder textContent, + Map> repeatedElements + ) {} + + // Streaming parser state (for structured output) + private final Deque elementStack = new ArrayDeque<>(); + private Object rootResult = null; + + // DOM building state (for XPath processing when needed) + private final boolean buildDom; + private Document domDocument = null; + private final Deque domElementStack = new ArrayDeque<>(); + + XmlStreamingWithDomHandler(boolean buildDom) { + this.buildDom = buildDom; + } + + @Override + public void startDocument() throws SAXException { + // Initialize DOM document if needed + if (buildDom) { + try { + // Use pre-configured secure DOM factory + // Since we build DOM programmatically (createElementNS/createElement), + // the factory's namespace awareness doesn't affect our usage + DocumentBuilder builder = XmlFactories.DOM_FACTORY.newDocumentBuilder(); + domDocument = builder.newDocument(); + } catch (Exception e) { + throw new SAXException("Failed to 
create DOM document", e); + } + } + } + + @Override + public void startElement(String uri, String localName, String qName, Attributes attributes) { + String elementName = getElementName(uri, localName, qName); + + // Build structured representation (always) + Map element = new HashMap<>(); + Map> repeatedElements = new HashMap<>(); + + // Process attributes for structured output + for (int i = 0; i < attributes.getLength(); i++) { + String attrName = getAttributeName(attributes.getURI(i), attributes.getLocalName(i), attributes.getQName(i)); + String attrValue = attributes.getValue(i); + + // Apply removeEmptyValues filtering to attributes + if (removeEmptyValues == false || isEmptyValue(attrValue) == false) { + element.put(attrName, attrValue); + } + } + + elementStack.push(new ElementParsingState(element, elementName, new StringBuilder(), repeatedElements)); + + // Build DOM element simultaneously if needed + if (buildDom && domDocument != null) { + Element domElement; + if (uri != null && uri.isEmpty() == false && removeNamespaces == false) { + domElement = domDocument.createElementNS(uri, qName); + } else { + domElement = domDocument.createElement(removeNamespaces ? localName : qName); + } + + // Add attributes to DOM element + for (int i = 0; i < attributes.getLength(); i++) { + String attrUri = attributes.getURI(i); + String attrLocalName = attributes.getLocalName(i); + String attrQName = attributes.getQName(i); + String attrValue = attributes.getValue(i); + + if (attrUri != null && attrUri.isEmpty() == false && removeNamespaces == false) { + domElement.setAttributeNS(attrUri, attrQName, attrValue); + } else { + domElement.setAttribute(removeNamespaces ? 
attrLocalName : attrQName, attrValue); + } + } + + // Add to parent or root + if (domElementStack.isEmpty()) { + domDocument.appendChild(domElement); + } else { + domElementStack.peek().appendChild(domElement); + } + + domElementStack.push(domElement); + } + } + + @Override + public void characters(char[] ch, int start, int length) { + // Add to structured output text accumulator + if (elementStack.isEmpty() == false) { + elementStack.peek().textContent().append(ch, start, length); + } + + // Add to DOM text node if needed + if (buildDom && domElementStack.isEmpty() == false) { + String text = new String(ch, start, length); + if (removeEmptyValues == false || isEmptyValue(text) == false) { + Text textNode = domDocument.createTextNode(text); + domElementStack.peek().appendChild(textNode); + } + } + } + + @Override + public void endElement(String uri, String localName, String qName) { + // Complete structured output element processing + if (elementStack.isEmpty()) { + return; + } + + ElementParsingState currentState = elementStack.pop(); + Map element = currentState.element(); + String elementName = currentState.elementName(); + StringBuilder textContent = currentState.textContent(); + Map> repeatedElements = currentState.repeatedElements(); + + // Add repeated elements as arrays + for (Map.Entry> entry : repeatedElements.entrySet()) { + List values = entry.getValue(); + if (removeEmptyValues == false || isEmptyValue(values) == false) { + element.put(entry.getKey(), values); + } + } + + // Process text content and determine final element structure + final boolean hasText; + final String trimmedText; + { + String textContentString = textContent.toString(); + hasText = textContentString.isBlank() == false; + trimmedText = hasText ? 
textContentString.trim() : null; // only trim the text if it's not blank + } + final boolean hasChildren = element.isEmpty() == false; + + Object elementValue; + if (hasChildren) { + if (hasText) { + // Both text and children/attributes + if (removeEmptyValues == false || isEmptyValue(trimmedText) == false) { + element.put("#text", trimmedText); + } + elementValue = (forceArray && forceContent) ? applyForceArray(elementName, element) : element; + } else { + // Only child elements/attributes + elementValue = (forceArray && forceContent) ? applyForceArray(elementName, element) : element; + } + } else { + if (hasText) { + // Only text content + if (forceContent) { + Map contentMap = new HashMap<>(); + if (removeEmptyValues == false || isEmptyValue(trimmedText) == false) { + contentMap.put("#text", trimmedText); + } + elementValue = contentMap; + } else { + if (removeEmptyValues == false || isEmptyValue(trimmedText) == false) { + elementValue = trimmedText; + } else { + elementValue = null; + } + } + elementValue = applyForceArray(elementName, elementValue); + } else { + // Empty element + if (removeEmptyValues == false) { + elementValue = applyForceArray(elementName, null); + } else { + elementValue = null; + } + } + } + + // If this is the root element, store the result + if (elementStack.isEmpty()) { + if (elementValue != null) { + Map result = new HashMap<>(); + result.put(elementName, elementValue); + rootResult = result; + } + } else { + // Add to parent element + if (elementValue != null) { + ElementParsingState parentState = elementStack.peek(); + Map parentElement = parentState.element(); + Map> parentRepeatedElements = parentState.repeatedElements(); + + if (parentElement.containsKey(elementName) || parentRepeatedElements.containsKey(elementName)) { + // Handle repeated elements + if (parentRepeatedElements.containsKey(elementName) == false) { + List list = new ArrayList<>(); + list.add(parentElement.get(elementName)); + 
parentRepeatedElements.put(elementName, list); + parentElement.remove(elementName); + } + parentRepeatedElements.get(elementName).add(elementValue); + } else { + // Apply force_array logic for single elements + Object finalContent = applyForceArray(elementName, elementValue); + parentElement.put(elementName, finalContent); + } + } + } + + // Complete DOM element if building DOM + if (buildDom && domElementStack.isEmpty() == false) { + domElementStack.pop(); + } + } + + @Override + public void endDocument() { + // Document parsing complete + } + + public Object getStructuredResult() { + return rootResult; + } + + public Document getDomDocument() { + return domDocument; + } + + private String getElementName(String ignoredUri, String localName, String qName) { + String elementName; + if (removeNamespaces) { + elementName = localName != null && localName.isEmpty() == false ? localName : qName; + } else { + elementName = qName; + } + + // Apply toLower if enabled + if (toLower) { + elementName = elementName.toLowerCase(Locale.ROOT); + } + + return elementName; + } + + private String getAttributeName(String ignoredUri, String localName, String qName) { + String attrName; + if (removeNamespaces) { + attrName = localName != null && localName.isEmpty() == false ? localName : qName; + } else { + attrName = qName; + } + + // Apply toLower if enabled + if (toLower) { + attrName = attrName.toLowerCase(Locale.ROOT); + } + + return attrName; + } + } + + /** + * Creates a secure, pre-configured SAX parser factory for XML parsing using XmlUtils. 
+ */ + private static SAXParserFactory createSecureSaxParserFactory() { + try { + SAXParserFactory factory = XmlUtils.getHardenedSaxParserFactory(); + factory.setValidating(false); + factory.setNamespaceAware(false); + return factory; + } catch (Exception e) { + logger.warn("Cannot configure secure XML parsing features - XML processor may not work correctly", e); + return null; + } + } + + /** + * Creates a secure, pre-configured namespace-aware SAX parser factory for XML parsing using XmlUtils. + */ + private static SAXParserFactory createSecureSaxParserFactoryNamespaceAware() { + try { + SAXParserFactory factory = XmlUtils.getHardenedSaxParserFactory(); + factory.setValidating(false); + factory.setNamespaceAware(true); + return factory; + } catch (Exception e) { + logger.warn("Cannot configure secure namespace-aware XML parsing features - XML processor may not work correctly", e); + return null; + } + } + + /** + * Creates a secure, pre-configured DocumentBuilderFactory for DOM creation using XmlUtils. + * Since we only use this factory to create empty DOM documents programmatically + * (not to parse XML), we use the hardened builder factory. + * The SAX parser handles all XML parsing with appropriate security measures. + */ + private static DocumentBuilderFactory createSecureDocumentBuilderFactory() { + try { + DocumentBuilderFactory factory = XmlUtils.getHardenedBuilderFactory(); + factory.setValidating(false); // Override validation for DOM creation + return factory; + } catch (Exception e) { + logger.warn("Cannot configure secure DOM builder factory - XML processor may not work correctly", e); + return null; + } + } + + /** + * Creates a secure, pre-configured XPath object expression evaluation using XmlUtils. 
+ */ + private static XPath createSecureXPath() { + try { + return XmlUtils.getHardenedXPath(); + } catch (Exception e) { + logger.warn("Cannot configure secure XPath object - XML processor may not work correctly", e); + return null; + } + } + + /** + * Selects the appropriate pre-configured SAX parser factory based on processor configuration. + *

<p> + * Factory selection matrix:
<ul>
+ *
<li>Regular parsing, no namespaces: SAX_PARSER_FACTORY</li> + *
<li>Regular parsing, with namespaces: SAX_PARSER_FACTORY_NS</li> + * </ul>
+ * + * @return the appropriate SAX parser factory for the current configuration + * @throws UnsupportedOperationException if the required XML factory is not available + */ + private static SAXParserFactory selectSaxParserFactory(final boolean needsNamespaceAware) { + SAXParserFactory factory = needsNamespaceAware ? XmlFactories.SAX_PARSER_FACTORY_NS : XmlFactories.SAX_PARSER_FACTORY; + if (factory == null) { + throw new UnsupportedOperationException( + "XML parsing" + + (needsNamespaceAware ? " with namespace-aware features " : " ") + + "is not supported by the current JDK. Please update your JDK to one that " + + "supports these XML features." + ); + } + return factory; + } +} diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java new file mode 100644 index 0000000000000..cf75772f0fccf --- /dev/null +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java @@ -0,0 +1,300 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.ingest.common; + +import org.elasticsearch.ElasticsearchParseException; +import org.elasticsearch.test.ESTestCase; + +import java.util.HashMap; +import java.util.Map; + +import static org.hamcrest.Matchers.anEmptyMap; +import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.equalTo; + +public class XmlProcessorFactoryTests extends ESTestCase { + + private static final String DEFAULT_FIELD = "field1"; + private static final String DEFAULT_TARGET_FIELD = "target"; + + /** + * Creates a new XmlProcessor.Factory instance for testing. + */ + private XmlProcessor.Factory createFactory() { + return new XmlProcessor.Factory(); + } + + /** + * Creates a processor with the default factory and given configuration. + * This validates that all configuration parameters are consumed during processor creation. + */ + private XmlProcessor createProcessor(Map config) throws Exception { + XmlProcessor.Factory factory = createFactory(); + String processorTag = randomAlphaOfLength(10); + + // Make a copy of the config to avoid modifying the original + Map configCopy = new HashMap<>(config); + + // Create the processor (this should consume config parameters) + XmlProcessor processor = factory.create(null, processorTag, null, configCopy, null); + + // Validate that all configuration parameters were consumed + assertThat(configCopy, anEmptyMap()); + + return processor; + } + + /** + * Helper to expect processor creation failure with specific message. 
+ */ + private void expectCreationFailure(Map config, Class exceptionClass, String expectedMessage) { + XmlProcessor.Factory factory = createFactory(); + String processorTag = randomAlphaOfLength(10); + + // Make a mutable copy since Map.of creates immutable maps + Map configCopy = new HashMap<>(config); + + Exception exception = expectThrows(exceptionClass, () -> factory.create(null, processorTag, null, configCopy, null)); + assertThat(exception.getMessage(), equalTo(expectedMessage)); + } + + /** + * Tests processor creation with various configurations. + */ + public void testCreate() throws Exception { + Map config = Map.of( + "field", + DEFAULT_FIELD, + "target_field", + DEFAULT_TARGET_FIELD, + "ignore_missing", + true, + "to_lower", + true, + "remove_empty_values", + true + ); + + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.getTargetField(), equalTo(DEFAULT_TARGET_FIELD)); + assertThat(processor.isIgnoreMissing(), equalTo(true)); + assertThat(processor.isRemoveEmptyValues(), equalTo(true)); + } + + public void testCreateWithDefaults() throws Exception { + Map config = Map.of("field", DEFAULT_FIELD); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.getTargetField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.isIgnoreMissing(), equalTo(false)); + assertThat(processor.isRemoveEmptyValues(), equalTo(false)); + } + + public void testCreateMissingField() throws Exception { + Map config = Map.of(); // Empty config - no field specified + expectCreationFailure(config, ElasticsearchParseException.class, "[field] required property is missing"); + } + + public void testCreateWithRemoveEmptyValuesOnly() throws Exception { + Map config = Map.of("field", DEFAULT_FIELD, "remove_empty_values", true); + + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), 
equalTo(DEFAULT_FIELD)); + assertThat(processor.isRemoveEmptyValues(), equalTo(true)); + assertThat(processor.isIgnoreMissing(), equalTo(false)); // other flags should remain default + } + + public void testCreateWithXPath() throws Exception { + Map xpathConfig = Map.of("//author/text()", "author_field", "//title/@lang", "language_field"); + Map config = Map.of("field", DEFAULT_FIELD, "xpath", xpathConfig); + + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + } + + public void testCreateWithInvalidXPathConfig() throws Exception { + Map config = Map.of( + "field", + DEFAULT_FIELD, + "xpath", + "invalid_string" // Should be a map + ); + + expectCreationFailure(config, ElasticsearchParseException.class, "[xpath] property isn't a map, but of type [java.lang.String]"); + } + + public void testCreateWithInvalidXPathTargetField() throws Exception { + Map xpathConfig = new HashMap<>(); + xpathConfig.put("//author/text()", 123); // Should be string + + Map config = Map.of("field", DEFAULT_FIELD, "xpath", xpathConfig); + + expectCreationFailure( + config, + IllegalArgumentException.class, + "XPath [//author/text()] target field must be a string, got [Integer]" + ); + } + + public void testCreateWithNamespaces() throws Exception { + Map namespacesConfig = Map.of("book", "http://example.com/book", "author", "http://example.com/author"); + Map config = Map.of("field", DEFAULT_FIELD, "namespaces", namespacesConfig); + + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.getNamespaces(), equalTo(namespacesConfig)); + } + + public void testCreateWithInvalidNamespacesConfig() throws Exception { + Map config = Map.of( + "field", + DEFAULT_FIELD, + "namespaces", + "invalid_string" // Should be a map + ); + + expectCreationFailure( + + config, + + ElasticsearchParseException.class, + + "[namespaces] property isn't a map, but of type 
[java.lang.String]" + + ); + } + + public void testCreateWithInvalidNamespaceURI() throws Exception { + Map namespacesConfig = new HashMap<>(); + namespacesConfig.put("book", 123); // Should be string + + Map config = Map.of("field", DEFAULT_FIELD, "namespaces", namespacesConfig); + + expectCreationFailure(config, IllegalArgumentException.class, "Namespace prefix [book] must have a string URI, got [Integer]"); + } + + public void testCreateWithXPathAndNamespaces() throws Exception { + Map xpathConfig = Map.of("//book:author/text()", "author_field", "//book:title/@lang", "language_field"); + Map namespacesConfig = Map.of("book", "http://example.com/book"); + Map config = Map.of("field", DEFAULT_FIELD, "xpath", xpathConfig, "namespaces", namespacesConfig); + + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.getNamespaces(), equalTo(namespacesConfig)); + } + + // Tests for individual boolean options + + public void testCreateWithStoreXmlFalse() throws Exception { + Map config = Map.of("field", DEFAULT_FIELD, "store_xml", false); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.isStoreXml(), equalTo(false)); + } + + public void testCreateWithRemoveNamespaces() throws Exception { + Map config = Map.of("field", DEFAULT_FIELD, "remove_namespaces", true); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.isRemoveNamespaces(), equalTo(true)); + } + + public void testCreateWithForceContent() throws Exception { + Map config = Map.of("field", DEFAULT_FIELD, "force_content", true); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.isForceContent(), equalTo(true)); + } + + public void testCreateWithForceArray() throws Exception { + Map 
config = Map.of("field", DEFAULT_FIELD, "force_array", true); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.isForceArray(), equalTo(true)); + } + + public void testCreateWithMultipleOptions() throws Exception { + Map config = Map.of( + "field", + DEFAULT_FIELD, + "ignore_missing", + true, + "force_content", + true, + "force_array", + true, + "remove_namespaces", + true + ); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.isIgnoreMissing(), equalTo(true)); + assertThat(processor.isForceContent(), equalTo(true)); + assertThat(processor.isForceArray(), equalTo(true)); + assertThat(processor.isRemoveNamespaces(), equalTo(true)); + } + + // Tests for XPath compilation errors (testing precompilation feature) + + public void testCreateWithInvalidXPathExpression() throws Exception { + Map xpathConfig = Map.of("invalid xpath ][", "target_field"); + Map config = Map.of("field", DEFAULT_FIELD, "xpath", xpathConfig); + + XmlProcessor.Factory factory = createFactory(); + String processorTag = randomAlphaOfLength(10); + + // Make a mutable copy since Map.of creates immutable maps + Map configCopy = new HashMap<>(config); + + IllegalArgumentException exception = expectThrows( + IllegalArgumentException.class, + () -> factory.create(null, processorTag, null, configCopy, null) + ); + + // Check that the error message contains the XPath expression and indicates it's invalid + assertThat(exception.getMessage(), containsString("Invalid XPath expression [invalid xpath ][]")); + assertThat(exception.getCause().getMessage(), containsString("javax.xml.transform.TransformerException")); + } + + public void testCreateWithXPathUsingNamespacesWithoutConfiguration() throws Exception { + Map xpathConfig = Map.of("//book:title/text()", "title_field"); + Map config = Map.of("field", DEFAULT_FIELD, "xpath", xpathConfig); 
+ + expectCreationFailure( + config, + IllegalArgumentException.class, + "Invalid XPath expression [//book:title/text()]: contains namespace prefixes but no namespace configuration provided" + ); + } + + public void testConfigurationParametersAreProperlyRemoved() throws Exception { + // Test that demonstrates configuration validation works when using production-like validation + // This test verifies that all valid configuration parameters are consumed during processor creation + + Map xpathConfig = Map.of("//test", "test_field"); + Map config = Map.of("field", DEFAULT_FIELD, "xpath", xpathConfig); + + // This should succeed as all parameters are valid + XmlProcessor processor = createProcessor(config); + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + } +} diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java new file mode 100644 index 0000000000000..f5e7e5d4e30ee --- /dev/null +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java @@ -0,0 +1,578 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.ingest.common; + +import org.elasticsearch.ingest.IngestDocument; +import org.elasticsearch.test.ESTestCase; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.is; + +/** + * Tests for {@link XmlProcessor}. These tests ensure feature parity and test coverage. + */ +public class XmlProcessorTests extends ESTestCase { + + private static final String XML_FIELD = "xmldata"; + private static final String TARGET_FIELD = "data"; + + private static IngestDocument createTestIngestDocument(String xml) { + return new IngestDocument("_index", "_id", 1, null, null, new HashMap<>(Map.of(XML_FIELD, xml))); + } + + private static XmlProcessor createTestProcessor(Map config) throws Exception { + config.putIfAbsent("field", XML_FIELD); + config.putIfAbsent("target_field", TARGET_FIELD); + + XmlProcessor.Factory factory = new XmlProcessor.Factory(); + return factory.create(null, "_tag", null, config, null); + } + + /** + * Test parsing standard XML structure. + */ + public void testParseStandardXml() throws Exception { + String xml = ""; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + + Map expectedData = Map.of("foo", Map.of("key", "value")); + assertThat(data, equalTo(expectedData)); + } + + /** + * Test parsing XML with array elements (multiple elements with same name). 
+ */ + public void testParseXmlWithArrayValue() throws Exception { + String xml = """ + + value1 + value2 + """; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + + Map expectedData = Map.of("foo", Map.of("key", List.of("value1", "value2"))); + assertThat(data, equalTo(expectedData)); + } + + /** + * Test parsing XML with nested elements. + */ + public void testParseXmlWithNestedElements() throws Exception { + String xml = """ + + + value + + """; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + + Map expectedData = Map.of("foo", Map.of("key1", Map.of("key2", "value"))); + assertThat(data, equalTo(expectedData)); + } + + /** + * Test parsing XML in a single item array. + */ + public void testParseXmlInSingleItemArray() throws Exception { + String xml = ""; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + + Map expectedData = Map.of("foo", Map.of("bar", "baz")); + assertThat(data, equalTo(expectedData)); + } + + /** + * Test extracting a single element using XPath. 
+ */ + public void testXPathSingleElementExtraction() throws Exception { + String xml = """ + + hello + world + """; + + Map xpathMap = Map.of("/foo/bar/text()", "bar_content"); + + Map config = new HashMap<>(); + config.put("xpath", xpathMap); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + // Get the XPath result + Object barContent = ingestDocument.getFieldValue("bar_content", Object.class); + assertThat(barContent, equalTo("hello")); + + // Verify that the full parsed XML is also available + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + + Map expectedData = Map.of("foo", Map.of("bar", "hello", "baz", "world")); + assertThat(data, equalTo(expectedData)); + } + + /** + * Test extracting multiple elements using XPath. + */ + public void testXPathMultipleElementsExtraction() throws Exception { + String xml = """ + + first + second + third + """; + + Map xpathMap = Map.of("/foo/bar", "all_bars"); + + Map config = new HashMap<>(); + config.put("xpath", xpathMap); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + List allBars = (List) ingestDocument.getFieldValue("all_bars", List.class); + List expectedBars = List.of("first", "second", "third"); + assertThat(allBars, equalTo(expectedBars)); + } + + /** + * Test extracting attributes using XPath. 
+ */ + public void testXPathAttributeExtraction() throws Exception { + String xml = """ + + content + """; + + Map xpathMap = new HashMap<>(); + xpathMap.put("/foo/bar/@id", "bar_id"); + xpathMap.put("/foo/bar/@type", "bar_type"); + + Map config = new HashMap<>(); + config.put("xpath", xpathMap); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + String barId = ingestDocument.getFieldValue("bar_id", String.class); + assertThat(barId, equalTo("123")); + + String barType = ingestDocument.getFieldValue("bar_type", String.class); + assertThat(barType, equalTo("test")); + } + + /** + * Test extracting elements with namespaces using XPath. + */ + public void testXPathNamespacedExtraction() throws Exception { + String xml = """ + + + namespace-value + regular-value + """; + + Map namespaces = Map.of("myns", "http://example.org/ns1"); + Map xpathMap = Map.of("//myns:element/text()", "ns_value"); + + Map config = new HashMap<>(); + config.put("xpath", xpathMap); + config.put("namespaces", namespaces); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + String nsValue = ingestDocument.getFieldValue("ns_value", String.class); + assertThat(nsValue, equalTo("namespace-value")); + } + + /** + * Test parsing XML with mixed content (text and elements mixed together). + */ + public void testParseXmlWithMixedContent() throws Exception { + String xml = """ + + This text is bold and this is italic! 
+ """; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + + Map expectedData = Map.of("foo", Map.of("b", "bold", "i", "italic", "#text", "This text is and this is !")); + assertThat(data, equalTo(expectedData)); + } + + /** + * Test parsing XML with CDATA sections. + */ + public void testParseXmlWithCDATA() throws Exception { + String xml = " that shouldn't be parsed!]]>"; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + + Map expectedData = Map.of("foo", "This is CDATA content with that shouldn't be parsed!"); + assertThat(data, equalTo(expectedData)); + } + + /** + * Test parsing XML with numeric data. + */ + public void testParseXmlWithNumericData() throws Exception { + String xml = """ + + 123 + 99.95 + true + """; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + + Map expectedData = Map.of("foo", Map.of("count", "123", "price", "99.95", "active", "true")); + assertThat(data, equalTo(expectedData)); + } + + /** + * Test parsing XML with force_array option enabled. 
+ */ + public void testParseXmlWithForceArray() throws Exception { + String xml = "single_value"; + + Map config = new HashMap<>(); + config.put("force_array", true); // Enable force_array option + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + + Map expectedData = Map.of("foo", Map.of("bar", List.of("single_value"))); + assertThat(data, equalTo(expectedData)); + } + + /** + * Test extracting multiple elements using multiple XPath expressions. + * Tests that multiple XPath expressions can be used simultaneously. + */ + public void testMultipleXPathExpressions() throws Exception { + String xml = """ + + + John + 30 + + + Jane + 25 + + """; + + // Configure multiple XPath expressions + Map xpathMap = new HashMap<>(); + xpathMap.put("/root/person[1]/n/text()", "first_person_name"); + xpathMap.put("/root/person[2]/n/text()", "second_person_name"); + xpathMap.put("/root/person/@id", "person_ids"); + + Map config = new HashMap<>(); + config.put("xpath", xpathMap); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + // Verify XPath results + Object firstName = ingestDocument.getFieldValue("first_person_name", Object.class); + assertThat(firstName, equalTo("John")); + + Object secondName = ingestDocument.getFieldValue("second_person_name", Object.class); + assertThat(secondName, equalTo("Jane")); + + List personIds = ingestDocument.getFieldValue("person_ids", List.class); + assertThat(personIds, equalTo(List.of("1", "2"))); + + // Verify that the target field was also created (since storeXml defaults to true) + assertThat(ingestDocument.hasField(TARGET_FIELD), equalTo(true)); + } + + /** + * Test handling of invalid XML with ignoreFailure=false. 
+ */ + public void testInvalidXml() throws Exception { + String xml = ""; // Invalid XML missing closing tag + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); + + assertThat(exception.getMessage(), containsString("invalid XML")); + } + + /** + * Test handling of invalid XML with ignoreFailure=true. + * Note: The ignore_failure parameter is handled by the framework's OnFailureProcessor wrapper. + * When calling the processor directly (as in tests), exceptions are still thrown. + * This test verifies that the processor itself properly reports XML parsing errors. + */ + public void testInvalidXmlWithIgnoreFailure() throws Exception { + String xml = ""; // Invalid XML missing closing tag + + Map config = new HashMap<>(); + config.put("ignore_failure", true); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + // Even with ignore_failure=true, calling the processor directly still throws exceptions + // The framework's OnFailureProcessor wrapper handles the ignore_failure behavior in production + IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); + + assertThat(exception.getMessage(), containsString("invalid XML")); + } + + /** + * Test the store_xml=false option to not store parsed XML in target field. 
+ */ + public void testNoStoreXml() throws Exception { + String xml = "value"; + + // Set up XPath to extract value but don't store XML + Map xpathMap = Map.of("/foo/bar/text()", "bar_content"); + + Map config = new HashMap<>(); + config.put("store_xml", false); // Do not store XML in target field + config.put("xpath", xpathMap); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + // Verify XPath result is stored + String barContent = ingestDocument.getFieldValue("bar_content", String.class); + assertThat(barContent, equalTo("value")); + + // Verify the target field was not created + assertThat(ingestDocument.hasField(TARGET_FIELD), is(false)); + } + + /** + * Test the to_lower option for converting field names to lowercase. + */ + public void testToLower() throws Exception { + String xml = "value"; + + Map config = new HashMap<>(); + config.put("to_lower", true); // Enable to_lower option + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + // Verify field names are lowercase + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + + Map expectedData = Map.of("foo", Map.of("bar", "value")); + assertThat(data, equalTo(expectedData)); + } + + /** + * Test the ignore_missing option when field is missing. 
+ */ + public void testIgnoreMissing() throws Exception { + String xmlField = "nonexistent_field"; + + Map config = new HashMap<>(); + config.put("field", xmlField); + config.put("ignore_missing", true); // Enable ignore_missing option + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = new IngestDocument("_index", "_id", 1, null, null, new HashMap<>(Map.of())); + processor.execute(ingestDocument); + + assertThat("Target field should not be created when source field is missing", ingestDocument.hasField(TARGET_FIELD), is(false)); + + // With ignoreMissing=false + config.put("ignore_missing", false); + XmlProcessor failingProcessor = createTestProcessor(config); + + // This should throw an exception + IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> failingProcessor.execute(ingestDocument)); + + assertThat(exception.getMessage(), containsString("not present as part of path")); + } + + /** + * Test that remove_empty_values correctly filters out empty values from arrays and mixed content. 
+ */ + public void testRemoveEmptyValues() throws Exception { + // XML with mixed empty and non-empty elements, including array elements with mixed empty/non-empty values + String xml = """ + + + + content + + + nested-content + + + first + + third + + fifth + + Text with and content + """; + + Map config = new HashMap<>(); + config.put("remove_empty_values", true); + XmlProcessor processor = createTestProcessor(config); + + IngestDocument ingestDocument = createTestIngestDocument(xml); + processor.execute(ingestDocument); + + Map result = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + + Map expectedData = Map.of( + "root", + Map.of( + "valid", + "content", + "nested", + Map.of("valid", "nested-content"), + "items", + Map.of("item", List.of("first", "third", "fifth")), + "mixed", + Map.of("valid", "content", "#text", "Text with and") + ) + ); + + assertThat(result, equalTo(expectedData)); + } + + /** + * Test parsing XML with remove_namespaces option. + */ + public void testRemoveNamespaces() throws Exception { + String xml = """ + + value + """; + + Map config = new HashMap<>(); + config.put("remove_namespaces", true); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + + Map expectedDataWithoutNs = Map.of("foo", Map.of("bar", "value")); + assertThat(data, equalTo(expectedDataWithoutNs)); + + // Now test with removeNamespaces=false + IngestDocument ingestDocument2 = createTestIngestDocument(xml); + + config.put("remove_namespaces", false); + XmlProcessor processor2 = createTestProcessor(config); + processor2.execute(ingestDocument2); + + Map data2 = ingestDocument2.getFieldValue(TARGET_FIELD, Map.class); + + Map expectedDataWithNs = Map.of("foo", Map.of("xmlns:ns", "http://example.org/ns", "ns:bar", "value")); + assertThat(data2, equalTo(expectedDataWithNs)); + } + + /** + * 
Test the force_content option. + */ + public void testForceContent() throws Exception { + String xml = "simple text"; + + Map config = new HashMap<>(); + config.put("force_content", true); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + + Map expectedDataWithForceContent = Map.of("foo", Map.of("#text", "simple text")); + assertThat(data, equalTo(expectedDataWithForceContent)); + + // Now test with forceContent=false + config.put("force_content", false); + XmlProcessor processor2 = createTestProcessor(config); + IngestDocument ingestDocument2 = createTestIngestDocument(xml); + + processor2.execute(ingestDocument2); + + Map data2 = ingestDocument2.getFieldValue(TARGET_FIELD, Map.class); + + Map expectedDataWithoutForceContent = Map.of("foo", "simple text"); + assertThat(data2, equalTo(expectedDataWithoutForceContent)); + } +} diff --git a/server/src/main/resources/transport/upper_bounds/8.18.csv b/server/src/main/resources/transport/upper_bounds/8.18.csv index 4eb5140004ea6..266bfbbd3bf78 100644 --- a/server/src/main/resources/transport/upper_bounds/8.18.csv +++ b/server/src/main/resources/transport/upper_bounds/8.18.csv @@ -1 +1 @@ -initial_elasticsearch_8_18_6,8840008 +transform_check_for_dangling_tasks,8840011 diff --git a/server/src/main/resources/transport/upper_bounds/8.19.csv b/server/src/main/resources/transport/upper_bounds/8.19.csv index 476468b203875..3600b3f8c633a 100644 --- a/server/src/main/resources/transport/upper_bounds/8.19.csv +++ b/server/src/main/resources/transport/upper_bounds/8.19.csv @@ -1 +1 @@ -initial_elasticsearch_8_19_3,8841067 +transform_check_for_dangling_tasks,8841070 diff --git a/server/src/main/resources/transport/upper_bounds/9.0.csv b/server/src/main/resources/transport/upper_bounds/9.0.csv index f8f50cc6d7839..c11e6837bb813 100644 --- 
a/server/src/main/resources/transport/upper_bounds/9.0.csv +++ b/server/src/main/resources/transport/upper_bounds/9.0.csv @@ -1 +1 @@ -initial_elasticsearch_9_0_6,9000015 +transform_check_for_dangling_tasks,9000018 diff --git a/server/src/main/resources/transport/upper_bounds/9.1.csv b/server/src/main/resources/transport/upper_bounds/9.1.csv index 5a65f2e578156..80b97d85f7511 100644 --- a/server/src/main/resources/transport/upper_bounds/9.1.csv +++ b/server/src/main/resources/transport/upper_bounds/9.1.csv @@ -1 +1 @@ -initial_elasticsearch_9_1_4,9112007 +transform_check_for_dangling_tasks,9112009 diff --git a/server/src/main/resources/transport/upper_bounds/9.2.csv b/server/src/main/resources/transport/upper_bounds/9.2.csv index e24f914a1d1ca..2147eab66c207 100644 --- a/server/src/main/resources/transport/upper_bounds/9.2.csv +++ b/server/src/main/resources/transport/upper_bounds/9.2.csv @@ -1 +1 @@ -ml_inference_endpoint_cache,9157000 +initial_9.2.0,9185000 diff --git a/server/src/main/resources/transport/upper_bounds/9.3.csv b/server/src/main/resources/transport/upper_bounds/9.3.csv new file mode 100644 index 0000000000000..2147eab66c207 --- /dev/null +++ b/server/src/main/resources/transport/upper_bounds/9.3.csv @@ -0,0 +1 @@ +initial_9.2.0,9185000