From 0df16f181481bb22862fdccb5299c4cfefc2ebf8 Mon Sep 17 00:00:00 2001 From: Sam Xiao Date: Tue, 24 Sep 2024 15:46:38 -0400 Subject: [PATCH 1/4] WIP --- .../ingest/processors/redact.asciidoc | 1 + .../xpack/redact/RedactProcessor.java | 33 ++++- .../redact/RedactProcessorFactoryTests.java | 1 + .../xpack/redact/RedactProcessorTests.java | 115 +++++++++++++++++- .../test/redact/10_redact_processor.yml | 23 +++- 5 files changed, 169 insertions(+), 4 deletions(-) diff --git a/docs/reference/ingest/processors/redact.asciidoc b/docs/reference/ingest/processors/redact.asciidoc index 6706106e92655..9b8ac1e15d1a8 100644 --- a/docs/reference/ingest/processors/redact.asciidoc +++ b/docs/reference/ingest/processors/redact.asciidoc @@ -39,6 +39,7 @@ patterns. Legacy Grok patterns are not supported. | `ignore_missing` | no | `true` | If `true` and `field` does not exist or is `null`, the processor quietly exits without modifying the document include::common-options.asciidoc[] | `skip_if_unlicensed` | no | `false` | If `true` and the current license does not support running redact processors, then the processor quietly exits without modifying the document +| `trace_redact` | no | `false` | If `true` then ingest metadata `_ingest._redact._is_redacted` is set to `true` if the document has been redacted |====== In this example the predefined `IP` Grok pattern is used to match diff --git a/x-pack/plugin/redact/src/main/java/org/elasticsearch/xpack/redact/RedactProcessor.java b/x-pack/plugin/redact/src/main/java/org/elasticsearch/xpack/redact/RedactProcessor.java index 04a423c7ea330..cefc490a38b7b 100644 --- a/x-pack/plugin/redact/src/main/java/org/elasticsearch/xpack/redact/RedactProcessor.java +++ b/x-pack/plugin/redact/src/main/java/org/elasticsearch/xpack/redact/RedactProcessor.java @@ -55,6 +55,12 @@ public class RedactProcessor extends AbstractProcessor { private static final String DEFAULT_REDACTED_START = "<"; private static final String DEFAULT_REDACTED_END = ">"; + protected static final String REDACT_KEY = "_redact"; + protected static final String IS_REDACTED_KEY = "_is_redacted"; + protected static final String METADATA_PATH_REDACT = IngestDocument.INGEST_KEY + "." + REDACT_KEY; + // indicates if document has been redacted + protected static final String METADATA_PATH_REDACT_IS_REDACTED = METADATA_PATH_REDACT + "." + IS_REDACTED_KEY; + private final String redactField; private final List groks; private final boolean ignoreMissing; @@ -65,6 +71,8 @@ public class RedactProcessor extends AbstractProcessor { private final XPackLicenseState licenseState; private final boolean skipIfUnlicensed; + private final boolean traceRedact; + RedactProcessor( String tag, String description, @@ -76,7 +84,8 @@ public class RedactProcessor extends AbstractProcessor { String redactedEndToken, MatcherWatchdog matcherWatchdog, XPackLicenseState licenseState, - boolean skipIfUnlicensed + boolean skipIfUnlicensed, + boolean traceRedact ) { super(tag, description); this.redactField = redactField; @@ -94,6 +103,7 @@ public class RedactProcessor extends AbstractProcessor { } this.licenseState = licenseState; this.skipIfUnlicensed = skipIfUnlicensed; + this.traceRedact = traceRedact; } @Override @@ -128,6 +138,8 @@ public IngestDocument execute(IngestDocument ingestDocument) { try { String redacted = matchRedact(fieldValue, groks, redactedStartToken, redactedEndToken); ingestDocument.setFieldValue(redactField, redacted); + updateMetadataIfNecessary(ingestDocument, fieldValue, redacted); + return ingestDocument; } catch (RuntimeException e) { // grok throws a RuntimeException when the watchdog interrupts the match @@ -203,6 +215,20 @@ private static void matchRepeat(Grok grok, byte[] utf8Bytes, RegionTrackingMatch } while (offset != length); } + private void updateMetadataIfNecessary(IngestDocument ingestDocument, String fieldValue, String redacted) { + if (traceRedact == false) return; + if (fieldValue == null) return; + + Boolean isRedactedMetadata = ingestDocument.getFieldValue(METADATA_PATH_REDACT_IS_REDACTED, Boolean.class, true); + boolean alreadyRedacted = Boolean.TRUE.equals(isRedactedMetadata); + boolean isRedacted = fieldValue.equals(redacted) == false; + + // document newly redacted + if (alreadyRedacted == false && isRedacted) { + ingestDocument.setFieldValue(METADATA_PATH_REDACT_IS_REDACTED, true); + } + } + /** * A Grok capture extractor which tracks matched regions * and the Grok pattern name for redaction later. @@ -389,6 +415,8 @@ public RedactProcessor create( String redactStart = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "prefix", DEFAULT_REDACTED_START); String redactEnd = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "suffix", DEFAULT_REDACTED_END); + boolean traceRedact = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "trace_redact", false); + if (matchPatterns == null || matchPatterns.isEmpty()) { throw newConfigurationException(TYPE, processorTag, "patterns", "List of patterns must not be empty"); } @@ -406,7 +434,8 @@ public RedactProcessor create( redactEnd, matcherWatchdog, licenseState, - skipIfUnlicensed + skipIfUnlicensed, + traceRedact ); } catch (Exception e) { throw newConfigurationException( diff --git a/x-pack/plugin/redact/src/test/java/org/elasticsearch/xpack/redact/RedactProcessorFactoryTests.java b/x-pack/plugin/redact/src/test/java/org/elasticsearch/xpack/redact/RedactProcessorFactoryTests.java index 376e7caa8137d..affcc72614aa8 100644 --- a/x-pack/plugin/redact/src/test/java/org/elasticsearch/xpack/redact/RedactProcessorFactoryTests.java +++ b/x-pack/plugin/redact/src/test/java/org/elasticsearch/xpack/redact/RedactProcessorFactoryTests.java @@ -68,6 +68,7 @@ public void testConfigKeysRemoved() throws Exception { config.put("patterns", List.of("%{MY_PATTERN:name}!")); config.put("pattern_definitions", Map.of("MY_PATTERN", "foo")); config.put("ignore_missing", true); + config.put("trace_redact", true); config.put("extra", "unused"); factory.create(null, null, null, config); diff --git a/x-pack/plugin/redact/src/test/java/org/elasticsearch/xpack/redact/RedactProcessorTests.java b/x-pack/plugin/redact/src/test/java/org/elasticsearch/xpack/redact/RedactProcessorTests.java index a775adb7a4c15..3f44957201ef0 100644 --- a/x-pack/plugin/redact/src/test/java/org/elasticsearch/xpack/redact/RedactProcessorTests.java +++ b/x-pack/plugin/redact/src/test/java/org/elasticsearch/xpack/redact/RedactProcessorTests.java @@ -259,7 +259,8 @@ public void testLicenseChecks() throws Exception { ">", MatcherWatchdog.noop(), notAllowed, - false // set skip_if_unlicensed to false, we do not want to skip, we do want to fail + false, // set skip_if_unlicensed to false, we do not want to skip, we do want to fail + false ); assertThat(processor.getSkipIfUnlicensed(), equalTo(false)); var ingestDoc = createIngestDoc(Map.of("not_the_field", "fieldValue")); @@ -314,6 +315,118 @@ public void testLicenseChanges() throws Exception { } } + @SuppressWarnings("unchecked") + public void testTraceRedact() throws Exception { + var config = new HashMap(); + config.put("field", "to_redact"); + config.put("patterns", List.of("%{EMAILADDRESS:REDACTED}")); + config.put("trace_redact", true); + { + var processor = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create( + null, + "t", + "d", + new HashMap<>(config) + ); + var message = "this should not be redacted"; + var ingestDoc = createIngestDoc(Map.of("to_redact", message)); + var redactedDoc = processor.execute(ingestDoc); + + assertEquals(message, redactedDoc.getFieldValue("to_redact", String.class)); + assertNull(redactedDoc.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class, true)); + } + { + var processor = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create( + null, + "t", + "d", + new HashMap<>(config) + ); + var ingestDoc = createIngestDoc(Map.of("to_redact", "thisisanemail@address.com will be redacted")); + var redactedDoc = processor.execute(ingestDoc); + + assertEquals(" will be redacted", redactedDoc.getFieldValue("to_redact", String.class)); + // validate ingest metadata path correctly resolved + assertTrue(redactedDoc.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class)); + // validate ingest metadata structure correct + var ingestMeta = redactedDoc.getIngestMetadata(); + assertTrue(ingestMeta.containsKey(RedactProcessor.REDACT_KEY)); + var redactMetadata = (HashMap) ingestMeta.get(RedactProcessor.REDACT_KEY); + assertTrue(redactMetadata.containsKey(RedactProcessor.IS_REDACTED_KEY)); + assertTrue((Boolean) redactMetadata.get(RedactProcessor.IS_REDACTED_KEY)); + } + { + var configNoTrace = new HashMap(); + configNoTrace.put("field", "to_redact"); + configNoTrace.put("patterns", List.of("%{EMAILADDRESS:REDACTED}")); + + var processor = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create(null, "t", "d", configNoTrace); + var ingestDoc = createIngestDoc(Map.of("to_redact", "thisisanemail@address.com will be redacted")); + var redactedDoc = processor.execute(ingestDoc); + + assertEquals(" will be redacted", redactedDoc.getFieldValue("to_redact", String.class)); + assertNull(redactedDoc.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class, true)); + } + } + + public void testTraceRedactMultipleProcessors() throws Exception { + var configRedact = new HashMap(); + configRedact.put("field", "to_redact"); + configRedact.put("patterns", List.of("%{EMAILADDRESS:REDACTED}")); + configRedact.put("trace_redact", true); + + var configNoRedact = new HashMap(); + configNoRedact.put("field", "to_redact"); + configNoRedact.put("patterns", List.of("%{IP:REDACTED}")); // not in the doc + configNoRedact.put("trace_redact", true); + + // first processor does not redact doc, second one does + { + var processorRedact = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create( + null, + "t1", + "d", + new HashMap<>(configRedact) + ); + var processorNoRedact = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create( + null, + "t2", + "d", + new HashMap<>(configNoRedact) + ); + var ingestDocWithEmail = createIngestDoc(Map.of("to_redact", "thisisanemail@address.com will be redacted")); + + var docNotRedacted = processorNoRedact.execute(ingestDocWithEmail); + assertNull(docNotRedacted.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class, true)); + + var docRedacted = processorRedact.execute(docNotRedacted); + assertTrue(docRedacted.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class)); + } + // first processor redacts doc, second one does not + { + var processorRedact = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create( + null, + "t1", + "d", + new HashMap<>(configRedact) + ); + var processorNoRedact = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create( + null, + "t2", + "d", + new HashMap<>(configNoRedact) + ); + var ingestDocWithEmail = createIngestDoc(Map.of("to_redact", "thisisanemail@address.com will be redacted")); + + var docRedacted = processorRedact.execute(ingestDocWithEmail); + assertTrue(docRedacted.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class)); + + // validate does not override already redacted doc metadata + var docRedactedAlready = processorNoRedact.execute(docRedacted); + assertTrue(docRedactedAlready.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class)); + } + } + public void testMergeLongestRegion() { var r = List.of( new RedactProcessor.RegionTrackingMatchExtractor.Replacement(10, 20, "first"), diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/redact/10_redact_processor.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/redact/10_redact_processor.yml index 559d87879faad..62ec95cf0d610 100644 --- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/redact/10_redact_processor.yml +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/redact/10_redact_processor.yml @@ -24,7 +24,7 @@ index: test id: "1" pipeline: "pipeline-using-a-redact-processor" - body: {to_redact: "0.0.0.1 is my secret IP to redact"} + body: { to_redact: "0.0.0.1 is my secret IP to redact" } - do: get: @@ -96,3 +96,24 @@ } - length: { docs: 1 } - match: { docs.0.doc._source.to_redact: "==*EMAIL*== will be redacted" } +--- +"Test redact with trace_redact": + - do: + ingest.simulate: + body: > + { + "pipeline": { + "processors": [ + { + "redact": { + "field": "to_redact", + "patterns": ["%{EMAILADDRESS:EMAIL}", "%{IP:IP_ADDRESS}"], + "trace_redact": true + } + } + ] + }, + "docs": [{"_source": {"to_redact": "this-email@address.com will be redacted"}}] + } + - length: { docs: 1 } + - match: { docs.0.doc._ingest._redact._is_redacted: true } From f1f0b3cafcbf246b239af442ccffca1849e4e133 Mon Sep 17 00:00:00 2001 From: Sam Xiao Date: Wed, 25 Sep 2024 12:52:21 -0400 Subject: [PATCH 2/4] update yaml --- .../resources/rest-api-spec/test/redact/10_redact_processor.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/redact/10_redact_processor.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/redact/10_redact_processor.yml index 62ec95cf0d610..e864d191a3ec1 100644 --- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/redact/10_redact_processor.yml +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/redact/10_redact_processor.yml @@ -116,4 +116,5 @@ "docs": [{"_source": {"to_redact": "this-email@address.com will be redacted"}}] } - length: { docs: 1 } + - match: { docs.0.doc._source.to_redact: " will be redacted" } - match: { docs.0.doc._ingest._redact._is_redacted: true } From 75184ba1eb1231f5abad6717bc0b80bb6ce02f39 Mon Sep 17 00:00:00 2001 From: Sam Xiao Date: Wed, 25 Sep 2024 13:05:13 -0400 Subject: [PATCH 3/4] Update docs/changelog/113552.yaml --- docs/changelog/113552.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/changelog/113552.yaml diff --git a/docs/changelog/113552.yaml b/docs/changelog/113552.yaml new file mode 100644 index 0000000000000..48f7da309e82e --- /dev/null +++ b/docs/changelog/113552.yaml @@ -0,0 +1,5 @@ +pr: 113552 +summary: Tag redacted document in ingest metadata +area: Ingest Node +type: enhancement +issues: [] From 6eff5b3a941ed45adf12c92718610d1147da0785 Mon Sep 17 00:00:00 2001 From: Sam Xiao Date: Thu, 26 Sep 2024 17:22:35 -0400 Subject: [PATCH 4/4] Address comment --- .../org/elasticsearch/xpack/redact/RedactProcessor.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/x-pack/plugin/redact/src/main/java/org/elasticsearch/xpack/redact/RedactProcessor.java b/x-pack/plugin/redact/src/main/java/org/elasticsearch/xpack/redact/RedactProcessor.java index cefc490a38b7b..187126fb31e3e 100644 --- a/x-pack/plugin/redact/src/main/java/org/elasticsearch/xpack/redact/RedactProcessor.java +++ b/x-pack/plugin/redact/src/main/java/org/elasticsearch/xpack/redact/RedactProcessor.java @@ -58,7 +58,7 @@ public class RedactProcessor extends AbstractProcessor { protected static final String REDACT_KEY = "_redact"; protected static final String IS_REDACTED_KEY = "_is_redacted"; protected static final String METADATA_PATH_REDACT = IngestDocument.INGEST_KEY + "." + REDACT_KEY; - // indicates if document has been redacted + // indicates if document has been redacted, path: _ingest._redact._is_redacted protected static final String METADATA_PATH_REDACT_IS_REDACTED = METADATA_PATH_REDACT + "." + IS_REDACTED_KEY; private final String redactField; @@ -216,8 +216,9 @@ private static void matchRepeat(Grok grok, byte[] utf8Bytes, RegionTrackingMatch } private void updateMetadataIfNecessary(IngestDocument ingestDocument, String fieldValue, String redacted) { - if (traceRedact == false) return; - if (fieldValue == null) return; + if (traceRedact == false || fieldValue == null) { + return; + } Boolean isRedactedMetadata = ingestDocument.getFieldValue(METADATA_PATH_REDACT_IS_REDACTED, Boolean.class, true); boolean alreadyRedacted = Boolean.TRUE.equals(isRedactedMetadata);