-
Notifications
You must be signed in to change notification settings - Fork 25.5k
Tag redacted document in ingest pipeline #113552
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0df16f1
f1f0b3c
75184ba
b2a2d10
6eff5b3
5fab3a9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
pr: 113552 | ||
summary: Tag redacted document in ingest metadata | ||
area: Ingest Node | ||
type: enhancement | ||
issues: [] |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -55,6 +55,12 @@ public class RedactProcessor extends AbstractProcessor { | |
private static final String DEFAULT_REDACTED_START = "<"; | ||
private static final String DEFAULT_REDACTED_END = ">"; | ||
|
||
protected static final String REDACT_KEY = "_redact"; | ||
protected static final String IS_REDACTED_KEY = "_is_redacted"; | ||
protected static final String METADATA_PATH_REDACT = IngestDocument.INGEST_KEY + "." + REDACT_KEY; | ||
// indicates if document has been redacted, path: _ingest._redact._is_redacted | ||
protected static final String METADATA_PATH_REDACT_IS_REDACTED = METADATA_PATH_REDACT + "." + IS_REDACTED_KEY; | ||
|
||
private final String redactField; | ||
private final List<Grok> groks; | ||
private final boolean ignoreMissing; | ||
|
@@ -65,6 +71,8 @@ public class RedactProcessor extends AbstractProcessor { | |
private final XPackLicenseState licenseState; | ||
private final boolean skipIfUnlicensed; | ||
|
||
private final boolean traceRedact; | ||
|
||
RedactProcessor( | ||
String tag, | ||
String description, | ||
|
@@ -76,7 +84,8 @@ public class RedactProcessor extends AbstractProcessor { | |
String redactedEndToken, | ||
MatcherWatchdog matcherWatchdog, | ||
XPackLicenseState licenseState, | ||
boolean skipIfUnlicensed | ||
boolean skipIfUnlicensed, | ||
boolean traceRedact | ||
) { | ||
super(tag, description); | ||
this.redactField = redactField; | ||
|
@@ -94,6 +103,7 @@ public class RedactProcessor extends AbstractProcessor { | |
} | ||
this.licenseState = licenseState; | ||
this.skipIfUnlicensed = skipIfUnlicensed; | ||
this.traceRedact = traceRedact; | ||
} | ||
|
||
@Override | ||
|
@@ -128,6 +138,8 @@ public IngestDocument execute(IngestDocument ingestDocument) { | |
try { | ||
String redacted = matchRedact(fieldValue, groks, redactedStartToken, redactedEndToken); | ||
ingestDocument.setFieldValue(redactField, redacted); | ||
updateMetadataIfNecessary(ingestDocument, fieldValue, redacted); | ||
|
||
return ingestDocument; | ||
} catch (RuntimeException e) { | ||
// grok throws a RuntimeException when the watchdog interrupts the match | ||
|
@@ -203,6 +215,21 @@ private static void matchRepeat(Grok grok, byte[] utf8Bytes, RegionTrackingMatch | |
} while (offset != length); | ||
} | ||
|
||
private void updateMetadataIfNecessary(IngestDocument ingestDocument, String fieldValue, String redacted) { | ||
if (traceRedact == false || fieldValue == null) { | ||
return; | ||
} | ||
|
||
Boolean isRedactedMetadata = ingestDocument.getFieldValue(METADATA_PATH_REDACT_IS_REDACTED, Boolean.class, true); | ||
boolean alreadyRedacted = Boolean.TRUE.equals(isRedactedMetadata); | ||
boolean isRedacted = fieldValue.equals(redacted) == false; | ||
|
||
// document newly redacted | ||
if (alreadyRedacted == false && isRedacted) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does it matter if it's already redacted? We only skip setting this if the metadata field is true already. It doesn't look like it can ever be false. Couldn't we just overwrite the field regardless of its existing value? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is possible that a redact processor does not actually modify the document (the pattern is not found), in that case I think we should not set In the case of multiple redact processors, if the first processor redacts the doc and sets |
||
ingestDocument.setFieldValue(METADATA_PATH_REDACT_IS_REDACTED, true); | ||
} | ||
} | ||
|
||
/** | ||
* A Grok capture extractor which tracks matched regions | ||
* and the Grok pattern name for redaction later. | ||
|
@@ -389,6 +416,8 @@ public RedactProcessor create( | |
String redactStart = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "prefix", DEFAULT_REDACTED_START); | ||
String redactEnd = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "suffix", DEFAULT_REDACTED_END); | ||
|
||
boolean traceRedact = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "trace_redact", false); | ||
|
||
if (matchPatterns == null || matchPatterns.isEmpty()) { | ||
throw newConfigurationException(TYPE, processorTag, "patterns", "List of patterns must not be empty"); | ||
} | ||
|
@@ -406,7 +435,8 @@ public RedactProcessor create( | |
redactEnd, | ||
matcherWatchdog, | ||
licenseState, | ||
skipIfUnlicensed | ||
skipIfUnlicensed, | ||
traceRedact | ||
); | ||
} catch (Exception e) { | ||
throw newConfigurationException( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -259,7 +259,8 @@ public void testLicenseChecks() throws Exception { | |
">", | ||
MatcherWatchdog.noop(), | ||
notAllowed, | ||
false // set skip_if_unlicensed to false, we do not want to skip, we do want to fail | ||
false, // set skip_if_unlicensed to false, we do not want to skip, we do want to fail | ||
false | ||
); | ||
assertThat(processor.getSkipIfUnlicensed(), equalTo(false)); | ||
var ingestDoc = createIngestDoc(Map.of("not_the_field", "fieldValue")); | ||
|
@@ -314,6 +315,118 @@ public void testLicenseChanges() throws Exception { | |
} | ||
} | ||
|
||
@SuppressWarnings("unchecked") | ||
public void testTraceRedact() throws Exception { | ||
var config = new HashMap<String, Object>(); | ||
config.put("field", "to_redact"); | ||
config.put("patterns", List.of("%{EMAILADDRESS:REDACTED}")); | ||
config.put("trace_redact", true); | ||
{ | ||
var processor = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create( | ||
null, | ||
"t", | ||
"d", | ||
new HashMap<>(config) | ||
); | ||
var message = "this should not be redacted"; | ||
var ingestDoc = createIngestDoc(Map.of("to_redact", message)); | ||
var redactedDoc = processor.execute(ingestDoc); | ||
|
||
assertEquals(message, redactedDoc.getFieldValue("to_redact", String.class)); | ||
assertNull(redactedDoc.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class, true)); | ||
} | ||
{ | ||
var processor = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create( | ||
null, | ||
"t", | ||
"d", | ||
new HashMap<>(config) | ||
); | ||
var ingestDoc = createIngestDoc(Map.of("to_redact", "[email protected] will be redacted")); | ||
var redactedDoc = processor.execute(ingestDoc); | ||
|
||
assertEquals("<REDACTED> will be redacted", redactedDoc.getFieldValue("to_redact", String.class)); | ||
// validate ingest metadata path correctly resolved | ||
assertTrue(redactedDoc.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class)); | ||
// validate ingest metadata structure correct | ||
var ingestMeta = redactedDoc.getIngestMetadata(); | ||
assertTrue(ingestMeta.containsKey(RedactProcessor.REDACT_KEY)); | ||
var redactMetadata = (HashMap<String, Object>) ingestMeta.get(RedactProcessor.REDACT_KEY); | ||
assertTrue(redactMetadata.containsKey(RedactProcessor.IS_REDACTED_KEY)); | ||
assertTrue((Boolean) redactMetadata.get(RedactProcessor.IS_REDACTED_KEY)); | ||
} | ||
{ | ||
var configNoTrace = new HashMap<String, Object>(); | ||
configNoTrace.put("field", "to_redact"); | ||
configNoTrace.put("patterns", List.of("%{EMAILADDRESS:REDACTED}")); | ||
|
||
var processor = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create(null, "t", "d", configNoTrace); | ||
var ingestDoc = createIngestDoc(Map.of("to_redact", "[email protected] will be redacted")); | ||
var redactedDoc = processor.execute(ingestDoc); | ||
|
||
assertEquals("<REDACTED> will be redacted", redactedDoc.getFieldValue("to_redact", String.class)); | ||
assertNull(redactedDoc.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class, true)); | ||
} | ||
} | ||
|
||
public void testTraceRedactMultipleProcessors() throws Exception { | ||
var configRedact = new HashMap<String, Object>(); | ||
configRedact.put("field", "to_redact"); | ||
configRedact.put("patterns", List.of("%{EMAILADDRESS:REDACTED}")); | ||
configRedact.put("trace_redact", true); | ||
|
||
var configNoRedact = new HashMap<String, Object>(); | ||
configNoRedact.put("field", "to_redact"); | ||
configNoRedact.put("patterns", List.of("%{IP:REDACTED}")); // not in the doc | ||
configNoRedact.put("trace_redact", true); | ||
|
||
// first processor does not redact doc, second one does | ||
{ | ||
var processorRedact = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create( | ||
null, | ||
"t1", | ||
"d", | ||
new HashMap<>(configRedact) | ||
); | ||
var processorNoRedact = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create( | ||
null, | ||
"t2", | ||
"d", | ||
new HashMap<>(configNoRedact) | ||
); | ||
var ingestDocWithEmail = createIngestDoc(Map.of("to_redact", "[email protected] will be redacted")); | ||
|
||
var docNotRedacted = processorNoRedact.execute(ingestDocWithEmail); | ||
assertNull(docNotRedacted.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class, true)); | ||
|
||
var docRedacted = processorRedact.execute(docNotRedacted); | ||
assertTrue(docRedacted.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class)); | ||
} | ||
// first processor redacts doc, second one does not | ||
{ | ||
var processorRedact = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create( | ||
null, | ||
"t1", | ||
"d", | ||
new HashMap<>(configRedact) | ||
); | ||
var processorNoRedact = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create( | ||
null, | ||
"t2", | ||
"d", | ||
new HashMap<>(configNoRedact) | ||
); | ||
var ingestDocWithEmail = createIngestDoc(Map.of("to_redact", "[email protected] will be redacted")); | ||
|
||
var docRedacted = processorRedact.execute(ingestDocWithEmail); | ||
assertTrue(docRedacted.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class)); | ||
|
||
// validate does not override already redacted doc metadata | ||
var docRedactedAlready = processorNoRedact.execute(docRedacted); | ||
assertTrue(docRedactedAlready.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class)); | ||
} | ||
} | ||
|
||
public void testMergeLongestRegion() { | ||
var r = List.of( | ||
new RedactProcessor.RegionTrackingMatchExtractor.Replacement(10, 20, "first"), | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,7 +24,7 @@ | |
index: test | ||
id: "1" | ||
pipeline: "pipeline-using-a-redact-processor" | ||
body: {to_redact: "0.0.0.1 is my secret IP to redact"} | ||
body: { to_redact: "0.0.0.1 is my secret IP to redact" } | ||
|
||
- do: | ||
get: | ||
|
@@ -96,3 +96,25 @@ | |
} | ||
- length: { docs: 1 } | ||
- match: { docs.0.doc._source.to_redact: "==*EMAIL*== will be redacted" } | ||
--- | ||
"Test redact with trace_redact": | ||
- do: | ||
ingest.simulate: | ||
body: > | ||
{ | ||
"pipeline": { | ||
"processors": [ | ||
{ | ||
"redact": { | ||
"field": "to_redact", | ||
"patterns": ["%{EMAILADDRESS:EMAIL}", "%{IP:IP_ADDRESS}"], | ||
"trace_redact": true | ||
} | ||
} | ||
] | ||
}, | ||
"docs": [{"_source": {"to_redact": "[email protected] will be redacted"}}] | ||
} | ||
- length: { docs: 1 } | ||
- match: { docs.0.doc._source.to_redact: "<EMAIL> will be redacted" } | ||
- match: { docs.0.doc._ingest._redact._is_redacted: true } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As I'm adding those fields to the Elasticsearch specification, I'm wondering, why do we need underscores here? Under
_ingest
, we already havetimestamp
andpipeline
that don't use underscores.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi @pquentin , to my understanding, underscore is convention for ES managed/populated attributes. For example, Grok processor adds similar field
_ingest._grok_match_index
with underscore in ingest metadata.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, thank you.