diff --git a/relay-server/src/processing/spans/process.rs b/relay-server/src/processing/spans/process.rs index 66b6f0eab7..de7ef90a05 100644 --- a/relay-server/src/processing/spans/process.rs +++ b/relay-server/src/processing/spans/process.rs @@ -7,14 +7,15 @@ use relay_event_normalization::{ }; use relay_event_schema::processor::{ProcessingState, ValueType, process_value}; use relay_event_schema::protocol::{AttachmentV2Meta, Span, SpanId, SpanV2}; +use relay_pii::PiiAttachmentsProcessor; use relay_protocol::Annotated; use crate::envelope::{ContainerItems, EnvelopeHeaders, Item, ItemContainer, ParentId, WithHeader}; use crate::managed::Managed; -use crate::processing::Context; use crate::processing::spans::{ self, Error, ExpandedAttachment, ExpandedSpan, ExpandedSpans, Result, SampledSpans, }; +use crate::processing::{Context, utils}; use crate::services::outcome::DiscardReason; /// Parses all serialized spans. @@ -224,16 +225,32 @@ fn validate_timestamps(span: &SpanV2) -> Result<()> { pub fn scrub(spans: &mut Managed, ctx: Context<'_>) { spans.retain( |spans| &mut spans.spans, - |span, _| { + |span, r| { scrub_span(&mut span.span, ctx) .inspect_err(|err| relay_log::debug!("failed to scrub pii from span: {err}"))?; - // TODO: Also scrub the attachment + span.attachments + .retain_mut(|attachment| match scrub_attachment(attachment, ctx) { + Ok(()) => true, + Err(err) => { + relay_log::debug!("failed to scrub pii from span attachment: {err}"); + r.reject_err(err, &*attachment); + false + } + }); + Ok::<(), Error>(()) }, ); - // TODO: Also scrub the standalone attachments + spans.retain( + |spans| &mut spans.stand_alone_attachments, + |attachment, _| { + scrub_attachment(attachment, ctx).inspect_err(|err| { + relay_log::debug!("failed to scrub pii from span attachment: {err}") + }) + }, + ); } fn scrub_span(span: &mut Annotated, ctx: Context<'_>) -> Result<()> { @@ -254,6 +271,47 @@ fn scrub_span(span: &mut Annotated, ctx: Context<'_>) -> Result<()> { Ok(()) } +fn scrub_attachment(attachment: &mut ExpandedAttachment, ctx: Context<'_>) -> Result<()> { + let pii_config_from_scrubbing = ctx + .project_info + .config + .datascrubbing_settings + .pii_config() + .map_err(|e| Error::PiiConfig(e.clone()))?; + + let ExpandedAttachment { meta, body } = attachment; + + relay_pii::eap::scrub( + ValueType::Attachments, // TODO: How do we event want to allow people to filter here? + meta, + ctx.project_info.config.pii_config.as_ref(), + pii_config_from_scrubbing.as_ref(), + )?; + + if let Some(config) = ctx.project_info.config.pii_config.as_ref() + // FIXME: Not a fan of this at all, the fact that the UI is misleading around this does not help IMO. + // Also also kind of weird since you can add a bogus rule to get around this (or even add a rule by accident to get around this). + // From attachments.rs: + // We temporarily only scrub attachments to projects that have at least one simple attachment rule, + // such as `$attachments.'foo.txt'`. + // After we have assessed the impact on performance we can relax this condition. + && utils::attachments::has_simple_attachment_selector(config) + { + let filename = meta + .value() + .and_then(|m| m.filename.as_str()) + .unwrap_or_default(); + + let processor = PiiAttachmentsProcessor::new(config.compiled()); + let mut payload = body.to_vec(); + if processor.scrub_attachment(filename, &mut payload) { + *body = Bytes::from(payload); + }; + } + + Ok(()) +} + fn span_duration(span: &SpanV2) -> Option { let start_timestamp = *span.start_timestamp.value()?; let end_timestamp = *span.end_timestamp.value()?; @@ -266,6 +324,7 @@ mod tests { use relay_pii::{DataScrubbingConfig, PiiConfig}; use relay_protocol::SerializableAnnotated; use relay_sampling::evaluation::ReservoirCounters; + use uuid::Uuid; use crate::services::projects::project::ProjectInfo; @@ -701,6 +760,91 @@ mod tests { "###); } + #[test] + fn test_scrub_attachment_body() { + let pii_config = serde_json::from_value::(serde_json::json!({ + "rules": {"0": {"type": "ip", "redaction": {"method": "remove"}}}, + "applications": {"$attachments.'data.txt'": ["0"]} + })) + .unwrap(); + + let mut attachment = ExpandedAttachment { + meta: Annotated::new(AttachmentV2Meta { + attachment_id: Annotated::new(Uuid::new_v4()), + filename: Annotated::new("data.txt".to_owned()), + content_type: Annotated::new("text/plain".to_owned()), + ..Default::default() + }), + body: Bytes::from("Some IP: 127.0.0.1"), + }; + + let ctx = make_context(DataScrubbingConfig::default(), Some(pii_config)); + scrub_attachment(&mut attachment, ctx).unwrap(); + + assert_eq!(attachment.body, "Some IP: *********"); + } + + #[test] + fn test_scrub_attachment_meta() { + use relay_event_schema::protocol::{Attribute, AttributeType, Attributes}; + use relay_protocol::Value; + + let pii_config = serde_json::from_value::(serde_json::json!({ + "applications": {"$string": ["@email:replace"]} + })) + .unwrap(); + + let mut attributes = Attributes::new(); + attributes.0.insert( + "email_attr".to_owned(), + Annotated::new(Attribute::new( + AttributeType::String, + Value::String("john.doe@example.com".to_owned()), + )), + ); + + let mut attachment = ExpandedAttachment { + meta: Annotated::new(AttachmentV2Meta { + attachment_id: Annotated::new(Uuid::new_v4()), + filename: Annotated::new("data.txt".to_owned()), + content_type: Annotated::new("text/plain".to_owned()), + attributes: Annotated::new(attributes), + ..Default::default() + }), + body: Bytes::from("Some attachment body"), + }; + + let ctx = make_context(DataScrubbingConfig::default(), Some(pii_config)); + scrub_attachment(&mut attachment, ctx).unwrap(); + + let attrs = &attachment.meta.value().unwrap().attributes; + insta::assert_json_snapshot!(SerializableAnnotated(attrs), @r#" + { + "email_attr": { + "type": "string", + "value": "[email]" + }, + "_meta": { + "email_attr": { + "value": { + "": { + "rem": [ + [ + "@email:replace", + "s", + 0, + 7 + ] + ], + "len": 20 + } + } + } + } + } + "#); + } + #[test] fn test_validate_timestamp_start_before_end() { validate_timestamps(&SpanV2 { diff --git a/relay-server/src/processing/utils/attachments.rs b/relay-server/src/processing/utils/attachments.rs index 2341dbf543..d0fced6ecd 100644 --- a/relay-server/src/processing/utils/attachments.rs +++ b/relay-server/src/processing/utils/attachments.rs @@ -108,7 +108,7 @@ fn scrub_view_hierarchy(item: &mut crate::envelope::Item, config: &relay_pii::Pi } } -fn has_simple_attachment_selector(config: &relay_pii::PiiConfig) -> bool { +pub fn has_simple_attachment_selector(config: &relay_pii::PiiConfig) -> bool { for application in &config.applications { if let SelectorSpec::Path(vec) = &application.0 { let Some([a, b]) = vec.get(0..2) else { diff --git a/tests/integration/test_attachmentsv2.py b/tests/integration/test_attachmentsv2.py index be406b06bf..436e4080a0 100644 --- a/tests/integration/test_attachmentsv2.py +++ b/tests/integration/test_attachmentsv2.py @@ -800,3 +800,351 @@ def test_span_attachment_independent_rate_limiting( assert outcome_counter == expected_outcomes outcomes_consumer.assert_empty() + + +def test_attachment_default_pii_scrubbing_meta( + mini_sentry, + relay, + secret_attribute, +): + attribute_key, attribute_value, expected_value, rule_type = secret_attribute + + project_id = 42 + project_config = mini_sentry.add_full_project_config(project_id) + project_config["config"]["features"] = [ + "organizations:standalone-span-ingestion", + "projects:span-v2-experimental-processing", + "projects:span-v2-attachment-processing", + ] + project_config["config"].setdefault( + "datascrubbingSettings", + { + "scrubData": True, + "scrubDefaults": True, + "scrubIpAddresses": True, + }, + ) + + relay = relay(mini_sentry, options=TEST_CONFIG) + + ts = datetime.now(timezone.utc) + span_id = "eee19b7ec3c1b174" + trace_id = "5b8efff798038103d269b633813fc60c" + envelope = envelope_with_spans( + { + "start_timestamp": ts.timestamp(), + "end_timestamp": ts.timestamp() + 0.5, + "trace_id": trace_id, + "span_id": span_id, + "is_segment": True, + "name": "test span", + "status": "ok", + }, + trace_info={ + "trace_id": trace_id, + "public_key": project_config["publicKeys"][0]["publicKey"], + }, + ) + + metadata = { + "attachment_id": str(uuid.uuid4()), + "timestamp": ts.timestamp(), + "filename": "data.txt", + "content_type": "text/plain", + "attributes": { + attribute_key: {"type": "string", "value": attribute_value}, + }, + } + body = b"Some content" + metadata_bytes = json.dumps(metadata, separators=(",", ":")).encode("utf-8") + combined_payload = metadata_bytes + body + + envelope.add_item( + Item( + payload=PayloadRef(bytes=combined_payload), + type="attachment", + headers={ + "content_type": "application/vnd.sentry.attachment.v2", + "meta_length": len(metadata_bytes), + "span_id": span_id, + "length": len(combined_payload), + }, + ) + ) + + relay.send_envelope(project_id, envelope) + forwarded = mini_sentry.get_captured_event() + + attachment = next(i for i in forwarded.items if i.type == "attachment") + meta_length = attachment.headers.get("meta_length") + payload = attachment.payload.bytes + metadata_part = json.loads(payload[:meta_length].decode("utf-8")) + + assert metadata_part == { + "attachment_id": mock.ANY, + "timestamp": time_within_delta(ts), + "filename": "data.txt", + "content_type": "text/plain", + "attributes": { + attribute_key: {"type": "string", "value": expected_value}, + }, + "_meta": { + "attributes": { + attribute_key: { + "value": { + "": { + "len": mock.ANY, + "rem": [[rule_type, mock.ANY, mock.ANY, mock.ANY]], + } + } + } + } + }, + } + + +def test_attachment_pii_scrubbing_meta_attribute( + mini_sentry, + relay, + scrubbing_rule, +): + rule_type, test_value, expected_scrubbed = scrubbing_rule + + project_id = 42 + project_config = mini_sentry.add_full_project_config(project_id) + project_config["config"]["features"] = [ + "organizations:standalone-span-ingestion", + "projects:span-v2-experimental-processing", + "projects:span-v2-attachment-processing", + ] + project_config["config"]["piiConfig"]["applications"] = {"$string": [rule_type]} + + relay = relay(mini_sentry, options=TEST_CONFIG) + + ts = datetime.now(timezone.utc) + span_id = "eee19b7ec3c1b174" + trace_id = "5b8efff798038103d269b633813fc60c" + envelope = envelope_with_spans( + { + "start_timestamp": ts.timestamp(), + "end_timestamp": ts.timestamp() + 0.5, + "trace_id": trace_id, + "span_id": span_id, + "is_segment": True, + "name": "test span", + "status": "ok", + }, + trace_info={ + "trace_id": trace_id, + "public_key": project_config["publicKeys"][0]["publicKey"], + }, + ) + + metadata = { + "attachment_id": str(uuid.uuid4()), + "timestamp": ts.timestamp(), + "filename": "data.txt", + "content_type": "text/plain", + "attributes": { + "test_pii": {"type": "string", "value": test_value}, + }, + } + body = b"Some attachment content" + metadata_bytes = json.dumps(metadata, separators=(",", ":")).encode("utf-8") + combined_payload = metadata_bytes + body + + envelope.add_item( + Item( + payload=PayloadRef(bytes=combined_payload), + type="attachment", + headers={ + "content_type": "application/vnd.sentry.attachment.v2", + "meta_length": len(metadata_bytes), + "span_id": span_id, + "length": len(combined_payload), + }, + ) + ) + + relay.send_envelope(project_id, envelope) + forwarded = mini_sentry.get_captured_event() + + attachment = next(i for i in forwarded.items if i.type == "attachment") + meta_length = attachment.headers.get("meta_length") + payload = attachment.payload.bytes + metadata_part = json.loads(payload[:meta_length].decode("utf-8")) + + assert metadata_part == { + "attachment_id": mock.ANY, + "timestamp": time_within_delta(ts), + "filename": "data.txt", + "content_type": "text/plain", + "attributes": { + "test_pii": {"type": "string", "value": expected_scrubbed}, + }, + "_meta": { + "attributes": { + "test_pii": { + "value": { + "": { + "len": mock.ANY, + "rem": [[rule_type, mock.ANY, mock.ANY, mock.ANY]], + } + } + } + } + }, + } + + +def test_attachment_pii_scrubbing_body(mini_sentry, relay): + project_id = 42 + project_config = mini_sentry.add_full_project_config(project_id) + project_config["config"]["features"] = [ + "organizations:standalone-span-ingestion", + "projects:span-v2-experimental-processing", + "projects:span-v2-attachment-processing", + ] + project_config["config"]["piiConfig"] = { + "rules": {"0": {"type": "ip", "redaction": {"method": "remove"}}}, + "applications": {"$attachments.'log.txt'": ["0"]}, + } + + relay = relay(mini_sentry, options=TEST_CONFIG) + + ts = datetime.now(timezone.utc) + span_id = "eee19b7ec3c1b174" + trace_id = "5b8efff798038103d269b633813fc60c" + envelope = envelope_with_spans( + { + "start_timestamp": ts.timestamp(), + "end_timestamp": ts.timestamp() + 0.5, + "trace_id": trace_id, + "span_id": span_id, + "is_segment": True, + "name": "test span", + "status": "ok", + }, + trace_info={ + "trace_id": trace_id, + "public_key": project_config["publicKeys"][0]["publicKey"], + }, + ) + + metadata = { + "attachment_id": str(uuid.uuid4()), + "timestamp": ts.timestamp(), + "filename": "log.txt", + "content_type": "text/plain", + } + body = b"before 127.0.0.1 after" + metadata_bytes = json.dumps(metadata, separators=(",", ":")).encode("utf-8") + combined_payload = metadata_bytes + body + + envelope.add_item( + Item( + payload=PayloadRef(bytes=combined_payload), + type="attachment", + headers={ + "content_type": "application/vnd.sentry.attachment.v2", + "meta_length": len(metadata_bytes), + "span_id": span_id, + "length": len(combined_payload), + }, + ) + ) + + relay.send_envelope(project_id, envelope) + forwarded = mini_sentry.get_captured_event() + + attachment = next(i for i in forwarded.items if i.type == "attachment") + meta_length = attachment.headers.get("meta_length") + payload = attachment.payload.bytes + body_part = payload[meta_length:] + + assert body_part == b"before ********* after" + + +def test_attachment_pii_selector_on_attachment_meta_data(mini_sentry, relay): + project_id = 42 + project_config = mini_sentry.add_full_project_config(project_id) + project_config["config"]["features"] = [ + "organizations:standalone-span-ingestion", + "projects:span-v2-experimental-processing", + "projects:span-v2-attachment-processing", + ] + + # TODO: Decide if this is the most sensible selector + project_config["config"]["piiConfig"] = { + "applications": {"$attachments.**": ["@ip:replace"]}, + } + + relay = relay(mini_sentry, options=TEST_CONFIG) + + ts = datetime.now(timezone.utc) + span_id = "eee19b7ec3c1b174" + trace_id = "5b8efff798038103d269b633813fc60c" + envelope = envelope_with_spans( + { + "start_timestamp": ts.timestamp(), + "end_timestamp": ts.timestamp() + 0.5, + "trace_id": trace_id, + "span_id": span_id, + "is_segment": True, + "name": "test span", + "status": "ok", + }, + trace_info={ + "trace_id": trace_id, + "public_key": project_config["publicKeys"][0]["publicKey"], + }, + ) + + metadata = { + "attachment_id": str(uuid.uuid4()), + "timestamp": ts.timestamp(), + "filename": "data.txt", + "content_type": "text/plain", + "attributes": { + "test_pii": {"type": "string", "value": "127.0.0.1"}, + }, + } + body = b"Some content" + metadata_bytes = json.dumps(metadata, separators=(",", ":")).encode("utf-8") + combined_payload = metadata_bytes + body + + envelope.add_item( + Item( + payload=PayloadRef(bytes=combined_payload), + type="attachment", + headers={ + "content_type": "application/vnd.sentry.attachment.v2", + "meta_length": len(metadata_bytes), + "span_id": span_id, + "length": len(combined_payload), + }, + ) + ) + + relay.send_envelope(project_id, envelope) + forwarded = mini_sentry.get_captured_event() + + attachment = next(i for i in forwarded.items if i.type == "attachment") + meta_length = attachment.headers.get("meta_length") + payload = attachment.payload.bytes + metadata_part = json.loads(payload[:meta_length].decode("utf-8")) + + assert metadata_part == { + "attachment_id": mock.ANY, + "timestamp": time_within_delta(ts), + "filename": "data.txt", + "content_type": "text/plain", + "attributes": {"test_pii": {"type": "string", "value": "[ip]"}}, + "_meta": { + "attributes": { + "test_pii": { + "value": {"": {"rem": [["@ip:replace", "s", 0, 4]], "len": 9}} + } + } + }, + }