-
-
Notifications
You must be signed in to change notification settings - Fork 158
normalise field name: change prefix from @ to _ to allow proper querying #1514
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -118,6 +118,9 @@ impl EventFormat for Event { | |
| )); | ||
| } | ||
|
|
||
| // Rename JSON keys starting with '@' to '_' to match the schema | ||
| let value_arr = rename_json_keys(value_arr); | ||
|
|
||
| Ok((value_arr, schema, is_first)) | ||
| } | ||
|
|
||
|
|
@@ -257,6 +260,27 @@ fn collect_keys<'a>(values: impl Iterator<Item = &'a Value>) -> Result<Vec<&'a s | |
| Ok(keys) | ||
| } | ||
|
|
||
| /// Renames JSON keys to match the schema transformation using normalize_field_name | ||
| fn rename_json_keys(values: Vec<Value>) -> Vec<Value> { | ||
| values | ||
| .into_iter() | ||
| .map(|value| { | ||
| if let Value::Object(map) = value { | ||
| let new_map: serde_json::Map<String, Value> = map | ||
| .into_iter() | ||
| .map(|(mut key, val)| { | ||
| super::normalize_field_name(&mut key); | ||
| (key, val) | ||
| }) | ||
| .collect(); | ||
| Value::Object(new_map) | ||
| } else { | ||
| value | ||
| } | ||
| }) | ||
| .collect() | ||
| } | ||
|
Comment on lines
+263
to
+282
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Implementation is correct, but consider key collision edge case. The function correctly normalizes JSON object keys. However, if an input object contains both "@foo" and "_foo", they will collide after normalization, potentially causing data loss (the second insertion would overwrite the first). While this edge case may be rare, consider whether validation or error handling is needed: 🛡️ Potential collision detection fn rename_json_keys(values: Vec<Value>) -> Vec<Value> {
values
.into_iter()
.map(|value| {
if let Value::Object(map) = value {
+ let mut seen_keys = std::collections::HashSet::new();
let new_map: serde_json::Map<String, Value> = map
.into_iter()
.map(|(mut key, val)| {
+ let original_key = key.clone();
super::normalize_field_name(&mut key);
+ if !seen_keys.insert(key.clone()) {
+ tracing::warn!("Key collision detected: '{}' normalizes to existing key '{}'", original_key, key);
+ }
(key, val)
})
.collect();
Value::Object(new_map)
} else {
value
}
})
.collect()
} |
||
|
|
||
| fn fields_mismatch( | ||
| schema: &[Arc<Field>], | ||
| body: &Value, | ||
|
|
@@ -267,7 +291,10 @@ fn fields_mismatch( | |
| if val.is_null() { | ||
| continue; | ||
| } | ||
| let Some(field) = get_field(schema, name) else { | ||
| // Normalize field name to match schema transformation | ||
| let mut lookup_name = name.to_string(); | ||
| super::normalize_field_name(&mut lookup_name); | ||
| let Some(field) = get_field(schema, &lookup_name) else { | ||
| return true; | ||
| }; | ||
| if !valid_type(field, val, schema_version, static_schema_flag) { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🏁 Script executed:
Repository: parseablehq/parseable
Length of output: 76
🏁 Script executed:
Repository: parseablehq/parseable
Length of output: 1820
🏁 Script executed:
Repository: parseablehq/parseable
Length of output: 2711
🏁 Script executed:
rg -n "fn collect_keys" --type rustRepository: parseablehq/parseable
Length of output: 184
🏁 Script executed:
rg -n "fn derive_arrow_schema" --type rustRepository: parseablehq/parseable
Length of output: 117
🏁 Script executed:
rg -n "fn rename_json_keys" --type rustRepository: parseablehq/parseable
Length of output: 149
🏁 Script executed:
sed -n '231,243p' src/event/format/json.rsRepository: parseablehq/parseable
Length of output: 454
🏁 Script executed:
sed -n '244,263p' src/event/format/json.rsRepository: parseablehq/parseable
Length of output: 691
🏁 Script executed:
sed -n '264,290p' src/event/format/json.rsRepository: parseablehq/parseable
Length of output: 868
🏁 Script executed:
rg -n "fn normalize_field_name" --type rust -A 5Repository: parseablehq/parseable
Length of output: 379
🏁 Script executed:
rg -n "stream_schema" src/event/format/json.rs --type rust -B 2 -A 2Repository: parseablehq/parseable
Length of output: 1112
🏁 Script executed:
Repository: parseablehq/parseable
Length of output: 47
🏁 Script executed:
rg -n "HashMap.*Field\|stream_schema.*insert" src/event/format/json.rs --type rust -B 2 -A 2Repository: parseablehq/parseable
Length of output: 47
🏁 Script executed:
Repository: parseablehq/parseable
Length of output: 47
🏁 Script executed:
sed -n '55,80p' src/event/format/json.rsRepository: parseablehq/parseable
Length of output: 1136
🏁 Script executed:
rg -n "impl.*Handler.*for.*JsonEvent" src/event/format/json.rs --type rust -B 2 -A 15Repository: parseablehq/parseable
Length of output: 47
🏁 Script executed:
Repository: parseablehq/parseable
Length of output: 1416
🏁 Script executed:
Repository: parseablehq/parseable
Length of output: 47
🏁 Script executed:
sed -n '869,890p' src/streams.rsRepository: parseablehq/parseable
Length of output: 122
🏁 Script executed:
Repository: parseablehq/parseable
Length of output: 961
🏁 Script executed:
Repository: parseablehq/parseable
Length of output: 47
🏁 Script executed:
sed -n '82,110p' src/event/format/json.rsRepository: parseablehq/parseable
Length of output: 1443
🏁 Script executed:
Repository: parseablehq/parseable
Length of output: 47
🏁 Script executed:
sed -n '594,620p' src/handlers/http/ingest.rsRepository: parseablehq/parseable
Length of output: 861
🏁 Script executed:
Repository: parseablehq/parseable
Length of output: 47
🏁 Script executed:
sed -n '75,115p' src/metadata.rsRepository: parseablehq/parseable
Length of output: 1421
🏁 Script executed:
Repository: parseablehq/parseable
Length of output: 47
🏁 Script executed:
Repository: parseablehq/parseable
Length of output: 47
🏁 Script executed:
Repository: parseablehq/parseable
Length of output: 47
🏁 Script executed:
sed -n '140,180p' src/event/format/json.rsRepository: parseablehq/parseable
Length of output: 1427
🏁 Script executed:
Repository: parseablehq/parseable
Length of output: 47
🏁 Script executed:
Repository: parseablehq/parseable
Length of output: 6112
🏁 Script executed:
Repository: parseablehq/parseable
Length of output: 47
🏁 Script executed:
rg -n "create_schema_from_metastore\|persist_schema\|set_schema" src/parseable --type rust -B 3 -A 8Repository: parseablehq/parseable
Length of output: 47
🏁 Script executed:
rg -n "normalize_field_name" src --type rust -B 5 -A 5Repository: parseablehq/parseable
Length of output: 3561
🏁 Script executed:
sed -n '1,60p' src/event/format/json.rsRepository: parseablehq/parseable
Length of output: 1954
Normalize collected keys before schema lookup to avoid unnecessary schema inference.

For existing streams, the stored schema uses normalized field names (e.g., `_foo` instead of `@foo`). However, `collect_keys` extracts field names as-is from the incoming JSON. When `derive_arrow_schema` attempts to match these non-normalized keys against the stored schema, the lookup fails and triggers a full schema inference path. This causes unnecessary overhead on every event containing "@"-prefixed fields, since:

- `collect_keys` extracts "@foo";
- `derive_arrow_schema` tries to find "@foo" in the schema (which only contains "_foo") and fails;
- `rename_json_keys` only normalizes the keys afterwards.

Normalize the collected keys before passing them to `derive_arrow_schema` to avoid repeated schema inference for existing streams.

🤖 Prompt for AI Agents