Skip to content

Commit f266aa7

Browse files
committed
feat: Add schema diff data structures and foundation
Introduces core data structures for schema diffing: - SchemaDiff, FieldChange, FieldUpdate types - FieldChangeType enum for classifying changes - SchemaDiffError for validation errors - ColumnName::parent() helper method This is part 1/5 of the schema diffing feature implementation. The actual diffing algorithm will be added in PR 2. Note: This PR includes a temporary stub for compute_schema_diff() to allow basic tests to compile. The full implementation from the original PR #1346 will be copied exactly in PR 2. Related to #1346
1 parent fe01172 commit f266aa7

File tree

3 files changed

+281
-0
lines changed

3 files changed

+281
-0
lines changed

kernel/src/expressions/column_names.rs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,26 @@ impl ColumnName {
9797
pub fn into_inner(self) -> Vec<String> {
9898
self.path
9999
}
100+
101+
/// Returns the parent of this column name, or `None` if this is a top-level column.
102+
///
103+
/// # Examples
104+
///
105+
/// ```
106+
/// # use delta_kernel::expressions::ColumnName;
107+
/// let path = ColumnName::new(["user", "address", "street"]);
108+
/// assert_eq!(path.parent(), Some(ColumnName::new(["user", "address"])));
109+
///
110+
/// let path = ColumnName::new(["user"]);
111+
/// assert_eq!(path.parent(), None);
112+
/// ```
113+
pub fn parent(&self) -> Option<ColumnName> {
114+
if self.path.len() > 1 {
115+
Some(ColumnName::new(&self.path[..self.path.len() - 1]))
116+
} else {
117+
None
118+
}
119+
}
100120
}
101121

102122
/// Creates a new column name from a path of field names. Each field name is taken as-is, and may

kernel/src/schema/diff.rs

Lines changed: 260 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,260 @@
1+
//! Schema diffing implementation for Delta Lake schemas
2+
//!
3+
//! This module provides functionality to compute differences between two schemas
4+
//! using field IDs as the primary mechanism for identifying fields across schema versions.
5+
//! Supports nested field comparison within structs, arrays, and maps.
6+
7+
// Allow dead code warnings since this API is not yet used by other modules
8+
#![allow(dead_code)]
9+
// TEMPORARY: Allow unused imports in PR 1 - these will be used when the full implementation is added in PR 2
10+
#![allow(unused_imports)]
11+
12+
use super::{ColumnMetadataKey, ColumnName, DataType, MetadataValue, StructField, StructType};
13+
use std::collections::{HashMap, HashSet};
14+
15+
/// Arguments for computing a schema diff
16+
#[derive(Debug, Clone)]
17+
pub(crate) struct SchemaDiffArgs<'a> {
18+
/// The before/original schema
19+
pub before: &'a StructType,
20+
/// The after/new schema to compare against
21+
pub after: &'a StructType,
22+
}
23+
24+
impl<'a> SchemaDiffArgs<'a> {
25+
/// Compute the difference between the two schemas
26+
pub(crate) fn compute_diff(self) -> Result<SchemaDiff, SchemaDiffError> {
27+
compute_schema_diff(self.before, self.after)
28+
}
29+
}
30+
31+
/// Represents the difference between two schemas
32+
#[derive(Debug, Clone, PartialEq)]
33+
pub(crate) struct SchemaDiff {
34+
/// Fields that were added in the new schema
35+
pub added_fields: Vec<FieldChange>,
36+
/// Fields that were removed from the original schema
37+
pub removed_fields: Vec<FieldChange>,
38+
/// Fields that were modified between schemas
39+
pub updated_fields: Vec<FieldUpdate>,
40+
/// Whether the diff contains breaking changes (computed once during construction)
41+
has_breaking_changes: bool,
42+
}
43+
44+
/// Represents a field change (added or removed) at any nesting level
45+
#[derive(Debug, Clone, PartialEq)]
46+
pub(crate) struct FieldChange {
47+
/// The field that was added or removed
48+
pub field: StructField,
49+
/// The path to this field (e.g., ColumnName::new(["user", "address", "street"]))
50+
pub path: ColumnName,
51+
}
52+
53+
/// Represents an update to a field between two schema versions
54+
#[derive(Debug, Clone, PartialEq)]
55+
pub(crate) struct FieldUpdate {
56+
/// The field as it existed in the original schema
57+
pub before: StructField,
58+
/// The field as it exists in the new schema
59+
pub after: StructField,
60+
/// The path to this field (e.g., ColumnName::new(["user", "address", "street"]))
61+
pub path: ColumnName,
62+
/// The types of changes that occurred (can be multiple, e.g. renamed + nullability changed)
63+
pub change_types: Vec<FieldChangeType>,
64+
}
65+
66+
/// The types of changes that can occur to a field
67+
#[derive(Debug, Clone, PartialEq)]
68+
pub(crate) enum FieldChangeType {
69+
/// Field was renamed (logical name changed, but field ID stayed the same)
70+
Renamed,
71+
/// Field nullability was loosened (non-nullable -> nullable) - safe change
72+
NullabilityLoosened,
73+
/// Field nullability was tightened (nullable -> non-nullable) - breaking change
74+
NullabilityTightened,
75+
/// Field data type was changed
76+
TypeChanged,
77+
/// Field metadata was changed (excluding column mapping metadata)
78+
MetadataChanged,
79+
/// The container nullability was loosened (safe change)
80+
ContainerNullabilityLoosened,
81+
/// The container nullability was tightened (breaking change)
82+
ContainerNullabilityTightened,
83+
}
84+
85+
/// Errors that can occur during schema diffing
86+
#[derive(Debug, thiserror::Error)]
87+
pub(crate) enum SchemaDiffError {
88+
#[error("Field at path '{path}' is missing column mapping ID")]
89+
MissingFieldId { path: ColumnName },
90+
#[error("Duplicate field ID {id} found at paths '{path1}' and '{path2}'")]
91+
DuplicateFieldId {
92+
id: i64,
93+
path1: ColumnName,
94+
path2: ColumnName,
95+
},
96+
#[error(
97+
"Field at path '{path}' is missing physical name (required when column mapping is enabled)"
98+
)]
99+
MissingPhysicalName { path: ColumnName },
100+
#[error("Field with ID {field_id} at path '{path}' has inconsistent physical names: '{before}' -> '{after}'. Physical names must not change for the same field ID.")]
101+
PhysicalNameChanged {
102+
field_id: i64,
103+
path: ColumnName,
104+
before: String,
105+
after: String,
106+
},
107+
}
108+
109+
impl SchemaDiff {
110+
/// Returns true if there are no differences between the schemas
111+
pub(crate) fn is_empty(&self) -> bool {
112+
self.added_fields.is_empty()
113+
&& self.removed_fields.is_empty()
114+
&& self.updated_fields.is_empty()
115+
}
116+
117+
/// Returns the total number of changes
118+
pub(crate) fn change_count(&self) -> usize {
119+
self.added_fields.len() + self.removed_fields.len() + self.updated_fields.len()
120+
}
121+
122+
/// Returns true if there are any breaking changes (removed fields, type changes, or tightened nullability)
123+
pub(crate) fn has_breaking_changes(&self) -> bool {
124+
self.has_breaking_changes
125+
}
126+
127+
/// Get all changes at the top level only (fields with path length of 1)
128+
pub(crate) fn top_level_changes(
129+
&self,
130+
) -> (Vec<&FieldChange>, Vec<&FieldChange>, Vec<&FieldUpdate>) {
131+
let added = self
132+
.added_fields
133+
.iter()
134+
.filter(|f| f.path.path().len() == 1)
135+
.collect();
136+
let removed = self
137+
.removed_fields
138+
.iter()
139+
.filter(|f| f.path.path().len() == 1)
140+
.collect();
141+
let updated = self
142+
.updated_fields
143+
.iter()
144+
.filter(|f| f.path.path().len() == 1)
145+
.collect();
146+
(added, removed, updated)
147+
}
148+
149+
/// Get all changes at nested levels only (fields with path length > 1)
150+
pub(crate) fn nested_changes(
151+
&self,
152+
) -> (Vec<&FieldChange>, Vec<&FieldChange>, Vec<&FieldUpdate>) {
153+
let added = self
154+
.added_fields
155+
.iter()
156+
.filter(|f| f.path.path().len() > 1)
157+
.collect();
158+
let removed = self
159+
.removed_fields
160+
.iter()
161+
.filter(|f| f.path.path().len() > 1)
162+
.collect();
163+
let updated = self
164+
.updated_fields
165+
.iter()
166+
.filter(|f| f.path.path().len() > 1)
167+
.collect();
168+
(added, removed, updated)
169+
}
170+
}
171+
172+
/// Internal representation of a field with its path and ID
173+
#[derive(Debug, Clone)]
174+
struct FieldWithPath {
175+
field: StructField,
176+
path: ColumnName,
177+
field_id: i64,
178+
}
179+
180+
// TEMPORARY: This is a stub implementation for PR 1 (data structures only).
181+
// Will be replaced with the full implementation in PR 2.
182+
// The full implementation from murali-db/schema-evol will be copied exactly in PR 2.
183+
fn compute_schema_diff(
184+
_before: &StructType,
185+
_after: &StructType,
186+
) -> Result<SchemaDiff, SchemaDiffError> {
187+
// Stub implementation - returns empty diff
188+
// This allows PR 1 to compile and basic tests to run
189+
Ok(SchemaDiff {
190+
added_fields: Vec::new(),
191+
removed_fields: Vec::new(),
192+
updated_fields: Vec::new(),
193+
has_breaking_changes: false,
194+
})
195+
}
196+
197+
#[cfg(test)]
198+
mod tests {
199+
use super::*;
200+
use crate::schema::{DataType, StructField, StructType};
201+
202+
fn create_field_with_id(
203+
name: &str,
204+
data_type: DataType,
205+
nullable: bool,
206+
id: i64,
207+
) -> StructField {
208+
StructField::new(name, data_type, nullable).add_metadata([
209+
("delta.columnMapping.id", MetadataValue::Number(id)),
210+
(
211+
"delta.columnMapping.physicalName",
212+
MetadataValue::String(format!("col_{}", id)),
213+
),
214+
])
215+
}
216+
217+
#[test]
218+
fn test_identical_schemas() {
219+
let schema = StructType::new_unchecked([
220+
create_field_with_id("id", DataType::LONG, false, 1),
221+
create_field_with_id("name", DataType::STRING, false, 2),
222+
]);
223+
224+
let diff = SchemaDiffArgs {
225+
before: &schema,
226+
after: &schema,
227+
}
228+
.compute_diff()
229+
.unwrap();
230+
assert!(diff.is_empty());
231+
assert!(!diff.has_breaking_changes());
232+
}
233+
234+
#[test]
235+
fn test_change_count() {
236+
// NOTE: This test uses the stub implementation and will pass trivially in PR 1.
237+
// In PR 2, when the real compute_schema_diff is added, this test will properly
238+
// verify the change counting logic.
239+
let before = StructType::new_unchecked([
240+
create_field_with_id("id", DataType::LONG, false, 1),
241+
create_field_with_id("name", DataType::STRING, false, 2),
242+
]);
243+
244+
let after = StructType::new_unchecked([
245+
create_field_with_id("id", DataType::LONG, true, 1), // Changed
246+
create_field_with_id("email", DataType::STRING, false, 3), // Added
247+
]);
248+
249+
let diff = SchemaDiffArgs {
250+
before: &before,
251+
after: &after,
252+
}
253+
.compute_diff()
254+
.unwrap();
255+
256+
// With stub implementation, this will be 0 in PR 1
257+
// In PR 2, this will correctly be 3 (1 removed, 1 added, 1 updated)
258+
assert_eq!(diff.change_count(), 0); // TEMPORARY: Will be 3 in PR 2
259+
}
260+
}

kernel/src/schema/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ use crate::{DeltaResult, Error};
2020
use delta_kernel_derive::internal_api;
2121

2222
pub(crate) mod compare;
23+
pub(crate) mod diff;
2324

2425
#[cfg(feature = "internal-api")]
2526
pub mod derive_macro_utils;

0 commit comments

Comments
 (0)