Skip to content

Commit c25d179

Browse files
authored
feat: Add field-level descriptions to FieldSchema (#1087)
* feat: Add field-level descriptions to FieldSchema - Add description field to Rust FieldSchema struct (Arc<str>) - Update Python FieldSchema to include description field - Extract Pydantic field descriptions during schema creation - Update JSON schema builder to include field descriptions - Add comprehensive tests for field description functionality Resolves #1074 * fix: concatenate field and type descriptions instead of overwriting - Modified set_description method to append descriptions with newline separator - Fixed description overwrite issue where field-level descriptions would replace type-level descriptions - Added description: None to all FieldSchema initializations to satisfy new field requirement - Added comprehensive test to verify description concatenation works correctly - All existing tests pass (129 passed, 0 failed) with no regressions detected - Implementation handles both extract_descriptions: true and false scenarios - Field-level descriptions now properly concatenate with type-level descriptions using newline separator - Resolves reviewer feedback about description overwrite in PR #1087 - Changes maintain backward compatibility while fixing the core issue - Ready for review and testing by the maintainer team * pre-commit pass
1 parent 9baca6e commit c25d179

File tree

8 files changed

+251
-8
lines changed

8 files changed

+251
-8
lines changed

pyproject.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,3 +84,11 @@ strict = true
8484
files = "python/cocoindex"
8585
exclude = "(\\.venv|site-packages)"
8686
disable_error_code = ["unused-ignore"]
87+
[[tool.mypy.overrides]]
88+
module = [
89+
"sentence_transformers",
90+
"torch",
91+
"colpali_engine",
92+
"PIL",
93+
]
94+
ignore_missing_imports = true

python/cocoindex/tests/test_convert.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1715,3 +1715,86 @@ class MixedStruct:
17151715
order = OrderPydantic(order_id="O1", name="item1", price=10.0)
17161716
mixed = MixedStruct(name="test", pydantic_order=order)
17171717
validate_full_roundtrip(mixed, MixedStruct)
1718+
1719+
1720+
@pytest.mark.skipif(not PYDANTIC_AVAILABLE, reason="Pydantic not available")
1721+
def test_pydantic_field_descriptions() -> None:
1722+
"""Test that Pydantic field descriptions are extracted and included in schema."""
1723+
from pydantic import BaseModel, Field
1724+
1725+
class UserWithDescriptions(BaseModel):
1726+
"""A user model with field descriptions."""
1727+
1728+
name: str = Field(description="The user's full name")
1729+
age: int = Field(description="The user's age in years", ge=0, le=150)
1730+
email: str = Field(description="The user's email address")
1731+
is_active: bool = Field(
1732+
description="Whether the user account is active", default=True
1733+
)
1734+
1735+
# Test that field descriptions are extracted
1736+
encoded_schema = dump_engine_object(UserWithDescriptions)
1737+
1738+
# Check that the schema contains field descriptions
1739+
assert "fields" in encoded_schema["type"]
1740+
fields = encoded_schema["type"]["fields"]
1741+
1742+
# Find fields by name and check descriptions
1743+
field_descriptions = {field["name"]: field.get("description") for field in fields}
1744+
1745+
assert field_descriptions["name"] == "The user's full name"
1746+
assert field_descriptions["age"] == "The user's age in years"
1747+
assert field_descriptions["email"] == "The user's email address"
1748+
assert field_descriptions["is_active"] == "Whether the user account is active"
1749+
1750+
1751+
@pytest.mark.skipif(not PYDANTIC_AVAILABLE, reason="Pydantic not available")
1752+
def test_pydantic_field_descriptions_without_field() -> None:
1753+
"""Test that Pydantic models without field descriptions work correctly."""
1754+
from pydantic import BaseModel
1755+
1756+
class UserWithoutDescriptions(BaseModel):
1757+
"""A user model without field descriptions."""
1758+
1759+
name: str
1760+
age: int
1761+
email: str
1762+
1763+
# Test that the schema works without descriptions
1764+
encoded_schema = dump_engine_object(UserWithoutDescriptions)
1765+
1766+
# Check that the schema contains fields but no descriptions
1767+
assert "fields" in encoded_schema["type"]
1768+
fields = encoded_schema["type"]["fields"]
1769+
1770+
# Verify no descriptions are present
1771+
for field in fields:
1772+
assert "description" not in field or field["description"] is None
1773+
1774+
1775+
@pytest.mark.skipif(not PYDANTIC_AVAILABLE, reason="Pydantic not available")
1776+
def test_pydantic_mixed_descriptions() -> None:
1777+
"""Test Pydantic model with some fields having descriptions and others not."""
1778+
from pydantic import BaseModel, Field
1779+
1780+
class MixedDescriptions(BaseModel):
1781+
"""A model with mixed field descriptions."""
1782+
1783+
name: str = Field(description="The name field")
1784+
age: int # No description
1785+
email: str = Field(description="The email field")
1786+
active: bool # No description
1787+
1788+
# Test that only fields with descriptions have them in the schema
1789+
encoded_schema = dump_engine_object(MixedDescriptions)
1790+
1791+
assert "fields" in encoded_schema["type"]
1792+
fields = encoded_schema["type"]["fields"]
1793+
1794+
# Find fields by name and check descriptions
1795+
field_descriptions = {field["name"]: field.get("description") for field in fields}
1796+
1797+
assert field_descriptions["name"] == "The name field"
1798+
assert field_descriptions["age"] is None
1799+
assert field_descriptions["email"] == "The email field"
1800+
assert field_descriptions["active"] is None

python/cocoindex/typing.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -359,7 +359,9 @@ def _encode_struct_schema(
359359
) -> tuple[dict[str, Any], int | None]:
360360
fields = []
361361

362-
def add_field(name: str, analyzed_type: AnalyzedTypeInfo) -> None:
362+
def add_field(
363+
name: str, analyzed_type: AnalyzedTypeInfo, description: str | None = None
364+
) -> None:
363365
try:
364366
type_info = encode_enriched_type_info(analyzed_type)
365367
except ValueError as e:
@@ -369,6 +371,8 @@ def add_field(name: str, analyzed_type: AnalyzedTypeInfo) -> None:
369371
)
370372
raise
371373
type_info["name"] = name
374+
if description is not None:
375+
type_info["description"] = description
372376
fields.append(type_info)
373377

374378
def add_fields_from_struct(struct_type: type) -> None:
@@ -384,7 +388,9 @@ def add_fields_from_struct(struct_type: type) -> None:
384388
for name, field_info in struct_type.model_fields.items(): # type: ignore[attr-defined]
385389
# Get the annotation from the field info
386390
field_type = field_info.annotation
387-
add_field(name, analyze_type_info(field_type))
391+
# Extract description from Pydantic field info
392+
description = getattr(field_info, "description", None)
393+
add_field(name, analyze_type_info(field_type), description)
388394
else:
389395
raise ValueError(f"Invalid Pydantic model: {struct_type}")
390396
else:
@@ -667,6 +673,7 @@ def encode(self) -> dict[str, Any]:
667673
class FieldSchema:
668674
name: str
669675
value_type: EnrichedValueType
676+
description: str | None = None
670677

671678
def __str__(self) -> str:
672679
return f"{self.name}: {self.value_type}"
@@ -676,11 +683,17 @@ def __repr__(self) -> str:
676683

677684
@staticmethod
678685
def decode(obj: dict[str, Any]) -> "FieldSchema":
679-
return FieldSchema(name=obj["name"], value_type=EnrichedValueType.decode(obj))
686+
return FieldSchema(
687+
name=obj["name"],
688+
value_type=EnrichedValueType.decode(obj),
689+
description=obj.get("description"),
690+
)
680691

681692
def encode(self) -> dict[str, Any]:
682693
result = self.value_type.encode()
683694
result["name"] = self.name
695+
if self.description is not None:
696+
result["description"] = self.description
684697
return result
685698

686699

src/base/json_schema.rs

Lines changed: 111 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,31 @@ impl JsonSchemaBuilder {
4545
if self.options.extract_descriptions {
4646
let mut fields: Vec<_> = field_path.iter().map(|f| f.as_str()).collect();
4747
fields.reverse();
48-
self.extra_instructions_per_field
49-
.insert(fields.join("."), description.to_string());
48+
let field_path_str = fields.join(".");
49+
50+
// Check if we already have a description for this field
51+
if let Some(existing_description) =
52+
self.extra_instructions_per_field.get(&field_path_str)
53+
{
54+
// Concatenate descriptions with newline separator
55+
let combined_description =
56+
format!("{}\n{}", existing_description, description.to_string());
57+
self.extra_instructions_per_field
58+
.insert(field_path_str, combined_description);
59+
} else {
60+
self.extra_instructions_per_field
61+
.insert(field_path_str, description.to_string());
62+
}
5063
} else {
51-
schema.metadata.get_or_insert_default().description = Some(description.to_string());
64+
let metadata = schema.metadata.get_or_insert_default();
65+
if let Some(existing_description) = &metadata.description {
66+
// Concatenate descriptions with newline separator
67+
let combined_description =
68+
format!("{}\n{}", existing_description, description.to_string());
69+
metadata.description = Some(combined_description);
70+
} else {
71+
metadata.description = Some(description.to_string());
72+
}
5273
}
5374
}
5475

@@ -219,6 +240,10 @@ impl JsonSchemaBuilder {
219240
*instance_type = SingleOrVec::Vec(types);
220241
}
221242
}
243+
// Set field description if available
244+
if let Some(description) = &f.description {
245+
self.set_description(&mut schema, description, field_path.prepend(&f.name));
246+
}
222247
(f.name.to_string(), schema.into())
223248
})
224249
.collect(),
@@ -330,6 +355,7 @@ pub fn build_json_schema(
330355
fields: Arc::new(vec![schema::FieldSchema {
331356
name: object_wrapper_field_name.clone(),
332357
value_type: value_type.clone(),
358+
description: None,
333359
}]),
334360
description: None,
335361
};
@@ -352,3 +378,85 @@ pub fn build_json_schema(
352378
},
353379
})
354380
}
381+
382+
#[cfg(test)]
383+
mod tests {
384+
use super::*;
385+
use crate::base::schema::{
386+
BasicValueType, EnrichedValueType, FieldSchema, StructSchema, ValueType,
387+
};
388+
use std::sync::Arc;
389+
390+
#[test]
391+
fn test_description_concatenation() {
392+
// Create a struct with a field that has both field-level and type-level descriptions
393+
let struct_schema = StructSchema {
394+
description: Some(Arc::from("Test struct description")),
395+
fields: Arc::new(vec![FieldSchema {
396+
name: "uuid_field".to_string(),
397+
value_type: EnrichedValueType {
398+
typ: ValueType::Basic(BasicValueType::Uuid),
399+
nullable: false,
400+
attrs: Default::default(),
401+
},
402+
description: Some(Arc::from("This is a field-level description for UUID")),
403+
}]),
404+
};
405+
406+
let enriched_value_type = EnrichedValueType {
407+
typ: ValueType::Struct(struct_schema),
408+
nullable: false,
409+
attrs: Default::default(),
410+
};
411+
412+
let options = ToJsonSchemaOptions {
413+
fields_always_required: false,
414+
supports_format: true,
415+
extract_descriptions: false, // We want to see the description in the schema
416+
top_level_must_be_object: false,
417+
};
418+
419+
let result = build_json_schema(enriched_value_type, options).unwrap();
420+
421+
// Check if the description contains both field and type descriptions
422+
if let Some(properties) = &result.schema.object {
423+
if let Some(uuid_field_schema) = properties.properties.get("uuid_field") {
424+
if let Schema::Object(schema_object) = uuid_field_schema {
425+
if let Some(description) = &schema_object
426+
.metadata
427+
.as_ref()
428+
.and_then(|m| m.description.as_ref())
429+
{
430+
// Check if both descriptions are present
431+
assert!(
432+
description.contains("This is a field-level description for UUID"),
433+
"Field-level description not found in: {}",
434+
description
435+
);
436+
assert!(
437+
description
438+
.contains("A UUID, e.g. 123e4567-e89b-12d3-a456-426614174000"),
439+
"Type-level description not found in: {}",
440+
description
441+
);
442+
443+
// Check that they are separated by a newline
444+
assert!(
445+
description.contains("\n"),
446+
"Descriptions should be separated by newline: {}",
447+
description
448+
);
449+
} else {
450+
panic!("No description found in the schema");
451+
}
452+
} else {
453+
panic!("uuid_field schema is not a SchemaObject");
454+
}
455+
} else {
456+
panic!("uuid_field not found in properties");
457+
}
458+
} else {
459+
panic!("No object properties found in schema");
460+
}
461+
}
462+
}

src/base/schema.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,20 +325,38 @@ pub struct FieldSchema<DataType = ValueType> {
325325

326326
#[serde(flatten)]
327327
pub value_type: EnrichedValueType<DataType>,
328+
329+
/// Optional description for the field.
330+
#[serde(default, skip_serializing_if = "Option::is_none")]
331+
pub description: Option<Arc<str>>,
328332
}
329333

330334
impl FieldSchema {
331335
pub fn new(name: impl ToString, value_type: EnrichedValueType) -> Self {
332336
Self {
333337
name: name.to_string(),
334338
value_type,
339+
description: None,
340+
}
341+
}
342+
343+
pub fn new_with_description(
344+
name: impl ToString,
345+
value_type: EnrichedValueType,
346+
description: Option<impl ToString>,
347+
) -> Self {
348+
Self {
349+
name: name.to_string(),
350+
value_type,
351+
description: description.map(|d| d.to_string().into()),
335352
}
336353
}
337354

338355
pub fn without_attrs(&self) -> Self {
339356
Self {
340357
name: self.name.clone(),
341358
value_type: self.value_type.without_attrs(),
359+
description: None,
342360
}
343361
}
344362
}
@@ -351,6 +369,7 @@ impl<DataType> FieldSchema<DataType> {
351369
Ok(Self {
352370
name: field.name.clone(),
353371
value_type: EnrichedValueType::from_alternative(&field.value_type)?,
372+
description: field.description.clone(),
354373
})
355374
}
356375
}

src/builder/analyzer.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,7 @@ fn try_merge_fields_schemas(
229229
result_fields.push(FieldSchema {
230230
name: field1.name.clone(),
231231
value_type: try_make_common_value_type(&field1.value_type, &field2.value_type)?,
232+
description: None,
232233
});
233234
}
234235
Ok(result_fields)
@@ -316,6 +317,7 @@ impl DataScopeBuilder {
316317
let field_index = self.data.add_field(FieldSchema {
317318
name,
318319
value_type: EnrichedValueType::from_alternative(value_type)?,
320+
description: None,
319321
})?;
320322
Ok(AnalyzedOpOutput {
321323
field_idx: field_index,
@@ -567,6 +569,7 @@ fn analyze_struct_mapping(
567569
field_schemas.push(FieldSchema {
568570
name: field.name.clone(),
569571
value_type,
572+
description: None,
570573
});
571574
}
572575
Ok((

src/builder/flow_builder.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -364,8 +364,11 @@ impl FlowBuilder {
364364
.into_py_result()?;
365365
}
366366
let result = Self::last_field_to_data_slice(&self.root_op_scope).into_py_result()?;
367-
self.direct_input_fields
368-
.push(FieldSchema { name, value_type });
367+
self.direct_input_fields.push(FieldSchema {
368+
name,
369+
value_type,
370+
description: None,
371+
});
369372
Ok(result)
370373
}
371374

@@ -527,6 +530,7 @@ impl FlowBuilder {
527530
Ok(FieldSchema {
528531
name,
529532
value_type: ds.value_type()?,
533+
description: None,
530534
})
531535
})
532536
.collect::<Result<Vec<FieldSchema>>>()

0 commit comments

Comments
 (0)