Skip to content

Commit 9bcd487

Browse files
authored
Add attributes to annotate filename / mime type fields for content (#203)
1 parent 90bb0e9 commit 9bcd487

File tree

6 files changed

+160
-51
lines changed

6 files changed

+160
-51
lines changed

src/base/field_attrs.rs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,16 @@ use const_format::concatcp;
22

33
pub static COCOINDEX_PREFIX: &str = "cocoindex.io/";
44

5-
/// Expected mime types for bytes and str.
6-
pub static _MIME_TYPE: &str = concatcp!(COCOINDEX_PREFIX, "mime_type");
5+
/// Present for bytes and str. It points to fields that represents the original file name for the data.
6+
/// Type: AnalyzedValueMapping
7+
pub static CONTENT_FILENAME: &str = concatcp!(COCOINDEX_PREFIX, "content_filename");
78

8-
/// Base text for chunks.
9+
/// Present for bytes and str. It points to fields that represents mime types for the data.
10+
/// Type: AnalyzedValueMapping
11+
pub static CONTENT_MIME_TYPE: &str = concatcp!(COCOINDEX_PREFIX, "content_mime_type");
12+
13+
/// Present for chunks. It points to fields that the chunks are for.
14+
/// Type: AnalyzedValueMapping
915
pub static CHUNK_BASE_TEXT: &str = concatcp!(COCOINDEX_PREFIX, "chunk_base_text");
1016

1117
/// Base text for an embedding vector.

src/base/schema.rs

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ impl std::fmt::Display for BasicValueType {
6565
}
6666
}
6767

68-
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
68+
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
6969
pub struct StructSchema {
7070
pub fields: Arc<Vec<FieldSchema>>,
7171

@@ -172,17 +172,10 @@ impl std::fmt::Display for CollectionSchema {
172172
}
173173

174174
impl CollectionSchema {
175-
pub fn new(
176-
kind: CollectionKind,
177-
fields: Vec<FieldSchema>,
178-
description: Option<Arc<str>>,
179-
) -> Self {
175+
pub fn new(kind: CollectionKind, row: StructSchema) -> Self {
180176
Self {
181177
kind,
182-
row: StructSchema {
183-
fields: Arc::new(fields),
184-
description,
185-
},
178+
row,
186179
collectors: Default::default(),
187180
}
188181
}

src/ops/functions/split_recursively.rs

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -585,18 +585,23 @@ impl SimpleFunctionFactoryBase for Factory {
585585
.next_optional_arg("language")?
586586
.expect_type(&ValueType::Basic(BasicValueType::Str))?,
587587
};
588-
let output_schema = make_output_type(CollectionSchema::new(
589-
CollectionKind::Table,
590-
vec![
591-
FieldSchema::new("location", make_output_type(BasicValueType::Range)),
592-
FieldSchema::new("text", make_output_type(BasicValueType::Str)),
593-
],
594-
None,
595-
))
596-
.with_attr(
597-
field_attrs::CHUNK_BASE_TEXT,
598-
serde_json::to_value(args_resolver.get_analyze_value(&args.text))?,
599-
);
588+
589+
let mut struct_schema = StructSchema::default();
590+
let mut schema_builder = StructSchemaBuilder::new(&mut struct_schema);
591+
schema_builder.add_field(FieldSchema::new(
592+
"location",
593+
make_output_type(BasicValueType::Range),
594+
));
595+
schema_builder.add_field(FieldSchema::new(
596+
"text",
597+
make_output_type(BasicValueType::Str),
598+
));
599+
let output_schema =
600+
make_output_type(CollectionSchema::new(CollectionKind::Table, struct_schema))
601+
.with_attr(
602+
field_attrs::CHUNK_BASE_TEXT,
603+
serde_json::to_value(args_resolver.get_analyze_value(&args.text))?,
604+
);
600605
Ok((args, output_schema))
601606
}
602607

src/ops/sdk.rs

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
1+
use crate::builder::plan::AnalyzedFieldReference;
2+
use crate::builder::plan::AnalyzedLocalFieldReference;
3+
use std::collections::BTreeMap;
4+
use std::sync::Arc;
5+
16
pub use super::factory_bases::*;
27
pub use super::interface::*;
38
pub use crate::base::schema::*;
9+
pub use crate::base::spec::*;
410
pub use crate::base::value::*;
511
pub use anyhow::Result;
612
pub use axum::async_trait;
@@ -46,3 +52,75 @@ macro_rules! fields_value {
4652
$crate::base::value::FieldValues { fields: std::vec![ $(($field).into()),+ ] }
4753
};
4854
}
55+
56+
pub struct SchemaBuilderFieldRef(AnalyzedLocalFieldReference);
57+
58+
impl SchemaBuilderFieldRef {
59+
pub fn to_field_ref(&self) -> AnalyzedFieldReference {
60+
AnalyzedFieldReference {
61+
local: self.0.clone(),
62+
scope_up_level: 0,
63+
}
64+
}
65+
}
66+
pub struct StructSchemaBuilder<'a> {
67+
base_fields_idx: Vec<u32>,
68+
target: &'a mut StructSchema,
69+
}
70+
71+
impl<'a> StructSchemaBuilder<'a> {
72+
pub fn new(target: &'a mut StructSchema) -> Self {
73+
Self {
74+
base_fields_idx: Vec::new(),
75+
target,
76+
}
77+
}
78+
79+
pub fn set_description(&mut self, description: impl Into<Arc<str>>) {
80+
self.target.description = Some(description.into());
81+
}
82+
83+
pub fn add_field(&mut self, field_schema: FieldSchema) -> SchemaBuilderFieldRef {
84+
let current_idx = self.target.fields.len() as u32;
85+
Arc::make_mut(&mut self.target.fields).push(field_schema);
86+
let mut fields_idx = self.base_fields_idx.clone();
87+
fields_idx.push(current_idx);
88+
SchemaBuilderFieldRef(AnalyzedLocalFieldReference { fields_idx })
89+
}
90+
91+
pub fn add_struct_field<'b>(
92+
&'b mut self,
93+
name: impl Into<FieldName>,
94+
nullable: bool,
95+
attrs: Arc<BTreeMap<String, serde_json::Value>>,
96+
) -> (StructSchemaBuilder<'b>, SchemaBuilderFieldRef) {
97+
let field_schema = FieldSchema::new(
98+
name.into(),
99+
EnrichedValueType {
100+
typ: ValueType::Struct(StructSchema {
101+
fields: Arc::new(Vec::new()),
102+
description: None,
103+
}),
104+
nullable,
105+
attrs,
106+
},
107+
);
108+
let local_ref = self.add_field(field_schema);
109+
let struct_schema = match &mut Arc::make_mut(&mut self.target.fields)
110+
.last_mut()
111+
.unwrap()
112+
.value_type
113+
.typ
114+
{
115+
ValueType::Struct(s) => s,
116+
_ => unreachable!(),
117+
};
118+
(
119+
StructSchemaBuilder {
120+
base_fields_idx: local_ref.0.fields_idx.clone(),
121+
target: struct_schema,
122+
},
123+
local_ref,
124+
)
125+
}
126+
}

src/ops/sources/google_drive.rs

Lines changed: 32 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ use hyper_util::client::legacy::connect::HttpConnector;
1414
use indexmap::IndexSet;
1515
use log::warn;
1616

17+
use crate::base::field_attrs;
1718
use crate::ops::sdk::*;
1819

1920
struct ExportMimeType {
@@ -277,22 +278,39 @@ impl SourceFactoryBase for Factory {
277278
spec: &Spec,
278279
_context: &FlowInstanceContext,
279280
) -> Result<EnrichedValueType> {
281+
let mut struct_schema = StructSchema::default();
282+
let mut schema_builder = StructSchemaBuilder::new(&mut struct_schema);
283+
schema_builder.add_field(FieldSchema::new(
284+
"file_id",
285+
make_output_type(BasicValueType::Str),
286+
));
287+
let filename_field = schema_builder.add_field(FieldSchema::new(
288+
"filename",
289+
make_output_type(BasicValueType::Str),
290+
));
291+
let mime_type_field = schema_builder.add_field(FieldSchema::new(
292+
"mime_type",
293+
make_output_type(BasicValueType::Str),
294+
));
295+
schema_builder.add_field(FieldSchema::new(
296+
"content",
297+
make_output_type(if spec.binary {
298+
BasicValueType::Bytes
299+
} else {
300+
BasicValueType::Str
301+
})
302+
.with_attr(
303+
field_attrs::CONTENT_FILENAME,
304+
serde_json::to_value(filename_field.to_field_ref())?,
305+
)
306+
.with_attr(
307+
field_attrs::CONTENT_MIME_TYPE,
308+
serde_json::to_value(mime_type_field.to_field_ref())?,
309+
),
310+
));
280311
Ok(make_output_type(CollectionSchema::new(
281312
CollectionKind::Table,
282-
vec![
283-
FieldSchema::new("file_id", make_output_type(BasicValueType::Str)),
284-
FieldSchema::new("filename", make_output_type(BasicValueType::Str)),
285-
FieldSchema::new("mime_type", make_output_type(BasicValueType::Str)),
286-
FieldSchema::new(
287-
"content",
288-
make_output_type(if spec.binary {
289-
BasicValueType::Bytes
290-
} else {
291-
BasicValueType::Str
292-
}),
293-
),
294-
],
295-
None,
313+
struct_schema,
296314
)))
297315
}
298316

src/ops/sources/local_file.rs

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ use globset::{Glob, GlobSet, GlobSetBuilder};
22
use log::warn;
33
use std::{path::PathBuf, sync::Arc};
44

5+
use crate::base::field_attrs;
56
use crate::{fields_value, ops::sdk::*};
67

78
#[derive(Debug, Deserialize)]
@@ -99,20 +100,28 @@ impl SourceFactoryBase for Factory {
99100
spec: &Spec,
100101
_context: &FlowInstanceContext,
101102
) -> Result<EnrichedValueType> {
103+
let mut struct_schema = StructSchema::default();
104+
let mut schema_builder = StructSchemaBuilder::new(&mut struct_schema);
105+
let filename_field = schema_builder.add_field(FieldSchema::new(
106+
"filename",
107+
make_output_type(BasicValueType::Str),
108+
));
109+
schema_builder.add_field(FieldSchema::new(
110+
"content",
111+
make_output_type(if spec.binary {
112+
BasicValueType::Bytes
113+
} else {
114+
BasicValueType::Str
115+
})
116+
.with_attr(
117+
field_attrs::CONTENT_FILENAME,
118+
serde_json::to_value(filename_field.to_field_ref())?,
119+
),
120+
));
121+
102122
Ok(make_output_type(CollectionSchema::new(
103123
CollectionKind::Table,
104-
vec![
105-
FieldSchema::new("filename", make_output_type(BasicValueType::Str)),
106-
FieldSchema::new(
107-
"content",
108-
make_output_type(if spec.binary {
109-
BasicValueType::Bytes
110-
} else {
111-
BasicValueType::Str
112-
}),
113-
),
114-
],
115-
None,
124+
struct_schema,
116125
)))
117126
}
118127

0 commit comments

Comments
 (0)