Skip to content

Commit 15ef7f8

Browse files
authored
Add a ParseJson function to parse text into JSON. (#327)
1 parent bb9c14c commit 15ef7f8

File tree

4 files changed

+109
-0
lines changed

4 files changed

+109
-0
lines changed

python/cocoindex/functions.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
from .typing import Float32, Vector, TypeAttr
66
from . import op, llm
77

8+
class ParseJson(op.FunctionSpec):
    """Function spec for parsing a text into a JSON value; declares no fields of its own."""
10+
811
class SplitRecursively(op.FunctionSpec):
912
"""Split a document (in string) recursively."""
1013

src/ops/functions/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
pub mod extract_by_llm;
2+
pub mod parse_json;
23
pub mod split_recursively;

src/ops/functions/parse_json.rs

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
use crate::ops::sdk::*;
2+
use anyhow::Result;
3+
use std::collections::HashMap;
4+
use std::sync::{Arc, LazyLock};
5+
use unicase::UniCase;
6+
7+
pub struct Args {
8+
text: ResolvedOpArg,
9+
language: Option<ResolvedOpArg>,
10+
}
11+
12+
type ParseFn = fn(&str) -> Result<serde_json::Value>;
13+
struct LanguageConfig {
14+
parse_fn: ParseFn,
15+
}
16+
17+
fn add_language<'a>(
18+
output: &'a mut HashMap<UniCase<&'static str>, Arc<LanguageConfig>>,
19+
name: &'static str,
20+
aliases: impl IntoIterator<Item = &'static str>,
21+
parse_fn: ParseFn,
22+
) {
23+
let lang_config = Arc::new(LanguageConfig { parse_fn });
24+
for name in std::iter::once(name).chain(aliases.into_iter()) {
25+
if output.insert(name.into(), lang_config.clone()).is_some() {
26+
panic!("Language `{name}` already exists");
27+
}
28+
}
29+
}
30+
31+
/// Parses `text` as JSON and returns the decoded value.
///
/// # Errors
///
/// Returns the `serde_json` error (converted by `?`) when `text` fails to parse.
fn parse_json(text: &str) -> Result<serde_json::Value> {
    let parsed: serde_json::Value = serde_json::from_str(text)?;
    Ok(parsed)
}
34+
35+
/// Lazily initialised table mapping a language name or alias to its `LanguageConfig`.
///
/// Keys are wrapped in `UniCase`, so lookups do not depend on letter case.
static PARSE_FN_BY_LANG: LazyLock<HashMap<UniCase<&'static str>, Arc<LanguageConfig>>> =
    LazyLock::new(|| {
        let mut languages = HashMap::new();
        // JSON is reachable both as "json" and through its ".json" alias.
        add_language(&mut languages, "json", [".json"], parse_json);
        languages
    });
41+
42+
struct Executor {
43+
args: Args,
44+
}
45+
46+
#[async_trait]
47+
impl SimpleFunctionExecutor for Executor {
48+
async fn evaluate(&self, input: Vec<value::Value>) -> Result<value::Value> {
49+
let text = self.args.text.value(&input)?.as_str()?;
50+
let lang_config = {
51+
let language = self.args.language.value(&input)?;
52+
language
53+
.optional()
54+
.map(|v| anyhow::Ok(v.as_str()?.as_ref()))
55+
.transpose()?
56+
.and_then(|lang| PARSE_FN_BY_LANG.get(&UniCase::new(lang)))
57+
};
58+
let parse_fn = lang_config.map(|c| c.parse_fn).unwrap_or(parse_json);
59+
let parsed_value = parse_fn(text)?;
60+
Ok(value::Value::Basic(value::BasicValue::Json(Arc::new(
61+
parsed_value,
62+
))))
63+
}
64+
}
65+
66+
pub struct Factory;
67+
68+
#[async_trait]
69+
impl SimpleFunctionFactoryBase for Factory {
70+
type Spec = EmptySpec;
71+
type ResolvedArgs = Args;
72+
73+
fn name(&self) -> &str {
74+
"ParseJson"
75+
}
76+
77+
fn resolve_schema(
78+
&self,
79+
_spec: &EmptySpec,
80+
args_resolver: &mut OpArgsResolver<'_>,
81+
_context: &FlowInstanceContext,
82+
) -> Result<(Args, EnrichedValueType)> {
83+
let args = Args {
84+
text: args_resolver
85+
.next_arg("text")?
86+
.expect_type(&ValueType::Basic(BasicValueType::Str))?,
87+
language: args_resolver
88+
.next_optional_arg("language")?
89+
.expect_type(&ValueType::Basic(BasicValueType::Str))?,
90+
};
91+
92+
let output_schema = make_output_type(BasicValueType::Json);
93+
Ok((args, output_schema))
94+
}
95+
96+
async fn build_executor(
97+
self: Arc<Self>,
98+
_spec: EmptySpec,
99+
args: Args,
100+
_context: Arc<FlowInstanceContext>,
101+
) -> Result<Box<dyn SimpleFunctionExecutor>> {
102+
Ok(Box::new(Executor { args }))
103+
}
104+
}

src/ops/registration.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ fn register_executor_factories(registry: &mut ExecutorFactoryRegistry) -> Result
99
sources::local_file::Factory.register(registry)?;
1010
sources::google_drive::Factory.register(registry)?;
1111

12+
functions::parse_json::Factory.register(registry)?;
1213
functions::split_recursively::Factory.register(registry)?;
1314
functions::extract_by_llm::Factory.register(registry)?;
1415

0 commit comments

Comments
 (0)