Skip to content

Commit 9df543d

Browse files
committed
Merge branch 'main' into examples
2 parents c13f000 + 253d512 commit 9df543d

File tree

4 files changed

+23
-37
lines changed

4 files changed

+23
-37
lines changed

examples/manuals_llm_extraction/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ name = "manuals-llm-extraction"
33
version = "0.1.0"
44
description = "Simple example for cocoindex: extract structured information from a Markdown file using LLM."
55
requires-python = ">=3.11"
6-
dependencies = ["cocoindex>=0.1.79", "marker-pdf>=1.5.2"]
6+
dependencies = ["cocoindex>=0.1.79", "marker-pdf>=1.8.5"]
77

88
[tool.setuptools]
99
packages = []

examples/paper_metadata/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ requires-python = ">=3.11"
66
dependencies = [
77
"cocoindex[embeddings]>=0.1.83",
88
"pypdf>=5.7.0",
9-
"marker-pdf>=1.5.2",
9+
"marker-pdf>=1.8.5",
1010
]
1111

1212
[tool.setuptools]

examples/pdf_embedding/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ requires-python = ">=3.11"
66
dependencies = [
77
"cocoindex[embeddings]>=0.1.79",
88
"python-dotenv>=1.0.1",
9-
"marker-pdf>=1.5.2",
9+
"marker-pdf>=1.8.5",
1010
"psycopg[binary,pool]",
1111
"jinja2>=3.1.6",
1212
]

python/cocoindex/convert.py

Lines changed: 20 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
encode_enriched_type,
2929
is_namedtuple_type,
3030
is_numpy_number_type,
31-
is_struct_type,
3231
)
3332

3433

@@ -88,38 +87,25 @@ def encode_struct_list(value: Any) -> Any:
8887

8988
return encode_struct_list
9089

91-
if isinstance(variant, AnalyzedDictType):
92-
if not variant.value_type:
93-
return lambda value: value
90+
# Otherwise it's a vector, falling into basic type in the engine.
9491

92+
if isinstance(variant, AnalyzedDictType):
9593
value_type_info = analyze_type_info(variant.value_type)
96-
if isinstance(value_type_info.variant, AnalyzedStructType):
97-
98-
def encode_struct_dict(value: Any) -> Any:
99-
if not isinstance(value, dict):
100-
return value
101-
if not value:
102-
return []
103-
104-
sample_key, sample_val = next(iter(value.items()))
105-
key_type, val_type = type(sample_key), type(sample_val)
106-
107-
# Handle KTable case
108-
if value and is_struct_type(val_type):
109-
key_encoder = (
110-
make_engine_value_encoder(analyze_type_info(key_type))
111-
if is_struct_type(key_type)
112-
else make_engine_value_encoder(ANY_TYPE_INFO)
113-
)
114-
value_encoder = make_engine_value_encoder(
115-
analyze_type_info(val_type)
116-
)
117-
return [
118-
[key_encoder(k)] + value_encoder(v) for k, v in value.items()
119-
]
120-
return {key_encoder(k): value_encoder(v) for k, v in value.items()}
94+
if not isinstance(value_type_info.variant, AnalyzedStructType):
95+
raise ValueError(
96+
f"Value type for dict is required to be a struct (e.g. dataclass or NamedTuple), got {variant.value_type}. "
97+
f"If you want a free-formed dict, use `cocoindex.Json` instead."
98+
)
12199

122-
return encode_struct_dict
100+
key_encoder = make_engine_value_encoder(analyze_type_info(variant.key_type))
101+
value_encoder = make_engine_value_encoder(analyze_type_info(variant.value_type))
102+
103+
def encode_struct_dict(value: Any) -> Any:
104+
if not value:
105+
return []
106+
return [[key_encoder(k)] + value_encoder(v) for k, v in value.items()]
107+
108+
return encode_struct_dict
123109

124110
if isinstance(variant, AnalyzedStructType):
125111
struct_type = variant.struct_type
@@ -132,8 +118,8 @@ def encode_struct_dict(value: Any) -> Any:
132118
field_names = [f.name for f in fields]
133119

134120
def encode_dataclass(value: Any) -> Any:
135-
if not dataclasses.is_dataclass(value):
136-
return value
121+
if value is None:
122+
return None
137123
return [
138124
encoder(getattr(value, name))
139125
for encoder, name in zip(field_encoders, field_names)
@@ -154,8 +140,8 @@ def encode_dataclass(value: Any) -> Any:
154140
]
155141

156142
def encode_namedtuple(value: Any) -> Any:
157-
if not is_namedtuple_type(type(value)):
158-
return value
143+
if value is None:
144+
return None
159145
return [
160146
encoder(getattr(value, name))
161147
for encoder, name in zip(field_encoders, field_names)

0 commit comments

Comments
 (0)