Skip to content

Commit 9f3c91e

Browse files
authored
examples: simplify the paper_metadata example: inline embedding call (#714)
1 parent 9350045 commit 9f3c91e

File tree

1 file changed

+10
-17
lines changed

1 file changed

+10
-17
lines changed

examples/paper_metadata/main.py

Lines changed: 10 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -70,21 +70,6 @@ def pdf_to_markdown(content: bytes) -> str:
7070
return text
7171

7272

73-
@cocoindex.transform_flow()
74-
def text_to_embedding(
75-
text: cocoindex.DataSlice[str],
76-
) -> cocoindex.DataSlice[list[float]]:
77-
"""
78-
Embed the text using a SentenceTransformer model.
79-
This is shared logic between indexing and querying, so extract it as a function.
80-
"""
81-
return text.transform(
82-
cocoindex.functions.SentenceTransformerEmbed(
83-
model="sentence-transformers/all-MiniLM-L6-v2"
84-
)
85-
)
86-
87-
8873
@cocoindex.flow_def(name="PaperMetadata")
8974
def paper_metadata_flow(
9075
flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
@@ -115,7 +100,11 @@ def paper_metadata_flow(
115100
instruction="Please extract the metadata from the first page of the paper.",
116101
)
117102
)
118-
doc["title_embedding"] = text_to_embedding(doc["metadata"]["title"])
103+
doc["title_embedding"] = doc["metadata"]["title"].transform(
104+
cocoindex.functions.SentenceTransformerEmbed(
105+
model="sentence-transformers/all-MiniLM-L6-v2"
106+
)
107+
)
119108
doc["abstract_chunks"] = doc["metadata"]["abstract"].transform(
120109
cocoindex.functions.SplitRecursively(
121110
custom_languages=[
@@ -152,7 +141,11 @@ def paper_metadata_flow(
152141
)
153142

154143
with doc["abstract_chunks"].row() as chunk:
155-
chunk["embedding"] = text_to_embedding(chunk["text"])
144+
chunk["embedding"] = chunk["text"].transform(
145+
cocoindex.functions.SentenceTransformerEmbed(
146+
model="sentence-transformers/all-MiniLM-L6-v2"
147+
)
148+
)
156149
metadata_embeddings.collect(
157150
id=cocoindex.GeneratedField.UUID,
158151
filename=doc["filename"],

0 commit comments

Comments
 (0)