@@ -70,21 +70,6 @@ def pdf_to_markdown(content: bytes) -> str:
7070 return text
7171
7272
73- @cocoindex .transform_flow ()
74- def text_to_embedding (
75- text : cocoindex .DataSlice [str ],
76- ) -> cocoindex .DataSlice [list [float ]]:
77- """
78- Embed the text using a SentenceTransformer model.
79- This is a shared logic between indexing and querying, so extract it as a function.
80- """
81- return text .transform (
82- cocoindex .functions .SentenceTransformerEmbed (
83- model = "sentence-transformers/all-MiniLM-L6-v2"
84- )
85- )
86-
87-
8873@cocoindex .flow_def (name = "PaperMetadata" )
8974def paper_metadata_flow (
9075 flow_builder : cocoindex .FlowBuilder , data_scope : cocoindex .DataScope
@@ -115,7 +100,11 @@ def paper_metadata_flow(
115100 instruction = "Please extract the metadata from the first page of the paper." ,
116101 )
117102 )
118- doc ["title_embedding" ] = text_to_embedding (doc ["metadata" ]["title" ])
103+ doc ["title_embedding" ] = doc ["metadata" ]["title" ].transform (
104+ cocoindex .functions .SentenceTransformerEmbed (
105+ model = "sentence-transformers/all-MiniLM-L6-v2"
106+ )
107+ )
119108 doc ["abstract_chunks" ] = doc ["metadata" ]["abstract" ].transform (
120109 cocoindex .functions .SplitRecursively (
121110 custom_languages = [
@@ -152,7 +141,11 @@ def paper_metadata_flow(
152141 )
153142
154143 with doc ["abstract_chunks" ].row () as chunk :
155- chunk ["embedding" ] = text_to_embedding (chunk ["text" ])
144+ chunk ["embedding" ] = chunk ["text" ].transform (
145+ cocoindex .functions .SentenceTransformerEmbed (
146+ model = "sentence-transformers/all-MiniLM-L6-v2"
147+ )
148+ )
156149 metadata_embeddings .collect (
157150 id = cocoindex .GeneratedField .UUID ,
158151 filename = doc ["filename" ],
0 commit comments