@@ -83,10 +83,11 @@ def paper_metadata_flow(
8383 )
8484
8585 paper_metadata = data_scope .add_collector ()
86- metadata_embeddings = data_scope .add_collector ()
8786 author_papers = data_scope .add_collector ()
87+ metadata_embeddings = data_scope .add_collector ()
8888
8989 with data_scope ["documents" ].row () as doc :
90+ # Extract metadata
9091 doc ["basic_info" ] = doc ["content" ].transform (extract_basic_info )
9192 doc ["first_page_md" ] = doc ["basic_info" ]["first_page" ].transform (
9293 pdf_to_markdown
@@ -100,6 +101,24 @@ def paper_metadata_flow(
100101 instruction = "Please extract the metadata from the first page of the paper." ,
101102 )
102103 )
104+
105+ # Collect metadata
106+ paper_metadata .collect (
107+ filename = doc ["filename" ],
108+ title = doc ["metadata" ]["title" ],
109+ authors = doc ["metadata" ]["authors" ],
110+ abstract = doc ["metadata" ]["abstract" ],
111+ num_pages = doc ["basic_info" ]["num_pages" ],
112+ )
113+
114+ # Collect author to filename mapping
115+ with doc ["metadata" ]["authors" ].row () as author :
116+ author_papers .collect (
117+ author_name = author ["name" ],
118+ filename = doc ["filename" ],
119+ )
120+
121+ # Embed title and abstract, and collect embeddings
103122 doc ["title_embedding" ] = doc ["metadata" ]["title" ].transform (
104123 cocoindex .functions .SentenceTransformerEmbed (
105124 model = "sentence-transformers/all-MiniLM-L6-v2"
@@ -119,27 +138,13 @@ def paper_metadata_flow(
119138 min_chunk_size = 200 ,
120139 chunk_overlap = 150 ,
121140 )
122-
123- paper_metadata .collect (
124- filename = doc ["filename" ],
125- title = doc ["metadata" ]["title" ],
126- authors = doc ["metadata" ]["authors" ],
127- abstract = doc ["metadata" ]["abstract" ],
128- num_pages = doc ["basic_info" ]["num_pages" ],
129- )
130141 metadata_embeddings .collect (
131142 id = cocoindex .GeneratedField .UUID ,
132143 filename = doc ["filename" ],
133144 location = "title" ,
134145 text = doc ["metadata" ]["title" ],
135146 embedding = doc ["title_embedding" ],
136147 )
137- with doc ["metadata" ]["authors" ].row () as author :
138- author_papers .collect (
139- author_name = author ["name" ],
140- filename = doc ["filename" ],
141- )
142-
143148 with doc ["abstract_chunks" ].row () as chunk :
144149 chunk ["embedding" ] = chunk ["text" ].transform (
145150 cocoindex .functions .SentenceTransformerEmbed (
@@ -159,6 +164,11 @@ def paper_metadata_flow(
159164 cocoindex .targets .Postgres (),
160165 primary_key_fields = ["filename" ],
161166 )
167+ author_papers .export (
168+ "author_papers" ,
169+ cocoindex .targets .Postgres (),
170+ primary_key_fields = ["author_name" , "filename" ],
171+ )
162172 metadata_embeddings .export (
163173 "metadata_embeddings" ,
164174 cocoindex .targets .Postgres (),
@@ -170,8 +180,3 @@ def paper_metadata_flow(
170180 )
171181 ],
172182 )
173- author_papers .export (
174- "author_papers" ,
175- cocoindex .targets .Postgres (),
176- primary_key_fields = ["author_name" , "filename" ],
177- )
0 commit comments