Skip to content

Commit 7eb00ba

Browse files
authored
example(paper-metadata): rearrange code ordering to make it clearer (#715)
1 parent 9f3c91e commit 7eb00ba

File tree

1 file changed: +25 -20 lines changed

1 file changed: +25 -20 lines changed

examples/paper_metadata/main.py

Lines changed: 25 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -83,10 +83,11 @@ def paper_metadata_flow(
8383
)
8484

8585
paper_metadata = data_scope.add_collector()
86-
metadata_embeddings = data_scope.add_collector()
8786
author_papers = data_scope.add_collector()
87+
metadata_embeddings = data_scope.add_collector()
8888

8989
with data_scope["documents"].row() as doc:
90+
# Extract metadata
9091
doc["basic_info"] = doc["content"].transform(extract_basic_info)
9192
doc["first_page_md"] = doc["basic_info"]["first_page"].transform(
9293
pdf_to_markdown
@@ -100,6 +101,24 @@ def paper_metadata_flow(
100101
instruction="Please extract the metadata from the first page of the paper.",
101102
)
102103
)
104+
105+
# Collect metadata
106+
paper_metadata.collect(
107+
filename=doc["filename"],
108+
title=doc["metadata"]["title"],
109+
authors=doc["metadata"]["authors"],
110+
abstract=doc["metadata"]["abstract"],
111+
num_pages=doc["basic_info"]["num_pages"],
112+
)
113+
114+
# Collect author to filename mapping
115+
with doc["metadata"]["authors"].row() as author:
116+
author_papers.collect(
117+
author_name=author["name"],
118+
filename=doc["filename"],
119+
)
120+
121+
# Embed title and abstract, and collect embeddings
103122
doc["title_embedding"] = doc["metadata"]["title"].transform(
104123
cocoindex.functions.SentenceTransformerEmbed(
105124
model="sentence-transformers/all-MiniLM-L6-v2"
@@ -119,27 +138,13 @@ def paper_metadata_flow(
119138
min_chunk_size=200,
120139
chunk_overlap=150,
121140
)
122-
123-
paper_metadata.collect(
124-
filename=doc["filename"],
125-
title=doc["metadata"]["title"],
126-
authors=doc["metadata"]["authors"],
127-
abstract=doc["metadata"]["abstract"],
128-
num_pages=doc["basic_info"]["num_pages"],
129-
)
130141
metadata_embeddings.collect(
131142
id=cocoindex.GeneratedField.UUID,
132143
filename=doc["filename"],
133144
location="title",
134145
text=doc["metadata"]["title"],
135146
embedding=doc["title_embedding"],
136147
)
137-
with doc["metadata"]["authors"].row() as author:
138-
author_papers.collect(
139-
author_name=author["name"],
140-
filename=doc["filename"],
141-
)
142-
143148
with doc["abstract_chunks"].row() as chunk:
144149
chunk["embedding"] = chunk["text"].transform(
145150
cocoindex.functions.SentenceTransformerEmbed(
@@ -159,6 +164,11 @@ def paper_metadata_flow(
159164
cocoindex.targets.Postgres(),
160165
primary_key_fields=["filename"],
161166
)
167+
author_papers.export(
168+
"author_papers",
169+
cocoindex.targets.Postgres(),
170+
primary_key_fields=["author_name", "filename"],
171+
)
162172
metadata_embeddings.export(
163173
"metadata_embeddings",
164174
cocoindex.targets.Postgres(),
@@ -170,8 +180,3 @@ def paper_metadata_flow(
170180
)
171181
],
172182
)
173-
author_papers.export(
174-
"author_papers",
175-
cocoindex.targets.Postgres(),
176-
primary_key_fields=["author_name", "filename"],
177-
)

0 commit comments

Comments (0)