Skip to content

Commit b3b4632

Browse files
authored
Merge pull request #4 from hinxcode/preprint
New updates for the preprint
2 parents 5066094 + 7cbc48a commit b3b4632

File tree

18 files changed

+180
-173
lines changed

18 files changed

+180
-173
lines changed

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "digital-collections-explorer",
3-
"version": "1.0.0",
3+
"version": "1.2.0",
44
"description": "A web-based exploratory search system and multimodal viewer for digital collections",
55
"scripts": {
66
"setup": "node src/setup.js",

scripts/create_loc_assets.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
"""
2+
This script generates the essential assets (embedding index and metadata) required for Digital Collections Explorer
3+
by converting `beto_idx.pt` provided by Mahowald and Lee (https://zenodo.org/records/11538437) into `item_ids.pt` and `metadata.json`
4+
5+
The output assets can be placed directly in the `data/embeddings` folder, allowing the FastAPI server to access them
6+
in the same way as we do in our public demo at https://digital-collections-explorer.com/
7+
"""
8+
9+
import pandas as pd
10+
import torch
11+
import json
12+
import base64
13+
14+
ORIGINAL_INDEX_PATH = 'input/beto_idx.pt'
15+
CSV_PATH = 'input/merged_files.csv'
16+
FINAL_METADATA_PATH = 'output/metadata.json'
17+
FINAL_INDEX_PATH = 'output/item_ids.pt'
18+
19+
def _extract_iiif_id(url):
    """Return the 6th '/'-separated segment of ``url``, or None if unavailable.

    Both the index entries and the CSV ``file_url`` values are parsed with this
    helper so they agree on what counts as an unparsable URL. Non-string values
    (e.g. NaN from pandas) and URLs with fewer than six segments yield None
    instead of raising.
    """
    if not isinstance(url, str):
        return None
    segments = url.split('/')
    return segments[5] if len(segments) > 5 else None


def generate_assets(original_index_path=None, csv_path=None,
                    final_metadata_path=None, final_index_path=None):
    """Convert the original URL index into ``item_ids.pt`` and ``metadata.json``.

    Every entry of the original index produces exactly one Base64 key in the
    new index (same order), so the new index stays aligned with the original.
    Entries whose URL cannot be parsed get a placeholder key and an ``error``
    metadata record instead of being dropped.

    Args:
        original_index_path: Path of the source index loaded with ``torch.load``.
            Defaults to ``ORIGINAL_INDEX_PATH``.
        csv_path: CSV with ``p1_item_id`` and ``file_url`` columns.
            Defaults to ``CSV_PATH``.
        final_metadata_path: Destination of the JSON metadata.
            Defaults to ``FINAL_METADATA_PATH``.
        final_index_path: Destination of the new index (``torch.save``).
            Defaults to ``FINAL_INDEX_PATH``.

    Raises:
        AssertionError: If the new index length differs from the original.
    """
    import os

    # Resolve defaults at call time so callers may override any single path.
    if original_index_path is None:
        original_index_path = ORIGINAL_INDEX_PATH
    if csv_path is None:
        csv_path = CSV_PATH
    if final_metadata_path is None:
        final_metadata_path = FINAL_METADATA_PATH
    if final_index_path is None:
        final_index_path = FINAL_INDEX_PATH

    # --- 1. Load the original index file ---
    original_idx = torch.load(original_index_path)
    total_items = len(original_idx)
    print(f"Found {total_items} entries in the original index.")

    # --- 2. Build a lookup table from the CSV ---
    df = pd.read_csv(csv_path)
    df = df.dropna(subset=['p1_item_id', 'file_url'])
    # Malformed file_url values become None here and are dropped below,
    # rather than aborting the whole run with an IndexError.
    df = df.assign(iiif_id=df['file_url'].apply(_extract_iiif_id))
    df = df.dropna(subset=['iiif_id'])
    iiif_to_p1_lookup = pd.Series(df.p1_item_id.values, index=df.iiif_id).to_dict()

    # --- 3. Generate new index and metadata ---
    final_metadata = {}
    final_beto_idx = []

    for image_url in original_idx:
        # a. Extract iiif_id
        iiif_id = _extract_iiif_id(image_url)
        if iiif_id is None:
            # Keep a placeholder so positions stay aligned with the original index.
            b64_key = base64.urlsafe_b64encode(
                f"ERROR_PARSING_{len(final_beto_idx)}".encode('utf-8')
            ).decode('utf-8')
            final_beto_idx.append(b64_key)
            final_metadata[b64_key] = {'error': f'Could not parse iiif_id from URL: {image_url}'}
            continue

        # b. Generate Base64 key
        b64_key = base64.urlsafe_b64encode(iiif_id.encode('utf-8')).decode('utf-8')

        # c. Append key to the new index
        final_beto_idx.append(b64_key)

        # d. Find p1_item_id
        p1_item_id = iiif_to_p1_lookup.get(iiif_id, "p1_item_id_not_found")

        # e. Assemble the new metadata object
        url_base = f"https://tile.loc.gov/image-services/iiif/{iiif_id}"
        final_metadata[b64_key] = {
            'type': 'image',
            'iiif_id': iiif_id,
            'url': p1_item_id,
            'paths': {
                'original': f"{url_base}/full/pct:100/0/default.jpg",
                'processed': f"{url_base}/full/2000,/0/default.jpg",
                'thumbnail': f"{url_base}/full/400,/0/default.jpg"
            }
        }

    # --- 4. Validation and Final Save ---
    # Validate before writing so a broken run never leaves assets on disk.
    assert len(original_idx) == len(final_beto_idx), "CRITICAL: Final index length does not match original!"

    # Create the destination folders if they are missing.
    for out_path in (final_metadata_path, final_index_path):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    with open(final_metadata_path, 'w') as f:
        json.dump(final_metadata, f, indent=4)
    print(f"Successfully saved {final_metadata_path} with {len(final_metadata)} entries.")

    torch.save(final_beto_idx, final_index_path)
    print(f"Successfully saved {final_index_path} with {len(final_beto_idx)} entries.")
79+
if __name__ == '__main__':
80+
generate_assets()

src/frontend/documents/src/App.jsx

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,7 @@ const SearchResults = React.memo(({
4545

4646
return (
4747
<div className="welcome-message">
48-
<p>Enter a search term to explore documents.</p>
49-
<p>Try searching for subjects, time periods, locations, or visual elements.</p>
48+
<p>Enter a search term or upload a similar image to explore documents.</p>
5049
</div>
5150
);
5251
});

src/frontend/documents/src/components/PDFPreviewCard.jsx

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ const PDFPreviewCard = ({ metadata, src, alt, onClick }) => {
1212
effect="blur"
1313
src={src}
1414
width="100%"
15-
placeholderSrc='https://placehold.co/300x200'
1615
/>
1716
</div>
1817

src/frontend/documents/src/components/SearchBar.css

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -52,13 +52,6 @@
5252
font-size: 1.2rem;
5353
}
5454

55-
.search-suggestions {
56-
margin-top: 0.5rem;
57-
font-size: 0.85rem;
58-
color: #b0b0b0;
59-
font-style: italic;
60-
}
61-
6255
@media (max-width: 768px) {
6356
.search-text {
6457
display: none;

src/frontend/documents/src/components/SearchBar.jsx

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,6 @@ function SearchBar({ searchQuery, setSearchQuery, onSearch, inputRef }) {
2424
<span className="search-text">Search</span>
2525
</button>
2626
</form>
27-
<div className="search-suggestions">
28-
<p>Try: "city streets" • "rural landscapes" • "women in uniform" • "symbol of capitalization"</p>
29-
</div>
3027
</div>
3128
);
3229
}

src/frontend/maps/src/App.jsx

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,6 @@ function App() {
121121
<h1>Historical Maps Explorer</h1>
122122
<p>Explore maps using natural language search</p>
123123
</header>
124-
125124
<main className="App-main">
126125
<div className="search-controls">
127126
<SearchBar
@@ -136,11 +135,6 @@ function App() {
136135
onSearchByImage={handleSearchByImage}
137136
/>
138137
</div>
139-
{embeddingCount !== null && (
140-
<p className="embedding-count">
141-
Total number of maps in the collection: {embeddingCount.toLocaleString()}
142-
</p>
143-
)}
144138
<SearchResults
145139
items={maps}
146140
isLoading={isLoading}
@@ -150,8 +144,17 @@ function App() {
150144
setCurrentPage={setCurrentPage}
151145
hasMore={hasMore}
152146
/>
147+
{
148+
maps.length === 0 && embeddingCount !== null && (
149+
<div className="welcome-message">
150+
<p>
151+
Enter a search term or upload a similar image to discover matches
152+
from our collection of {embeddingCount.toLocaleString()} maps.
153+
</p>
154+
</div>
155+
)
156+
}
153157
</main>
154-
155158
<Lightbox
156159
isVisible={!!selectedMap}
157160
data={selectedMap}

src/frontend/maps/src/components/SearchBar.css

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -51,13 +51,6 @@
5151
font-size: 1.2rem;
5252
}
5353

54-
.search-suggestions {
55-
margin-top: 0.5rem;
56-
font-size: 0.85rem;
57-
color: #b0b0b0;
58-
font-style: italic;
59-
}
60-
6154
.search-mode-selector {
6255
display: flex;
6356
margin-bottom: 1rem;

src/frontend/maps/src/components/SearchResults.jsx

Lines changed: 21 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { Gallery } from 'react-grid-gallery';
33
import Pagination from './Pagination';
44
import './SearchResults.css';
55

6-
const SearchResults = React.memo(({
6+
const SearchResults = React.memo(({
77
items,
88
isLoading,
99
error,
@@ -27,32 +27,27 @@ const SearchResults = React.memo(({
2727
</div>
2828
);
2929
}
30-
31-
if (items.length > 0) {
32-
return (
33-
<div className="gallery-container">
34-
<Gallery
35-
images={items}
36-
enableImageSelection={false}
37-
onClick={(index) => onClick(items[index])}
38-
margin={2}
39-
rowHeight={180}
40-
targetRowHeight={200}
41-
containerWidth={window.innerWidth * 0.95}
42-
/>
43-
<Pagination
44-
currentPage={currentPage}
45-
setCurrentPage={setCurrentPage}
46-
hasMore={hasMore}
47-
isLoading={isLoading}
48-
/>
49-
</div>
50-
);
51-
}
52-
5330
return (
54-
<div className="welcome-message">
55-
<p>Enter a search term or upload a similar image to explore historical maps.</p>
31+
<div className="gallery-container">
32+
{
33+
items.length > 0 && (
34+
<>
35+
<Gallery
36+
images={items}
37+
enableImageSelection={false}
38+
onClick={(index) => onClick(items[index])}
39+
margin={0}
40+
rowHeight={220}
41+
/>
42+
<Pagination
43+
currentPage={currentPage}
44+
setCurrentPage={setCurrentPage}
45+
hasMore={hasMore}
46+
isLoading={isLoading}
47+
/>
48+
</>
49+
)
50+
}
5651
</div>
5752
);
5853
});

src/frontend/maps/src/services/api.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,6 @@ export const getEmbeddingStats = async () => {
118118
return data;
119119
} catch (error) {
120120
console.error('Error fetching embedding stats:', error);
121-
return { count: 0 };
121+
throw error;
122122
}
123123
};

0 commit comments

Comments
 (0)