"""
This script generates the essential assets (embedding index and metadata) required by the Digital Collections Explorer
by converting the `beto_idx.pt` file provided by Mahowald and Lee (https://zenodo.org/records/11538437) into
`item_ids.pt` and `metadata.json`.

The output assets can be placed directly in the `data/embeddings` folder, allowing the FastAPI server to access them
in the same way as we do in our public demo at https://digital-collections-explorer.com/.
"""

import pandas as pd
import torch
import json
import base64

ORIGINAL_INDEX_PATH = 'input/beto_idx.pt'
CSV_PATH = 'input/merged_files.csv'
FINAL_METADATA_PATH = 'output/metadata.json'
FINAL_INDEX_PATH = 'output/item_ids.pt'

def generate_assets():
    # --- 1. Load the original beto_idx.pt file ---
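    # The original index is an ordered list of image URLs (one entry per embedding).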
    original_idx = torch.load(ORIGINAL_INDEX_PATH)
    total_items = len(original_idx)
    print(f"Found {total_items} entries in the original index.")

    # --- 2. Build a lookup table from merged_files.csv ---
    df = pd.read_csv(CSV_PATH)
    df.dropna(subset=['p1_item_id', 'file_url'], inplace=True)
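    # file_url is expected to follow the tile.loc.gov IIIF pattern
    # (https://tile.loc.gov/image-services/iiif/<iiif_id>/...), so the identifier is the sixth path segment (index 5).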
    df['iiif_id'] = df['file_url'].apply(lambda url: url.split('/')[5] if isinstance(url, str) else None)
    df.dropna(subset=['iiif_id'], inplace=True)
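    # Map each iiif_id to its p1_item_id, which is stored under the 'url' field of the metadata below.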
    iiif_to_p1_lookup = pd.Series(df.p1_item_id.values, index=df.iiif_id).to_dict()

    # --- 3. Generate new index and metadata ---
    final_metadata = {}
    final_beto_idx = []

    for image_url in original_idx:
        # a. Extract iiif_id
        try:
            iiif_id = image_url.split('/')[5]
        except IndexError:
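            # Record a placeholder entry so the new index stays aligned one-to-one with the original embeddings.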
            b64_key = base64.urlsafe_b64encode(f"ERROR_PARSING_{len(final_beto_idx)}".encode('utf-8')).decode('utf-8')
            final_beto_idx.append(b64_key)
            final_metadata[b64_key] = {'error': f'Could not parse iiif_id from URL: {image_url}'}
            continue

        # b. Generate Base64 key
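        # The URL-safe Base64 alphabet keeps the key usable in file names and URL paths.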
        b64_key = base64.urlsafe_b64encode(iiif_id.encode('utf-8')).decode('utf-8')

        # c. Append key to the new index
        final_beto_idx.append(b64_key)

        # d. Find p1_item_id
        p1_item_id = iiif_to_p1_lookup.get(iiif_id, "p1_item_id_not_found")

        # e. Assemble the new metadata object
        url_base = f"https://tile.loc.gov/image-services/iiif/{iiif_id}"
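        # IIIF Image API size variants: full resolution, a 2000px-wide rendition, and a 400px-wide thumbnail.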
        paths = {
            'original': f"{url_base}/full/pct:100/0/default.jpg",
            'processed': f"{url_base}/full/2000,/0/default.jpg",
            'thumbnail': f"{url_base}/full/400,/0/default.jpg"
        }
        final_metadata[b64_key] = {
            'type': 'image',
            'iiif_id': iiif_id,
            'url': p1_item_id,
            'paths': paths
        }

    # --- 4. Final Save and Validation ---
    with open(FINAL_METADATA_PATH, 'w') as f:
        json.dump(final_metadata, f, indent=4)
    print(f"Successfully saved {FINAL_METADATA_PATH} with {len(final_metadata)} entries.")

    torch.save(final_beto_idx, FINAL_INDEX_PATH)
    print(f"Successfully saved {FINAL_INDEX_PATH} with {len(final_beto_idx)} entries.")

    assert len(original_idx) == len(final_beto_idx), "CRITICAL: Final index length does not match original!"

if __name__ == '__main__':
    generate_assets()