66in the same way as we do in our public demo at https://digital-collections-explorer.com/
77"""
88
9+ import base64
10+ import json
11+
912import pandas as pd
1013import torch
11- import json
12- import base64
1314
14- ORIGINAL_INDEX_PATH = 'input/beto_idx.pt'
15- CSV_PATH = 'input/merged_files.csv'
16- FINAL_METADATA_PATH = 'output/metadata.json'
17- FINAL_INDEX_PATH = 'output/item_ids.pt'
15+ ORIGINAL_INDEX_PATH = "input/beto_idx.pt"
16+ CSV_PATH = "input/merged_files.csv"
17+ FINAL_METADATA_PATH = "output/metadata.json"
18+ FINAL_INDEX_PATH = "output/item_ids.pt"
19+
1820
1921def generate_assets ():
2022 # --- 1. Load the original beto_idx.pt file ---
@@ -24,9 +26,11 @@ def generate_assets():
2426
2527 # --- 2. Build a lookup table from merged_files.csv ---
2628 df = pd .read_csv (CSV_PATH )
27- df .dropna (subset = ['p1_item_id' , 'file_url' ], inplace = True )
28- df ['iiif_id' ] = df ['file_url' ].apply (lambda url : url .split ('/' )[5 ] if isinstance (url , str ) else None )
29- df .dropna (subset = ['iiif_id' ], inplace = True )
29+ df .dropna (subset = ["p1_item_id" , "file_url" ], inplace = True )
30+ df ["iiif_id" ] = df ["file_url" ].apply (
31+ lambda url : url .split ("/" )[5 ] if isinstance (url , str ) else None
32+ )
33+ df .dropna (subset = ["iiif_id" ], inplace = True )
3034 iiif_to_p1_lookup = pd .Series (df .p1_item_id .values , index = df .iiif_id ).to_dict ()
3135
3236 # --- 3. Generate new index and metadata ---
@@ -36,15 +40,19 @@ def generate_assets():
3640 for image_url in original_idx :
3741 # a. Extract iiif_id
3842 try :
39- iiif_id = image_url .split ('/' )[5 ]
43+ iiif_id = image_url .split ("/" )[5 ]
4044 except IndexError :
41- b64_key = base64 .urlsafe_b64encode (f"ERROR_PARSING_{ len (final_beto_idx )} " .encode ('utf-8' )).decode ('utf-8' )
45+ b64_key = base64 .urlsafe_b64encode (
46+ f"ERROR_PARSING_{ len (final_beto_idx )} " .encode ("utf-8" )
47+ ).decode ("utf-8" )
4248 final_beto_idx .append (b64_key )
43- final_metadata [b64_key ] = {'error' : f'Could not parse iiif_id from URL: { image_url } ' }
49+ final_metadata [b64_key ] = {
50+ "error" : f"Could not parse iiif_id from URL: { image_url } "
51+ }
4452 continue
4553
4654 # b. Generate Base64 key
47- b64_key = base64 .urlsafe_b64encode (iiif_id .encode (' utf-8' )).decode (' utf-8' )
55+ b64_key = base64 .urlsafe_b64encode (iiif_id .encode (" utf-8" )).decode (" utf-8" )
4856
4957 # c. Append key to the new index
5058 final_beto_idx .append (b64_key )
@@ -55,26 +63,31 @@ def generate_assets():
5563 # e. Assemble the new metadata object
5664 url_base = f"https://tile.loc.gov/image-services/iiif/{ iiif_id } "
5765 paths = {
58- ' original' : f"{ url_base } /full/pct:100/0/default.jpg" ,
59- ' processed' : f"{ url_base } /full/2000,/0/default.jpg" ,
60- ' thumbnail' : f"{ url_base } /full/400,/0/default.jpg"
66+ " original" : f"{ url_base } /full/pct:100/0/default.jpg" ,
67+ " processed" : f"{ url_base } /full/2000,/0/default.jpg" ,
68+ " thumbnail" : f"{ url_base } /full/400,/0/default.jpg" ,
6169 }
6270 final_metadata [b64_key ] = {
63- ' type' : ' image' ,
64- ' iiif_id' : iiif_id ,
65- ' url' : p1_item_id ,
66- ' paths' : paths
71+ " type" : " image" ,
72+ " iiif_id" : iiif_id ,
73+ " url" : p1_item_id ,
74+ " paths" : paths ,
6775 }
6876
6977 # --- 4. Final Save and Validation ---
70- with open (FINAL_METADATA_PATH , 'w' ) as f :
78+ with open (FINAL_METADATA_PATH , "w" ) as f :
7179 json .dump (final_metadata , f , indent = 4 )
72- print (f"Successfully saved { FINAL_METADATA_PATH } with { len (final_metadata )} entries." )
80+ print (
81+ f"Successfully saved { FINAL_METADATA_PATH } with { len (final_metadata )} entries."
82+ )
7383
7484 torch .save (final_beto_idx , FINAL_INDEX_PATH )
7585 print (f"Successfully saved { FINAL_INDEX_PATH } with { len (final_beto_idx )} entries." )
7686
77- assert len (original_idx ) == len (final_beto_idx ), "CRITICAL: Final index length does not match original!"
87+ assert len (original_idx ) == len (
88+ final_beto_idx
89+ ), "CRITICAL: Final index length does not match original!"
90+
7891
79- if __name__ == ' __main__' :
92+ if __name__ == " __main__" :
8093 generate_assets ()
0 commit comments