Commit cf6cade
chore: Fix linter errors
1 parent ed84b1c commit cf6cade

17 files changed: +454 -270 lines changed

.flake8

Lines changed: 17 additions & 0 deletions

@@ -0,0 +1,17 @@
+[flake8]
+max-line-length = 88
+extend-ignore = E203, E501
+exclude =
+    .git,
+    __pycache__,
+    venv,
+    .venv,
+    env,
+    .env,
+    build,
+    dist,
+    node_modules,
+    .pytest_cache,
+    .mypy_cache
+max-complexity = 10
+
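
Note on this config: max-line-length 88 is Black's default, and E203 (whitespace before ':') and E501 (line too long) are the two pycodestyle checks that most often conflict with Black's output, so ignoring them keeps formatter and linter from fighting. A minimal sketch of a Black-formatted line that would otherwise trip E203 (names are illustrative, not from this repo):

    # Black puts spaces around ':' in slices with complex bounds;
    # plain flake8 flags that as E203 unless it is ignored, as here.
    def middle(items, offset):
        return items[offset + 1 : len(items) - offset]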

scripts/create_loc_assets.py

Lines changed: 37 additions & 24 deletions

@@ -6,15 +6,17 @@
 in the same way as we do in our public demo at https://digital-collections-explorer.com/
 """
 
+import base64
+import json
+
 import pandas as pd
 import torch
-import json
-import base64
 
-ORIGINAL_INDEX_PATH = 'input/beto_idx.pt'
-CSV_PATH = 'input/merged_files.csv'
-FINAL_METADATA_PATH = 'output/metadata.json'
-FINAL_INDEX_PATH = 'output/item_ids.pt'
+ORIGINAL_INDEX_PATH = "input/beto_idx.pt"
+CSV_PATH = "input/merged_files.csv"
+FINAL_METADATA_PATH = "output/metadata.json"
+FINAL_INDEX_PATH = "output/item_ids.pt"
+
 
 def generate_assets():
     # --- 1. Load the original beto_idx.pt file ---
@@ -24,9 +26,11 @@ def generate_assets():
 
     # --- 2. Build a lookup table from merged_files.csv ---
     df = pd.read_csv(CSV_PATH)
-    df.dropna(subset=['p1_item_id', 'file_url'], inplace=True)
-    df['iiif_id'] = df['file_url'].apply(lambda url: url.split('/')[5] if isinstance(url, str) else None)
-    df.dropna(subset=['iiif_id'], inplace=True)
+    df.dropna(subset=["p1_item_id", "file_url"], inplace=True)
+    df["iiif_id"] = df["file_url"].apply(
+        lambda url: url.split("/")[5] if isinstance(url, str) else None
+    )
+    df.dropna(subset=["iiif_id"], inplace=True)
     iiif_to_p1_lookup = pd.Series(df.p1_item_id.values, index=df.iiif_id).to_dict()
 
     # --- 3. Generate new index and metadata ---
@@ -36,15 +40,19 @@ def generate_assets():
     for image_url in original_idx:
         # a. Extract iiif_id
         try:
-            iiif_id = image_url.split('/')[5]
+            iiif_id = image_url.split("/")[5]
         except IndexError:
-            b64_key = base64.urlsafe_b64encode(f"ERROR_PARSING_{len(final_beto_idx)}".encode('utf-8')).decode('utf-8')
+            b64_key = base64.urlsafe_b64encode(
+                f"ERROR_PARSING_{len(final_beto_idx)}".encode("utf-8")
+            ).decode("utf-8")
             final_beto_idx.append(b64_key)
-            final_metadata[b64_key] = {'error': f'Could not parse iiif_id from URL: {image_url}'}
+            final_metadata[b64_key] = {
+                "error": f"Could not parse iiif_id from URL: {image_url}"
+            }
             continue
 
         # b. Generate Base64 key
-        b64_key = base64.urlsafe_b64encode(iiif_id.encode('utf-8')).decode('utf-8')
+        b64_key = base64.urlsafe_b64encode(iiif_id.encode("utf-8")).decode("utf-8")
 
         # c. Append key to the new index
         final_beto_idx.append(b64_key)
@@ -55,26 +63,31 @@ def generate_assets():
         # e. Assemble the new metadata object
         url_base = f"https://tile.loc.gov/image-services/iiif/{iiif_id}"
         paths = {
-            'original': f"{url_base}/full/pct:100/0/default.jpg",
-            'processed': f"{url_base}/full/2000,/0/default.jpg",
-            'thumbnail': f"{url_base}/full/400,/0/default.jpg"
+            "original": f"{url_base}/full/pct:100/0/default.jpg",
+            "processed": f"{url_base}/full/2000,/0/default.jpg",
+            "thumbnail": f"{url_base}/full/400,/0/default.jpg",
         }
         final_metadata[b64_key] = {
-            'type': 'image',
-            'iiif_id': iiif_id,
-            'url': p1_item_id,
-            'paths': paths
+            "type": "image",
+            "iiif_id": iiif_id,
+            "url": p1_item_id,
+            "paths": paths,
        }
 
     # --- 4. Final Save and Validation ---
-    with open(FINAL_METADATA_PATH, 'w') as f:
+    with open(FINAL_METADATA_PATH, "w") as f:
         json.dump(final_metadata, f, indent=4)
-    print(f"Successfully saved {FINAL_METADATA_PATH} with {len(final_metadata)} entries.")
+    print(
+        f"Successfully saved {FINAL_METADATA_PATH} with {len(final_metadata)} entries."
+    )
 
     torch.save(final_beto_idx, FINAL_INDEX_PATH)
     print(f"Successfully saved {FINAL_INDEX_PATH} with {len(final_beto_idx)} entries.")
 
-    assert len(original_idx) == len(final_beto_idx), "CRITICAL: Final index length does not match original!"
+    assert len(original_idx) == len(
+        final_beto_idx
+    ), "CRITICAL: Final index length does not match original!"
+
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     generate_assets()
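
Note on the key scheme above: urlsafe_b64encode turns the iiif_id into a key that is safe to use in URLs and filenames, and it is reversible, so the original id can always be recovered from a key. A quick round-trip sketch (the iiif_id value is a made-up placeholder, not a real Library of Congress identifier):

    import base64

    iiif_id = "public:example:12345"  # hypothetical id, for illustration only

    # Same encoding as generate_assets(): bytes -> URL-safe Base64 -> str.
    key = base64.urlsafe_b64encode(iiif_id.encode("utf-8")).decode("utf-8")

    # Decoding recovers the exact iiif_id, so the metadata dict and the
    # saved index stay consistent with each other.
    assert base64.urlsafe_b64decode(key).decode("utf-8") == iiif_id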

src/backend/api/routes/embeddings.py

Lines changed: 3 additions & 1 deletion

@@ -1,11 +1,13 @@
-from fastapi import APIRouter
 import logging
 
+from fastapi import APIRouter
+
 from ...services.embedding_service import embedding_service
 
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/api/embeddings", tags=["embeddings"])
 
+
 @router.get("/count")
 async def get_total_embeddings():
     """Return the total number of embeddings."""

src/backend/api/routes/images.py

Lines changed: 44 additions & 22 deletions

@@ -1,67 +1,89 @@
+from pathlib import Path
+
 from fastapi import APIRouter, HTTPException, Query
 from fastapi.responses import FileResponse
-from pathlib import Path
+
 from src.backend.services.embedding_service import embedding_service
 
 router = APIRouter(tags=["images"])
 
+
 @router.get("/images/{id}")
 async def get_image_by_id(
-    id: str,
-    size: str = Query("full", description="Image size: 'thumbnail' or 'full'")
+    id: str, size: str = Query("full", description="Image size: 'thumbnail' or 'full'")
 ):
     """
     Serve an image based on its ID
-
+
     Args:
         id: The document ID
         size: Size of the image to return
     """
     doc = embedding_service.get_document_by_id(id)
-
+
     if not doc:
         raise HTTPException(status_code=404, detail=f"Document with ID {id} not found")
-
-    if size == "thumbnail" and "metadata" in doc and "paths" in doc["metadata"] and "thumbnail" in doc["metadata"]["paths"]:
+
+    if (
+        size == "thumbnail"
+        and "metadata" in doc
+        and "paths" in doc["metadata"]
+        and "thumbnail" in doc["metadata"]["paths"]
+    ):
         path_str = doc["metadata"]["paths"]["thumbnail"]
-    elif "metadata" in doc and "paths" in doc["metadata"] and "processed" in doc["metadata"]["paths"]:
+    elif (
+        "metadata" in doc
+        and "paths" in doc["metadata"]
+        and "processed" in doc["metadata"]["paths"]
+    ):
         path_str = doc["metadata"]["paths"]["processed"]
     else:
-        raise HTTPException(status_code=404, detail="Image path not found in document metadata")
-
+        raise HTTPException(
+            status_code=404, detail="Image path not found in document metadata"
+        )
+
     path = Path(path_str)
-
+
     if not path.exists():
         raise HTTPException(status_code=404, detail=f"Image not found at path: {path}")
-
+
     return FileResponse(path)
 
+
 @router.get("/static/{id}")
 async def get_original_document(id: str):
     """
     Serve the original document file
-
+
     Args:
         id: The document ID
     """
     doc = embedding_service.get_document_by_id(id)
 
     if not doc:
         raise HTTPException(status_code=404, detail=f"Document with ID {id} not found")
-
-    if "metadata" in doc and "paths" in doc["metadata"] and "original" in doc["metadata"]["paths"]:
+
+    if (
+        "metadata" in doc
+        and "paths" in doc["metadata"]
+        and "original" in doc["metadata"]["paths"]
+    ):
         path_str = doc["metadata"]["paths"]["original"]
     else:
-        raise HTTPException(status_code=404, detail="Original file path not found in document metadata")
-
+        raise HTTPException(
+            status_code=404, detail="Original file path not found in document metadata"
+        )
+
     path = Path(path_str)
-
+
     if not path.exists():
-        raise HTTPException(status_code=404, detail=f"Original file not found at path: {path}")
-
+        raise HTTPException(
+            status_code=404, detail=f"Original file not found at path: {path}"
+        )
+
     filename = path.name
     return FileResponse(
-        path,
+        path,
         filename=filename,
-        headers={"Content-Disposition": f"attachment; filename={filename}"}
+        headers={"Content-Disposition": f"attachment; filename={filename}"},
    )
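
The multi-line if (...) blocks above only rewrap the original one-line membership chains to satisfy the 88-character limit; behavior is unchanged, and both branches assume documents shaped like {"metadata": {"paths": {"thumbnail": ..., "processed": ..., "original": ...}}}. An equivalent, more compact lookup could chain dict.get instead — a sketch only, not code from this commit:

    from typing import Optional

    def resolve_path(doc: dict, size: str) -> Optional[str]:
        """Mirror the route's fallback: prefer thumbnail when asked, else processed."""
        paths = doc.get("metadata", {}).get("paths", {})
        if size == "thumbnail" and "thumbnail" in paths:
            return paths["thumbnail"]
        return paths.get("processed")  # None when no usable path exists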

src/backend/api/routes/search.py

Lines changed: 28 additions & 23 deletions

@@ -1,44 +1,48 @@
-from fastapi import APIRouter, File, Form, UploadFile, Query
-from PIL import Image
-from io import BytesIO
 import logging
+from io import BytesIO
+
+from fastapi import APIRouter, File, Form, Query, UploadFile
+from PIL import Image
 
-from ...services.embedding_service import embedding_service
-from ...services.clip_service import clip_service
 from ...models.schemas import SearchResponse, SearchResult
+from ...services.clip_service import clip_service
+from ...services.embedding_service import embedding_service
 
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/api/search", tags=["search"])
 
+
 @router.get("/text", response_model=SearchResponse)
 async def search_by_text(
-    query: str,
+    query: str,
     limit: int = Query(30, description="Number of results per page"),
-    page: int = Query(1, description="Page number for pagination")
+    page: int = Query(1, description="Page number for pagination"),
 ):
     """Search for similar content using text query."""
     offset = (page - 1) * limit
-
+
     try:
         if not embedding_service.is_loaded:
             embedding_service.load_embeddings()
-
+
         text_embedding = clip_service.encode_text(query)
         logit_scale = clip_service.model.logit_scale.exp().item()
-        raw_results = embedding_service.search(text_embedding, logit_scale=logit_scale, limit=limit, offset=offset)
-
+        raw_results = embedding_service.search(
+            text_embedding, logit_scale=logit_scale, limit=limit, offset=offset
+        )
+
         search_results = [
             SearchResult(
-                id=result["id"],
-                score=result["score"],
-                metadata=result["metadata"]
-            ) for result in raw_results
+                id=result["id"], score=result["score"], metadata=result["metadata"]
+            )
+            for result in raw_results
         ]
         return SearchResponse(results=search_results)
     except Exception as e:
         logger.error(f"Error in text search: {str(e)}")
         return SearchResponse(results=[])
 
+
 @router.post("/image", response_model=SearchResponse)
 async def search_by_image(
     image: UploadFile = File(...),
@@ -47,19 +51,20 @@ async def search_by_image(
 ):
     """Search for similar content using an uploaded image."""
     offset = (page - 1) * limit
-
+
     try:
         image_data = await image.read()
-        image = Image.open(BytesIO(image_data)).convert('RGB')
+        image = Image.open(BytesIO(image_data)).convert("RGB")
         image_embedding = clip_service.encode_image(image)
-        raw_results = embedding_service.search(image_embedding, limit=limit, offset=offset)
-
+        raw_results = embedding_service.search(
+            image_embedding, limit=limit, offset=offset
+        )
+
         search_results = [
             SearchResult(
-                id=result["id"],
-                score=result["score"],
-                metadata=result["metadata"]
-            ) for result in raw_results
+                id=result["id"], score=result["score"], metadata=result["metadata"]
+            )
+            for result in raw_results
        ]
         return SearchResponse(results=search_results)
     except Exception as e:
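
Both endpoints share the same 1-indexed pagination arithmetic, offset = (page - 1) * limit, which the rewrapped embedding_service.search(...) calls then pass through unchanged. A tiny worked example (numbers are illustrative):

    limit = 30  # results per page, the Query default above

    # page 1 skips 0 results, page 2 skips 30, page 3 skips 60.
    for page in (1, 2, 3):
        offset = (page - 1) * limit
        print(page, offset)  # -> 1 0, 2 30, 3 60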
