nga_project/training.py at main · huyndao/nga_project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import concurrent
import gc
import io
import json
import logging
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import cv2
import httpx
import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import scipy as sp
import scipy.sparse
import sklearn
import tqdm
import tqdm.auto
import usearch, usearch.index
from PIL import Image
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import pairwise_distances
from sklearn.pipeline import make_pipeline

from bovw import BoVW

pl.Config.set_engine_affinity('streaming')
pl.Config.set_fmt_str_lengths(1000)

gc.enable()

# Make sure a local copy of the published-images CSV exists, then always read
# from that local copy.
csv_filename = 'published_images.csv'
csv_path = Path(csv_filename)

if not csv_path.is_file():
    # Sink to a temporary name and rename afterwards, so an interrupted
    # download cannot leave a truncated CSV that later runs would trust.
    csv_tmp_path = csv_path.with_name(csv_path.name + '.part')

    pl.scan_csv(
        source = 'https://github.com/NationalGalleryOfArt/opendata/raw/refs/heads/main/data/published_images.csv',
        has_header=True,
        infer_schema_length=100,
        infer_schema=True
    ).with_columns(
        # derive the 400x400 thumbnail URL from the 200x200 one
        pl.col('iiifthumburl').str.replace("200,200", "400,400").alias("thumb_url")
    ).sink_csv(csv_tmp_path, sync_on_close='all')

    csv_tmp_path.replace(csv_path)

# Scanning the local file (instead of keeping the remote lazy frame) keeps
# every later .collect() from downloading the CSV again.
published_images = pl.scan_csv(csv_path)

# function to download image given a url, filename, and a target download directory
def get_file(file_url: str, name: str, target_dir: str) -> None:
    """Download ``file_url`` to ``<target_dir>/<name>.jpg`` unless a usable copy exists.

    An existing regular file is kept when PIL can open and ``verify()`` it;
    otherwise the existing entry is removed and the image is fetched again.
    Download errors are logged and swallowed (best-effort), so one bad URL
    does not abort a bulk download.

    Args:
        file_url: URL of the image to fetch.
        name: File name without extension; ``.jpg`` is appended.
        target_dir: Directory the image is written into.
    """
    import logging  # local import keeps this function self-contained

    log = logging.getLogger(__name__)
    path = Path(target_dir).resolve() / (name + '.jpg')

    def is_valid_image(candidate: Path) -> bool:
        # PIL raises several unrelated exception types for broken files, so
        # catch Exception, but not KeyboardInterrupt/SystemExit as a bare
        # `except:` would.
        try:
            with Image.open(candidate) as img_pil:
                img_pil.verify()
        except Exception:
            return False
        return True

    def dl_file(file_url: str, path: Path) -> None:
        # Stream into a sibling ".part" file and rename only on success, so a
        # failed or interrupted transfer never leaves a truncated .jpg behind.
        tmp_path = path.with_name(path.name + '.part')
        try:
            with httpx.stream(method="GET", url=file_url, follow_redirects=True) as response:
                response.raise_for_status()
                with open(tmp_path, "wb") as fd:
                    for data in response.iter_bytes():
                        fd.write(data)
            tmp_path.replace(path)
        except Exception as exc:
            log.warning("download failed for %s: %s", file_url, exc)
            try:
                tmp_path.unlink(missing_ok=True)
            except OSError:
                # cleanup is best-effort as well; the failure is already logged
                pass

    if path.exists():
        if path.is_file() and is_valid_image(path):
            return
        # corrupt file or a non-file entry: remove it and fetch again
        path.unlink()

    dl_file(file_url=file_url, path=path)

# function to perform parallel downloads
def get_files(file_urls: list[str], names: list[str], target_dir: str, threads: int = 24) -> None:
    """Download many images concurrently with a thread pool.

    ``names`` and ``file_urls`` are paired positionally; surplus entries of
    the longer list are ignored. Each pair is handed to ``get_file``. Tasks
    that raise are logged rather than silently discarded, and the function
    still returns normally.

    Args:
        file_urls: Image URLs.
        names: File names (without extension), one per URL.
        target_dir: Directory the images are written into.
        threads: Maximum number of worker threads.
    """
    import logging  # local import keeps this function self-contained

    log = logging.getLogger(__name__)

    # zip() stops at the shorter list, so that is the real number of tasks
    total = min(len(names), len(file_urls))
    failed = 0

    with ThreadPoolExecutor(max_workers=threads) as exc:
        futs = {}
        for name, url in tqdm.auto.tqdm(zip(names, file_urls), desc='Submitting images', unit='image', total=total):
            futs[exc.submit(get_file, url, name, target_dir)] = name

        for f in tqdm.auto.tqdm(concurrent.futures.as_completed(futs), desc='Downloading images', unit='image', total=len(futs)):
            # An exception raised inside a worker is stored on its future and
            # is lost unless it is inspected here.
            err = f.exception()
            if err is not None:
                failed += 1
                log.warning("download task for %s raised: %r", futs[f], err)

    if failed:
        log.warning("%d of %d download tasks raised an exception", failed, len(futs))

# get names and urls of images in ONE collect so both lists come from the same
# materialised frame and stay row-aligned; rows missing either value cannot be
# downloaded and are dropped
name_url_df = (
    published_images
    .select(pl.col('uuid'), pl.col('thumb_url'))
    .drop_nulls()
    .collect()
)
names = name_url_df.get_column('uuid').to_list()
urls = name_url_df.get_column('thumb_url').to_list()

# specify download dir
download_dir = Path('nga_archive')
download_dir.mkdir(exist_ok=True)

# actually download images
get_files(file_urls=urls, names=names, target_dir=download_dir.name)

# make a list of downloaded images: regular .jpg files only (the downloader
# writes nothing else), sorted so that the position of an image in this list
# is reproducible between runs
imgs = sorted(
    p for p in download_dir.expanduser().resolve().rglob('*.jpg') if p.is_file()
)

# Build the BoVW model. When no pickled model file is present yet, the model is
# additionally fitted on the downloaded images.
# NOTE(review): presumably BoVW loads/saves `model_path` itself — confirm in bovw.py.
bovw_model_path = Path('bovw_nga.pkl')

# decide before constructing, exactly as the existence check ran before the
# constructor call previously
bovw_needs_fit = not bovw_model_path.exists()

bovw = BoVW(model_path=bovw_model_path, detector='sift', vocab_size=1024, batch_size=64)
if bovw_needs_fit:
    bovw.fit(X=imgs)

# pipeline: BoVW features followed by TF-IDF re-weighting
bovw_pipe = make_pipeline(bovw, TfidfTransformer())

# helper to convert a Path filename, giving just the name, excluding the file extension
def get_name(x):
    """Return the last component of path ``x`` with its final suffix removed."""
    without_suffix = x.with_suffix('')
    return without_suffix.name

# If the ANN index file does not exist yet, embed every image and build it;
# otherwise load the saved index.
if not Path('bovw_index.usearch').exists():
    bovw_img_index = usearch.index.Index(ndim=bovw.vocab_size)

    batch_size = 1024

    # the index key of an image is its position in `imgs`
    keys = np.arange(len(imgs))
    uuid_key_df = pl.DataFrame(
        {'uuid': [get_name(p) for p in imgs], 'key': keys},
        schema={'uuid': pl.String, 'key': pl.Int64},
    )

    # Calling fit_transform on the whole pipeline per batch re-fitted the
    # TF-IDF step on every batch, so each batch was weighted with its own IDF
    # and the vectors in the index were not comparable. The two steps are
    # therefore run separately and TF-IDF is fitted once on all histograms.
    bovw_step = bovw_pipe[:-1]
    tfidf_step = bovw_pipe[-1]

    # Pass 1: BoVW histograms, batch by batch.
    # NOTE(review): the BoVW step is still invoked through fit_transform per
    # batch, exactly as the full pipeline invoked it — confirm that BoVW.fit
    # does not retrain the vocabulary on each batch.
    histogram_blocks = []
    for start in tqdm.auto.trange(0, len(imgs), batch_size, desc='processing batches', unit='batch'):
        batch = imgs[start:start + batch_size]
        histograms = sp.sparse.csr_matrix(bovw_step.fit_transform(batch))

        if histograms.shape[0] != len(batch):
            # rows are matched to keys by position; a mismatch would silently
            # attach vectors to the wrong images
            raise ValueError(
                f'BoVW returned {histograms.shape[0]} rows for a batch of {len(batch)} images'
            )

        histogram_blocks.append(histograms)
        gc.collect()

    # Pass 2: one global TF-IDF fit, then add the vectors batch by batch so
    # only one dense batch is materialised at a time.
    if histogram_blocks:
        bovw_img_embeddings_tfidf = tfidf_step.fit_transform(
            sp.sparse.vstack(histogram_blocks, format='csr')
        )
        del histogram_blocks

        for start in range(0, len(imgs), batch_size):
            end = min(start + batch_size, len(imgs))
            bovw_img_index.add(
                keys=keys[start:end],
                vectors=bovw_img_embeddings_tfidf[start:end].toarray(),
                log=False,
            )
            gc.collect()

    bovw_img_index.save('bovw_index.usearch')
    published_images.join(uuid_key_df.lazy(), on='uuid', how='inner', coalesce=None).sink_csv('published_images_with_keys.csv')

else:
    # NOTE(review): on this path the TF-IDF step of bovw_pipe is never fitted
    # in this script — confirm that query-time code does not rely on it.
    bovw_img_index = usearch.index.Index.restore('bovw_index.usearch', view=True)