diff --git a/clipeval/eval_all.py b/clipeval/eval_all.py
index ef8de5c..0407b28 100644
--- a/clipeval/eval_all.py
+++ b/clipeval/eval_all.py
@@ -16,6 +16,12 @@
     ("slip", "clipeval.slip.eval_slip"),
     ("xm3600", "clipeval.xm3600.eval_xm3600"),
     ("cvqa", "clipeval.cvqa.eval_cvqa"),
+    ("zero_shot_classification_dollar_street", "clipeval.zero_shot_classification.eval_dollar_street"),
+    ("zero_shot_classification_GeoDE", "clipeval.zero_shot_classification.eval_GeoDE"),
+    ("zero_shot_classification_GLDv2", "clipeval.zero_shot_classification.eval_GLDv2"),
+    ("few_shot_geo_localization_dollar_street", "clipeval.few_shot_geo_localization.eval_dollar_street"),
+    ("few_shot_geo_localization_GeoDE", "clipeval.few_shot_geo_localization.eval_GeoDE"),
+    ("few_shot_geo_localization_xm3600", "clipeval.few_shot_geo_localization.eval_xm3600"),
 ]
diff --git a/clipeval/few_shot_geo_localization/eval_GeoDE.py b/clipeval/few_shot_geo_localization/eval_GeoDE.py
new file mode 100644
index 0000000..462721c
--- /dev/null
+++ b/clipeval/few_shot_geo_localization/eval_GeoDE.py
@@ -0,0 +1,105 @@
+import torch
+import json
+from PIL import Image
+from tqdm import tqdm
+import pandas as pd
+import numpy as np
+
+import sys
+if "external/big_vision" not in sys.path:
+    sys.path.append("external/big_vision")
+# or directly copy the functions from https://github.com/google-research/big_vision/blob/main/big_vision/evaluators/fewshot_lsr.py
+
+from big_vision.evaluators.fewshot_lsr import _precompute_cache, _eig_fewshot_acc_fn
+
+data_dir = 'data/geode/'
+GROUP_KEY = 'ip_country'
+
+# Evaluation Function
+def evaluate(model, preprocess_val):
+    geo_df = pd.read_csv(data_dir + 'index.csv')
+    geo_df = geo_df.sample(frac=1).reset_index(drop=True)  # shuffle
+    train_df = geo_df.iloc[:20000]
+    test_df = geo_df.iloc[20000:]
+    print("done loading data", len(geo_df), len(train_df), len(test_df))
+
+    batch_size = 16
+    device = torch.cuda.current_device()
+
+    ## train classification probes
+    classification_probes = []
+    country_ids_list = []  # one list per n_shot; in theory they are identical, but some GeoDE countries are rare enough to be missing at low shot counts
+    for n_shot in [5, 10, 25]:
+        train_sampled = train_df.groupby(GROUP_KEY, group_keys=False).apply(lambda x: x.sample(n=min(len(x), n_shot), random_state=42))
+        country_ids = sorted(list(set(train_sampled[GROUP_KEY])))
+
+        df = train_sampled
+        with torch.no_grad():
+            all_features = []
+            all_labels = []
+            for start in tqdm(range(0, len(df), batch_size)):
+                end = min(start + batch_size, len(df))
+                batch_imgs = []
+                for i in range(start, end):
+                    data = df.iloc[i]
+                    try:
+                        batch_imgs.append(Image.open(data_dir + 'images/' + data['file_path']).convert("RGB"))
+                        all_labels.append(country_ids.index(data[GROUP_KEY]))
+                    except Exception:
+                        print(f"missing image {data['file_path']}")
+
+                images = torch.stack([preprocess_val(img).to(device) for img in batch_imgs])
+                image_embs = model.encode_image(images)
+                image_embs /= image_embs.norm(dim=-1, keepdim=True)
+
+                all_features.append(image_embs)
+
+            all_features = torch.cat(all_features, dim=0)
+            print(all_features.shape)
+
+        classification_probes.append(_precompute_cache(all_features.cpu().numpy(), all_labels, len(set(all_labels))))
+        country_ids_list.append(country_ids)
+
+    ## start eval
+    n = 0
+    correct = [0] * len(classification_probes)
+
+    with torch.no_grad():
+        for local_start in tqdm(range(0, len(test_df), batch_size)):
+            local_end = min(local_start + batch_size, len(test_df))
+            batch_imgs = []
+            country_labels = []
+
+            for i in range(local_start, local_end):
+                data = test_df.iloc[i]
+                try:
+                    batch_imgs.append(Image.open(data_dir + 'images/' + data['file_path']).convert("RGB"))
+                    country_labels.append(data[GROUP_KEY])
+                except Exception:
+                    print(f"missing image {data['file_path']}")
+
+            images = torch.stack([preprocess_val(img).to(device) for img in batch_imgs])
+            image_features = model.encode_image(images)
+            image_features /= image_features.norm(dim=-1, keepdim=True)
+
+            for ind, cache in enumerate(classification_probes):
+                labels = [country_ids_list[ind].index(c) if c in country_ids_list[ind] else -1 for c in country_labels]
+                if labels.count(-1) > 0:
+                    print(f"WARNING: {labels.count(-1)} of {len(labels)} samples have a country that is not in the training set.")
+                correct[ind] += _eig_fewshot_acc_fn(cache, image_features.cpu().numpy(), labels, 2.0 ** 10).item()
+
+            n += len(labels)
+
+    print(f"few_shot [5, 10, 25] geo-localization on GeoDE, {correct}, {n}, {np.array(correct)/n}")
+    return correct, n
+
+def parse_results(results, result_json):
+    with open(result_json) as f:
+        result = json.load(f)
+    print("few-shot geo-localization GeoDE:", result['acc'])
+    results['few_shot_geo_loc_GeoDE'] = result['acc']
+
+def main(model, preprocess_val, tokenizer, result_json):
+    correct, n = evaluate(model, preprocess_val)
+    with open(result_json, "w") as f:
+        json.dump({"correct": correct, "total": n, "acc": (np.array(correct)/n).tolist()}, f)
diff --git a/clipeval/few_shot_geo_localization/eval_dollar_street.py b/clipeval/few_shot_geo_localization/eval_dollar_street.py
new file mode 100644
index 0000000..e5f39c7
--- /dev/null
+++ b/clipeval/few_shot_geo_localization/eval_dollar_street.py
@@ -0,0 +1,96 @@
+import torch
+import json
+from PIL import Image
+from tqdm import tqdm
+import pandas as pd
+import numpy as np
+
+import sys
+if "external/big_vision" not in sys.path:
+    sys.path.append("external/big_vision")
+# or directly copy the functions from https://github.com/google-research/big_vision/blob/main/big_vision/evaluators/fewshot_lsr.py
+
+from big_vision.evaluators.fewshot_lsr import _precompute_cache, _eig_fewshot_acc_fn
+
+
+data_dir = 'data/DollarStreet/dataset_dollarstreet/'
+
+# Evaluation Function
+def evaluate(model, preprocess_val):
+    train_df = pd.read_csv(data_dir + 'images_v2_imagenet_train.csv')
+    test_df = pd.read_csv(data_dir + 'images_v2_imagenet_test.csv')
+    print("done loading data", len(train_df), len(test_df))
+
+    batch_size = 16
+    device = torch.cuda.current_device()
+
+    ## train classification probes
+    classification_probes = []
+    country_ids_list = []  # one list per n_shot; they should be identical, but keep one per probe just in case
+    for n_shot in [5, 10, 25]:
+        train_sampled = train_df.groupby('country.id', group_keys=False).apply(lambda x: x.sample(n=min(len(x), n_shot), random_state=42))
+        country_ids = sorted(list(set(train_sampled['country.id'])))
+
+        df = train_sampled
+        with torch.no_grad():
+            all_features = []
+            all_labels = []
+            for start in tqdm(range(0, len(df), batch_size)):
+                end = min(start + batch_size, len(df))
+                batch_imgs = []
+                for i in range(start, end):
+                    data = df.iloc[i]
+                    batch_imgs.append(Image.open(data_dir + data['imageRelPath']).convert("RGB"))
+                    all_labels.append(country_ids.index(data['country.id']))
+
+
+                images = torch.stack([preprocess_val(img).to(device) for img in batch_imgs])
+                image_embs = model.encode_image(images)
+                image_embs /= image_embs.norm(dim=-1, keepdim=True)
+
+                all_features.append(image_embs)
+
+            all_features = torch.cat(all_features, dim=0)
+            print(all_features.shape)
+
+        classification_probes.append(_precompute_cache(all_features.cpu().numpy(), all_labels, len(set(all_labels))))
+        country_ids_list.append(country_ids)
+
+    ## start eval
+    n = 0
+    correct = [0] * len(classification_probes)
+
+    with torch.no_grad():
+        for local_start in tqdm(range(0, len(test_df), batch_size)):
+            local_end = min(local_start + batch_size, len(test_df))
+            batch_imgs = []
+            country_labels = []
+
+            for i in range(local_start, local_end):
+                data = test_df.iloc[i]
+                batch_imgs.append(Image.open(data_dir + data['imageRelPath']).convert("RGB"))
+                country_labels.append(data['country.id'])
+
+            images = torch.stack([preprocess_val(img).to(device) for img in batch_imgs])
+            image_features = model.encode_image(images)
+            image_features /= image_features.norm(dim=-1, keepdim=True)
+
+            for ind, cache in enumerate(classification_probes):
+                labels = [country_ids_list[ind].index(c) for c in country_labels]
+                correct[ind] += _eig_fewshot_acc_fn(cache, image_features.cpu().numpy(), labels, 2.0 ** 10).item()
+
+            n += len(labels)
+
+    print(f"few_shot [5, 10, 25] geo-localization on DollarStreet, {correct}, {n}, {np.array(correct)/n}")
+    return correct, n
+
+def parse_results(results, result_json):
+    with open(result_json) as f:
+        result = json.load(f)
+    print("few-shot geo-localization dollar street:", result['acc'])
+    results['few_shot_geo_loc_dollar_street'] = result['acc']
+
+def main(model, preprocess_val, tokenizer, result_json):
+    correct, n = evaluate(model, preprocess_val)
+    with open(result_json, "w") as f:
+        json.dump({"correct": correct, "total": n, "acc": (np.array(correct)/n).tolist()}, f)
diff --git a/clipeval/few_shot_geo_localization/eval_xm3600.py b/clipeval/few_shot_geo_localization/eval_xm3600.py
new file mode 100644
index 0000000..5ea0dfa
--- /dev/null
+++ b/clipeval/few_shot_geo_localization/eval_xm3600.py
@@ -0,0 +1,107 @@
+import torch
+import json
+from PIL import Image
+from tqdm import tqdm
+import pandas as pd
+import numpy as np
+
+import sys
+if "external/big_vision" not in sys.path:
+    sys.path.append("external/big_vision")
+# or directly copy the functions from https://github.com/google-research/big_vision/blob/main/big_vision/evaluators/fewshot_lsr.py
+
+from big_vision.evaluators.fewshot_lsr import _precompute_cache, _eig_fewshot_acc_fn
+
+data_dir = 'data/XM3600/'
+GROUP_KEY = 'image/locale'
+
+# Evaluation Function
+def evaluate(model, preprocess_val):
+    with open(data_dir + 'captions.jsonl', 'r') as f:
+        data = [{k: v for k, v in json.loads(line).items() if k in ['image/key', 'image/locale']} for line in f]
+    df = pd.DataFrame(data)
+    df = df.sample(frac=1).reset_index(drop=True)  # shuffle
+    train_df = df.iloc[:1800]
+    test_df = df.iloc[1800:]
+    print("done loading data", len(df), len(train_df), len(test_df))
+
+    batch_size = 16
+    device = torch.cuda.current_device()
+
+    ## train classification probes
+    classification_probes = []
+    country_ids_list = []  # one list per n_shot; they should be identical, but keep one per probe just in case
+    for n_shot in [5, 10, 25]:
+        train_sampled = train_df.groupby(GROUP_KEY, group_keys=False).apply(lambda x: x.sample(n=min(len(x), n_shot), random_state=42))
+        country_ids = sorted(list(set(train_sampled[GROUP_KEY])))
+
+        df = train_sampled
+        with torch.no_grad():
+            all_features = []
+            all_labels = []
+            for start in tqdm(range(0, len(df), batch_size)):
+                end = min(start + batch_size, len(df))
+                batch_imgs = []
+                for i in range(start, end):
+                    data = df.iloc[i]
+                    try:
+                        batch_imgs.append(Image.open(data_dir + f"images/{data['image/key']}.jpg").convert("RGB"))
f"images/{data['image/key']}.jpg").convert("RGB")) + all_labels.append(country_ids.index(data[GROUP_KEY])) + except: + print(f"missing image {data['image/key']}") + + images = torch.stack([preprocess_val(img).to(device) for img in batch_imgs]) + image_embs = model.encode_image(images) + image_embs /= image_embs.norm(dim=-1, keepdim=True) + + all_features.append(image_embs) + + all_features = torch.cat(all_features, dim=0) + print(all_features.shape) + + classification_probes.append(_precompute_cache(all_features.cpu().numpy(), all_labels, len(set(all_labels)))) + country_ids_list.append(country_ids) + + ## start eval + n = 0 + correct = [0] * len(classification_probes) + + with torch.no_grad(): + for local_start in tqdm(range(0, len(test_df), batch_size)): + local_end = min(local_start + batch_size, len(test_df)) + batch_imgs = [] + country_labels = [] + + for i in range(local_start, local_end): + data = test_df.iloc[i] + try: + batch_imgs.append(Image.open(data_dir + f"images/{data['image/key']}.jpg").convert("RGB")) + country_labels.append(data[GROUP_KEY]) + except: + print(f"missing image {data['image/key']}") + + images = torch.stack([preprocess_val(img).to(device) for img in batch_imgs]) + image_features = model.encode_image(images) + image_features /= image_features.norm(dim=-1, keepdim=True) + + for ind, cache in enumerate(classification_probes): + labels = [country_ids_list[ind].index(c) if c in country_ids_list[ind] else -1 for c in country_labels] + if labels.count(-1) > 0: + print(f"WARNING: there are {labels.count(-1)} out of {len(labels)} samples country are not in the training set.") + correct[ind] += _eig_fewshot_acc_fn(cache, image_features.cpu().numpy(), labels, 2.0 ** 10).item() + + n += len(labels) + + print(f"few_shot [5, 10, 25] geo-localization on XM3600, {correct}, {n}, {np.array(correct)/n}") + return correct, n + +def parse_results(results, result_json): + with open(result_json) as f: + result = json.load(f) + print("few-shot geo-localization XM3600:", result['acc']) + results['few_shot_geo_loc_xm3600'] = result['acc'] + +def main(model, preprocess_val, tokenizer, result_json): + correct, n = evaluate(model, preprocess_val) + with open(result_json, "w") as f: + json.dump({"correct": correct, "total": n, "acc": (np.array(correct)/n).tolist()}, f) diff --git a/clipeval/zero_shot_classification/eval_GLDv2.py b/clipeval/zero_shot_classification/eval_GLDv2.py new file mode 100644 index 0000000..a0de9ce --- /dev/null +++ b/clipeval/zero_shot_classification/eval_GLDv2.py @@ -0,0 +1,96 @@ +import torch +import json +from PIL import Image +from tqdm import tqdm +from collections import Counter +import pandas as pd + +import sys +if "external/open_clip" not in sys.path: + sys.path.append("external/open_clip") +from src.open_clip.open_clip_train.zero_shot import accuracy # https://github.com/mlfoundations/open_clip/blob/main/src/open_clip_train/zero_shot.py + +data_dir = 'data/GLDv2/' + +# return top landmark or all retrieved landmarks +def get_landmark(row, image_to_land_id_map, land_id_to_name): + tmp = [image_to_land_id_map[id] for id in row['images'].split()] + counter = Counter(tmp) + landmark_id, count = counter.most_common(1)[0] + return land_id_to_name[landmark_id], [land_id_to_name[x] for x in set(tmp)] # most_voting landmark, all landmarks + +def encode_texts(model, tokenizer, texts, device): + texts = tokenizer(texts).to(device) + text_embs = model.encode_text(texts) + text_embs /= text_embs.norm(dim=-1, keepdim=True) + return text_embs + +# Evaluation Function 
+def evaluate(model, preprocess_val, tokenizer):
+    # prepare the GLDv2 data
+    df = pd.read_csv(data_dir + 'retrieval_solution_v2.1.csv')
+    df = df[df['Usage'].isin(['Private', 'Public'])]
+    ids = set(df['id'])
+    print("test images count: ", len(ids))
+    image_ids = set([id for x in df['images'] for id in x.split()])
+    print("retrieved images count: ", len(image_ids))
+
+    image_to_landmark_df = pd.read_csv(data_dir + 'index_image_to_landmark.csv')
+    image_to_landmark_df = image_to_landmark_df[image_to_landmark_df['id'].isin(image_ids)]
+    image_to_land_id_map = {}
+    for _, row in image_to_landmark_df.iterrows():
+        image_to_land_id_map[row['id']] = row['landmark_id']
+
+    landmark_to_category_df = pd.read_csv(data_dir + 'index_label_to_category.csv')
+    landmark_to_category_df = landmark_to_category_df[landmark_to_category_df['landmark_id'].isin(set(image_to_landmark_df['landmark_id']))]
+    land_id_to_name = {}
+    for _, row in landmark_to_category_df.iterrows():
+        category = row['category']
+        name = category[category.rfind(':') + 1:].replace("_", " ").rstrip('"')
+        land_id_to_name[row['landmark_id']] = name
+
+    landmarks = list(land_id_to_name.values())
+    print("number of landmarks: ", len(landmarks))
+
+    batch_size = 16
+    device = torch.cuda.current_device()
+
+    top1 = 0
+    n = 0
+
+    text_features = encode_texts(model, tokenizer, landmarks, device)
+
+    with torch.no_grad():
+        for local_start in tqdm(range(0, len(df), batch_size)):
+            local_end = min(local_start + batch_size, len(df))
+            batch_imgs = []
+            labels = []
+
+            for i in range(local_start, local_end):
+                data = df.iloc[i]
+                batch_imgs.append(Image.open(data_dir + 'test/' + data['id'] + '.jpg').convert("RGB"))
+                landmark_name, retrieved_landmarks = get_landmark(data, image_to_land_id_map, land_id_to_name)
+                labels.append(landmarks.index(landmark_name))
+
+            labels = torch.tensor(labels).to(device)
+            images = torch.stack([preprocess_val(img).to(device) for img in batch_imgs])
+            image_features = model.encode_image(images)
+            image_features /= image_features.norm(dim=-1, keepdim=True)
+
+            probs = image_features @ text_features.T
+            top1 += accuracy(probs, labels)[0]
+            n += images.size(0)
+
+    print(f"results {top1}, {n}, {top1/n}")
+    return top1, n
+
+def parse_results(results, result_json):
+    with open(result_json) as f:
+        result = json.load(f)
+    print("zero-shot classification GLDv2:", result['acc'])
+    results['zero_shot_classification_GLDv2'] = result['acc']
+
+def main(model, preprocess_val, tokenizer, result_json):
+    top1, n = evaluate(model, preprocess_val, tokenizer)
+    with open(result_json, "w") as f:
+        json.dump({"top1": top1, "total": n, "acc": top1/n}, f)
diff --git a/clipeval/zero_shot_classification/eval_GeoDE.py b/clipeval/zero_shot_classification/eval_GeoDE.py
new file mode 100644
index 0000000..be244d7
--- /dev/null
+++ b/clipeval/zero_shot_classification/eval_GeoDE.py
@@ -0,0 +1,72 @@
+import torch
+import json
+from PIL import Image
+from tqdm import tqdm
+import pandas as pd
+
+import sys
+if "external/open_clip" not in sys.path:
+    sys.path.append("external/open_clip")
+from src.open_clip.zero_shot_classifier import build_zero_shot_classifier  # https://github.com/mlfoundations/open_clip/blob/main/src/open_clip/zero_shot_classifier.py
+from src.open_clip.open_clip_train.zero_shot import accuracy  # https://github.com/mlfoundations/open_clip/blob/main/src/open_clip_train/zero_shot.py
+from src.open_clip.zero_shot_metadata import OPENAI_IMAGENET_TEMPLATES  # https://github.com/mlfoundations/open_clip/blob/main/src/open_clip/zero_shot_metadata.py
+
+
+data_dir = 'data/GeoDE/geode/'
+# Evaluation Function
+def evaluate(model, preprocess_val, tokenizer):
+    df = pd.read_csv(data_dir + 'index.csv')
+    classnames = df['object'].unique().tolist()
+    print("done loading data", len(df))
+
+    batch_size = 16
+    device = torch.cuda.current_device()
+
+    top1 = 0
+    n = 0
+
+    classifier = build_zero_shot_classifier(
+        model,
+        tokenizer=tokenizer,
+        classnames=classnames,
+        templates=OPENAI_IMAGENET_TEMPLATES,
+        num_classes_per_batch=10,
+        device=device,
+        use_tqdm=True,
+    )
+
+    with torch.no_grad():
+        for local_start in tqdm(range(0, len(df), batch_size)):
+            local_end = min(local_start + batch_size, len(df))
+            batch_imgs = []
+            labels = []
+
+            for i in range(local_start, local_end):
+                data = df.iloc[i]
+                try:
+                    batch_imgs.append(Image.open(data_dir + 'images/' + data['file_path']).convert("RGB"))
+                    labels.append(classnames.index(data['object']))
+                except Exception:
+                    print(f"missing image {data['file_path']}")
+
+            labels = torch.tensor(labels).to(device)
+            images = torch.stack([preprocess_val(img).to(device) for img in batch_imgs])
+            image_features = model.encode_image(images)
+            image_features /= image_features.norm(dim=-1, keepdim=True)
+            logits = 100. * image_features @ classifier
+            top1 += accuracy(logits, labels)[0]
+            n += images.size(0)
+
+    print(f"results {top1}, {n}, {top1/n}")
+    return top1, n
+
+def parse_results(results, result_json):
+    with open(result_json) as f:
+        result = json.load(f)
+    print("zero-shot classification GeoDE:", result['acc'])
+    results['zero_shot_classification_GeoDE'] = result['acc']
+
+def main(model, preprocess_val, tokenizer, result_json):
+    top1, n = evaluate(model, preprocess_val, tokenizer)
+    with open(result_json, "w") as f:
+        json.dump({"top1": top1, "total": n, "acc": top1/n}, f)
diff --git a/clipeval/zero_shot_classification/eval_dollar_street.py b/clipeval/zero_shot_classification/eval_dollar_street.py
new file mode 100644
index 0000000..97a48bc
--- /dev/null
+++ b/clipeval/zero_shot_classification/eval_dollar_street.py
@@ -0,0 +1,85 @@
+import ast
+import torch
+import json
+from PIL import Image
+from tqdm import tqdm
+import pandas as pd
+
+import sys
+if "external/open_clip" not in sys.path:
+    sys.path.append("external/open_clip")
+from src.open_clip.zero_shot_classifier import build_zero_shot_classifier  # https://github.com/mlfoundations/open_clip/blob/main/src/open_clip/zero_shot_classifier.py
+from src.open_clip.zero_shot_metadata import OPENAI_IMAGENET_TEMPLATES, IMAGENET_CLASSNAMES  # https://github.com/mlfoundations/open_clip/blob/main/src/open_clip/zero_shot_metadata.py
+
+
+data_dir = 'data/DollarStreet/dataset_dollarstreet/'
+
+def match_any_accuracy(output, target, topk=(1,)):
+    pred = output.topk(max(topk), 1, True, True)[1]  # [B, k]
+    pred_exp = pred.unsqueeze(2)  # [B, k, 1]
+    target_exp = target.unsqueeze(1)  # [B, 1, N]
+    # Compare: broadcasted over [B, k, N]
+    correct = pred_exp.eq(target_exp).any(dim=2).t()  # [k, B], True if the prediction at that rank matches any target label
+    return [float(correct[:k].any(dim=0).float().sum().item()) for k in topk]  # count samples with at least one match in the top-k
+
+# Evaluation Function
+def evaluate(model, preprocess_val, tokenizer):
+    ds_train_df = pd.read_csv(data_dir + 'images_v2_imagenet_train.csv')
+    ds_test_df = pd.read_csv(data_dir + 'images_v2_imagenet_test.csv')
+    df = pd.concat([ds_train_df, ds_test_df])
+    print("done loading data", len(df))
+
+    batch_size = 16
+    device = torch.cuda.current_device()
+
+    top1 = 0
+    top5 = 0
+    n = 0
+
+    classifier = build_zero_shot_classifier(
+        model,
+        tokenizer=tokenizer,
+        classnames=IMAGENET_CLASSNAMES,
+        templates=OPENAI_IMAGENET_TEMPLATES,
+        num_classes_per_batch=10,
+        device=device,
+        use_tqdm=True,
+    )
+
+    with torch.no_grad():
+        for local_start in tqdm(range(0, len(df), batch_size)):
+            local_end = min(local_start + batch_size, len(df))
+            batch_imgs = []
+            labels = []
+
+            for i in range(local_start, local_end):
+                data = df.iloc[i]
+                batch_imgs.append(Image.open(data_dir + data['imageRelPath']).convert("RGB"))
+                labels.append(ast.literal_eval(data['imagenet_sysnet_id']))
+
+            max_len = max(len(x) for x in labels)
+            padded_labels = [x + [-1] * (max_len - len(x)) for x in labels]
+
+            labels = torch.tensor(padded_labels).to(device)
+            images = torch.stack([preprocess_val(img).to(device) for img in batch_imgs])
+            image_features = model.encode_image(images)
+            image_features /= image_features.norm(dim=-1, keepdim=True)
+            logits = 100. * image_features @ classifier
+            tmp1, tmp5 = match_any_accuracy(logits, labels, (1, 5))
+            top1 += tmp1
+            top5 += tmp5
+            n += images.size(0)
+
+    print(f"results {top1}, {top5}, {n}, {top1/n}, {top5/n}")
+    return top1, top5, n
+
+def parse_results(results, result_json):
+    with open(result_json) as f:
+        result = json.load(f)
+    print("zero-shot classification dollar street:", result['acc'])
+    results['zero_shot_classification_dollar_street'] = result['acc']
+
+def main(model, preprocess_val, tokenizer, result_json):
+    top1, top5, n = evaluate(model, preprocess_val, tokenizer)
+    with open(result_json, "w") as f:
+        json.dump({"top1": top1, "top5": top5, "total": n, "acc": top1/n}, f)