|
4 | 4 | from efficient_ir import EfficientIR |
5 | 5 |
|
6 | 6 |
|
7 | | -ir_engine = EfficientIR() |
8 | | -name_index_path = 'index/name_index.json' # location of the file-path index (JSON list of image paths) |
9 | | - |
10 | | - |
def get_file_list(target_dir, accepted_exts=None):
    """Recursively collect the image files found under ``target_dir``.

    Args:
        target_dir: Directory to walk. A directory that does not exist
            simply produces an empty result (``os.walk`` yields nothing).
        accepted_exts: Optional iterable of file extensions (with leading
            dot) to accept. Matching is case-insensitive. Defaults to the
            common raster image formats.

    Returns:
        A list of paths (``target_dir`` joined with the relative location)
        for every file whose name ends with one of the accepted extensions.
    """
    if accepted_exts is None:
        accepted_exts = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif', '.webp')
    # str.endswith needs a tuple; build it once instead of once per file.
    suffixes = tuple(ext.lower() for ext in accepted_exts)

    file_path_list = []
    for root, _dirs, files in os.walk(target_dir):
        for name in files:
            if name.lower().endswith(suffixes):
                file_path_list.append(os.path.join(root, name))
    return file_path_list
19 | | - |
20 | | - |
def get_exists_index():
    """Load and return the file-path index stored at ``name_index_path``.

    Returns:
        The decoded JSON content of the index file.

    Raises:
        OSError: If the index file cannot be opened (e.g. it does not exist).
        ValueError: If the file does not contain valid JSON.
    """
    # Use a context manager so the handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(name_index_path, 'rb') as rp:
        return json.loads(rp.read())
23 | | - |
24 | | - |
def index_target_dir(target_dir):
    """Merge the image files under ``target_dir`` into the path index.

    The existing index at ``name_index_path`` is loaded when present, every
    path returned by ``get_file_list(target_dir)`` that is not yet listed is
    appended (existing positions are never changed), and the result is
    written back as UTF-8 encoded JSON.

    Args:
        target_dir: Directory whose image files should be indexed.

    Returns:
        The complete, updated list of indexed paths.
    """
    exists_index = []
    if os.path.exists(name_index_path):
        with open(name_index_path, 'rb') as rp:
            exists_index = json.loads(rp.read())

    # Set membership keeps the merge linear; testing against the list itself
    # is quadratic for large collections.
    known = set(exists_index)
    for path in get_file_list(target_dir):
        if path not in known:
            known.add(path)
            exists_index.append(path)

    with open(name_index_path, 'wb') as wp:
        wp.write(json.dumps(exists_index, ensure_ascii=False).encode('UTF-8'))
    return exists_index
36 | | - |
37 | | - |
def update_ir_index(exists_index):
    """Add feature vectors for paths not yet present in the vector index.

    Processing starts at the position equal to the number of elements the
    vector index currently reports and runs to the end of ``exists_index``.
    Paths for which no feature vector can be produced are skipped. The
    vector index is saved once at the end.

    Args:
        exists_index: List of image paths; the list position is used as the
            label of the stored vector.
    """
    first_pending = ir_engine.hnsw_index.get_current_count()
    pending_positions = range(first_pending, len(exists_index))
    for position in tqdm(pending_positions, ascii=True):
        vector = ir_engine.get_fv(exists_index[position])
        if vector is not None:
            ir_engine.add_fv(vector, position)
    ir_engine.save_index()
46 | | - |
47 | | - |
def remove_nonexists():
    """Replace index entries whose file is gone with a placeholder.

    Every path in the index at ``name_index_path`` that no longer exists on
    disk is overwritten with ``'NOTEXISTS'`` (keeping list positions stable)
    and its label is marked as deleted in the vector index. The updated
    index is written back as UTF-8 encoded JSON.
    """
    exists_index = []
    if os.path.exists(name_index_path):
        with open(name_index_path, 'rb') as rp:
            exists_index = json.loads(rp.read())

    for idx in tqdm(range(len(exists_index)), ascii=True):
        path = exists_index[idx]
        # Entries already replaced on an earlier run must not be deleted
        # again: the vector index rejects a second deletion of a label.
        if path == 'NOTEXISTS' or os.path.exists(path):
            continue
        exists_index[idx] = 'NOTEXISTS'
        try:
            ir_engine.hnsw_index.mark_deleted(idx)
        except RuntimeError:
            # The label has no stored vector (or is already deleted);
            # there is nothing left to remove for this entry.
            pass

    with open(name_index_path, 'wb') as wp:
        wp.write(json.dumps(exists_index, ensure_ascii=False).encode('UTF-8'))
58 | | - |
59 | | - |
def checkout(image_path, exists_index, match_n=5):
    """Look up the indexed images most similar to ``image_path``.

    Args:
        image_path: Path of the query image.
        exists_index: List of indexed paths used to translate the labels
            returned by the engine back into paths.
        match_n: Number of matches requested from the engine.

    Returns:
        A list of ``(similarity, path)`` tuples, one per returned match.
        The list is empty when no feature vector could be produced for the
        query image.
    """
    fv = ir_engine.get_fv(image_path)
    if fv is None:
        # get_fv reports an unusable image with None (the indexing code
        # guards for it as well); there is nothing to match against.
        return []
    sim, ids = ir_engine.match(fv, match_n)
    return [(score, exists_index[label]) for score, label in zip(sim, ids)]
64 | | - |
65 | | - |
def get_duplicate(exists_index, threshold):
    """Yield pairs of indexed images whose similarity reaches ``threshold``.

    Uses the module-level ``ir_engine``. For every position that has a
    stored vector, the neighbour search is widened until the last returned
    hit is no longer above the threshold or the engine stops returning more
    results.

    Args:
        exists_index: List of indexed paths; positions are vector labels.
        threshold: Minimum similarity for two images to count as duplicates.

    Yields:
        ``(path_a, path_b, similarity)`` tuples.
    """
    matched = set()
    for idx in tqdm(range(len(exists_index)), ascii=True):
        match_n = 5
        try:
            fv = ir_engine.hnsw_index.get_items([idx])[0]
        except RuntimeError:
            # No vector is stored under this label.
            continue
        sim, ids = ir_engine.match(fv, match_n)
        # NOTE(review): assumes results are ordered most-similar first, so
        # sim[-1] is the weakest hit - confirm against EfficientIR.match.
        while len(sim) > 0 and sim[-1] > threshold:
            previous_hits = len(ids)
            match_n = round(match_n * 1.5)
            sim, ids = ir_engine.match(fv, match_n)
            if len(ids) <= previous_hits:
                # Asking for more neighbours returned nothing new: the index
                # is exhausted, so widening again would loop forever.
                break
        for i in range(len(ids)):
            if ids[i] == idx:
                continue
            if sim[i] < threshold:
                continue
            if ids[i] in matched:
                # The reverse pair was already reported.
                continue
            matched.add(idx)
            yield (exists_index[idx], exists_index[ids[i]], sim[i])


NOTEXISTS = 'NOTEXISTS'


class Utils:
    """Maintain the path index, metadata index and vector index of images.

    Two JSON files are kept in step with the vector index held by
    ``EfficientIR``:

    * the path index: a list of image paths whose positions double as the
      labels of the stored vectors; entries of removed files are replaced
      by ``NOTEXISTS`` so positions never shift;
    * the metadata index: a list, aligned by position with the path index,
      of ``[file_size, file_mtime]`` pairs used to detect changed files.
    """

    def __init__(self, config):
        """Store the index locations and create the retrieval engine.

        Args:
            config: Mapping providing ``metainfo_path``,
                ``exists_index_path``, ``img_size``, ``index_capacity``,
                ``index_path`` and ``model_path``.
        """
        self.metainfo_path = config['metainfo_path']
        self.exists_index_path = config['exists_index_path']
        self.ir_engine = EfficientIR(
            config['img_size'],
            config['index_capacity'],
            config['index_path'],
            config['model_path'],
        )

    @staticmethod
    def _load_json_list(path):
        """Return the JSON content of ``path``, or ``[]`` if it is missing."""
        try:
            with open(path, 'rb') as rp:
                return json.loads(rp.read())
        except FileNotFoundError:
            return []

    @staticmethod
    def _dump_json(path, payload):
        """Write ``payload`` to ``path`` as UTF-8 encoded JSON."""
        with open(path, 'wb') as wp:
            wp.write(json.dumps(payload, ensure_ascii=False).encode('UTF-8'))

    def get_exists_index(self):
        """Load and return the path index.

        Raises:
            OSError: If the index file cannot be opened.
        """
        with open(self.exists_index_path, 'rb') as rp:
            return json.loads(rp.read())

    def get_file_list(self, target_dir):
        """Recursively collect the image files found under ``target_dir``.

        Returns:
            A list of paths for every file whose lower-cased name ends with
            one of the accepted image extensions.
        """
        accepted_exts = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif', '.webp')
        file_path_list = []
        for root, _dirs, files in os.walk(target_dir):
            for name in files:
                if name.lower().endswith(accepted_exts):
                    file_path_list.append(os.path.join(root, name))
        return file_path_list

    def index_target_dir(self, target_dir):
        """Merge ``target_dir`` into the indexes and report what changed.

        New image paths are appended to the path index. For every indexed
        file the current size and modification time are compared with the
        metadata index; new or changed files are scheduled for feature
        extraction. Both index files are rewritten.

        Args:
            target_dir: Directory whose image files should be indexed.

        Returns:
            A list of ``(position, path)`` tuples for the files whose
            feature vector must be (re)computed.
        """
        # Load the existing indexes, if any.
        exists_index = self._load_json_list(self.exists_index_path)
        metainfo = self._load_json_list(self.metainfo_path)

        # Append paths not seen before; a set keeps the merge linear.
        known = set(exists_index)
        for path in self.get_file_list(target_dir):
            if path not in known:
                known.add(path)
                exists_index.append(path)

        # Positions whose feature vector must be (re)computed.
        need_index = []
        for i, path in enumerate(exists_index):
            if i >= len(metainfo):
                # Reserve the slot first so metainfo stays aligned with
                # exists_index even when this entry is skipped below.
                metainfo.append([None, None])
            # Compare for equality: a substring test would also skip real
            # paths that merely contain the placeholder text.
            if path == NOTEXISTS:
                continue
            try:
                file_stat = os.stat(path)
            except OSError:
                # The file vanished since it was indexed; leave it for
                # remove_nonexists() instead of aborting the whole run.
                continue
            current = [file_stat.st_size, file_stat.st_mtime]
            if metainfo[i] != current:
                metainfo[i] = current
                need_index.append(i)

        # Persist both indexes.
        self._dump_json(self.exists_index_path, exists_index)
        self._dump_json(self.metainfo_path, metainfo)
        return [(i, exists_index[i]) for i in need_index]

    def update_ir_index(self, need_index):
        """Compute and store feature vectors, then save the vector index.

        Args:
            need_index: Iterable of ``(position, path)`` tuples as returned
                by ``index_target_dir``. Paths for which no feature vector
                can be produced are skipped.
        """
        for idx, fpath in tqdm(need_index, ascii=True, desc='更新索引记录'):
            fv = self.ir_engine.get_fv(fpath)
            if fv is None:
                continue
            self.ir_engine.add_fv(fv, idx)
        self.ir_engine.save_index()

    def remove_nonexists(self):
        """Replace path-index entries whose file is gone with ``NOTEXISTS``.

        The label of every newly missing file is also marked as deleted in
        the vector index. The path index is rewritten afterwards.
        """
        exists_index = self._load_json_list(self.exists_index_path)
        for idx in tqdm(range(len(exists_index)), ascii=True, desc='删除不存在文件'):
            path = exists_index[idx]
            # Entries replaced on an earlier run must not be deleted again:
            # the vector index rejects a second deletion of a label.
            if path == NOTEXISTS or os.path.exists(path):
                continue
            exists_index[idx] = NOTEXISTS
            try:
                self.ir_engine.hnsw_index.mark_deleted(idx)
            except RuntimeError:
                # The label has no stored vector (or is already deleted);
                # nothing is left to remove for this entry.
                pass
        self._dump_json(self.exists_index_path, exists_index)

    def checkout(self, image_path, exists_index, match_n=5):
        """Look up the indexed images most similar to ``image_path``.

        Returns:
            A list of ``(similarity, path)`` tuples, or an empty list when
            no feature vector could be produced for the query image.
        """
        fv = self.ir_engine.get_fv(image_path)
        if fv is None:
            return []
        sim, ids = self.ir_engine.match(fv, match_n)
        return [(score, exists_index[label]) for score, label in zip(sim, ids)]

    def get_duplicate(self, exists_index, threshold, same_folder=False):
        """Yield pairs of indexed images whose similarity reaches ``threshold``.

        Args:
            exists_index: List of indexed paths; positions are vector labels.
            threshold: Minimum similarity for two images to be duplicates.
            same_folder: When true, only report pairs located in the same
                directory.

        Yields:
            ``(path_a, path_b, similarity)`` tuples. Placeholder entries of
            removed files are never reported.
        """
        matched = set()
        for idx in tqdm(range(len(exists_index)), ascii=True, desc='检索重复图像中'):
            path_a = exists_index[idx]
            if path_a == NOTEXISTS:
                continue
            match_n = 5
            try:
                fv = self.ir_engine.hnsw_index.get_items([idx])[0]
            except RuntimeError:
                # No vector is stored under this label.
                continue
            sim, ids = self.ir_engine.match(fv, match_n)
            # NOTE(review): assumes results are ordered most-similar first,
            # so sim[-1] is the weakest hit - confirm against
            # EfficientIR.match.
            while len(sim) > 0 and sim[-1] > threshold:
                previous_hits = len(ids)
                match_n = round(match_n * 1.5)
                sim, ids = self.ir_engine.match(fv, match_n)
                if len(ids) <= previous_hits:
                    # Widening returned nothing new: the index is exhausted
                    # and another round would loop forever.
                    break
            for i in range(len(ids)):
                if ids[i] == idx:
                    continue
                if sim[i] < threshold:
                    continue
                if ids[i] in matched:
                    # The reverse pair was already reported.
                    continue
                path_b = exists_index[ids[i]]
                if path_b == NOTEXISTS:
                    continue
                if same_folder and os.path.dirname(path_a) != os.path.dirname(path_b):
                    continue
                # Record idx only once a pair is actually reported.
                matched.add(idx)
                yield (path_a, path_b, sim[i])
0 commit comments