Skip to content

Commit 7e78f43

Browse files
committed
detect file modification
1 parent f87f747 commit 7e78f43

File tree

1 file changed

+122
-78
lines changed

1 file changed

+122
-78
lines changed

utils.py

Lines changed: 122 additions & 78 deletions
Original file line number | Diff line number | Diff line change
@@ -4,84 +4,128 @@
44
from efficient_ir import EfficientIR
55

66

7-
# Module-level engine instance shared by the functions below; they call its
# get_fv/add_fv/match/save_index methods and use its hnsw_index attribute.
ir_engine = EfficientIR()
8-
name_index_path = 'index/name_index.json' # location of the file-path index
9-
10-
11-
def get_file_list(target_dir, accepted_exts=None):
    """Return the paths of all image files found under ``target_dir``.

    The directory tree is walked recursively with ``os.walk``. A file is kept
    when its lower-cased name ends with one of the accepted extensions.

    Args:
        target_dir: Root directory to scan. A directory that does not exist
            simply yields an empty list, because ``os.walk`` yields nothing.
        accepted_exts: Optional iterable of extensions (with the leading dot).
            Defaults to the common raster image formats listed below.

    Returns:
        A list of paths, each built as ``os.path.join(root, name)``.
    """
    if accepted_exts is None:
        accepted_exts = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif', '.webp')
    # str.endswith needs a tuple; build it once rather than once per file.
    # Extensions are lower-cased because file names are lower-cased before
    # the comparison.
    suffixes = tuple(ext.lower() for ext in accepted_exts)
    return [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(target_dir)
        for name in files
        if name.lower().endswith(suffixes)
    ]
19-
20-
21-
def get_exists_index():
    """Load and return the file-path index stored at ``name_index_path``.

    Returns:
        The decoded JSON document (the other functions in this module store a
        list of path strings there).

    Raises:
        OSError: If the index file cannot be opened (e.g. it does not exist).
        ValueError: If the file does not contain valid JSON.
    """
    # Use a context manager so the handle is closed deterministically instead
    # of being left to the garbage collector.
    with open(name_index_path, 'rb') as fp:
        return json.loads(fp.read())
23-
24-
25-
def index_target_dir(target_dir):
    """Merge the image files under ``target_dir`` into the file-path index.

    The existing index (if any) is loaded from ``name_index_path``, every
    path returned by ``get_file_list(target_dir)`` that is not yet present is
    appended, and the result is written back as UTF-8 encoded JSON.

    Existing entries keep their position; new paths are only ever appended.

    Args:
        target_dir: Directory to scan for image files.

    Returns:
        The updated list of paths.
    """
    exists_index = []
    if os.path.exists(name_index_path):
        with open(name_index_path, 'rb') as fp:
            exists_index = json.loads(fp.read())

    # Membership is tested against a set so that merging stays linear in the
    # number of files instead of scanning the whole list for every path.
    known = set(exists_index)
    for path in get_file_list(target_dir):
        if path not in known:
            known.add(path)
            exists_index.append(path)

    with open(name_index_path, 'wb') as wp:
        wp.write(json.dumps(exists_index, ensure_ascii=False).encode('UTF-8'))
    return exists_index
36-
37-
38-
def update_ir_index(exists_index):
    """Add the not-yet-indexed tail of ``exists_index`` to the engine index.

    Processing starts at the position equal to the engine's current element
    count and runs to the end of ``exists_index``. Each position is used as
    the label for the corresponding path. Paths for which ``get_fv`` returns
    ``None`` are skipped. The engine index is saved once at the end.

    Args:
        exists_index: List of file paths; list position is the label.
    """
    first_new = ir_engine.hnsw_index.get_current_count()
    pending = range(first_new, len(exists_index))
    for position in tqdm(pending, ascii=True):
        vector = ir_engine.get_fv(exists_index[position])
        if vector is not None:
            ir_engine.add_fv(vector, position)
    ir_engine.save_index()
46-
47-
48-
def remove_nonexists():
    """Tombstone index entries whose file no longer exists on disk.

    Every path in the index stored at ``name_index_path`` is checked with
    ``os.path.exists``. A missing file has its entry replaced by the string
    ``'NOTEXISTS'`` (positions are kept so labels stay valid) and its label is
    marked as deleted in the engine index. The index file is then rewritten.
    """
    exists_index = []
    if os.path.exists(name_index_path):
        with open(name_index_path, 'rb') as fp:
            exists_index = json.loads(fp.read())

    for idx in tqdm(range(len(exists_index)), ascii=True):
        path = exists_index[idx]
        # Entries tombstoned by an earlier run must not be deleted a second
        # time; only look at real paths.
        if path == 'NOTEXISTS' or os.path.exists(path):
            continue
        exists_index[idx] = 'NOTEXISTS'
        try:
            ir_engine.hnsw_index.mark_deleted(idx)
        except RuntimeError:
            # NOTE(review): presumably raised when the label was never added
            # (update_ir_index skips files without a feature vector) or is
            # already deleted -- confirm against the index implementation.
            # Either way there is nothing left to delete, and the tombstone
            # must still be written to disk below.
            pass

    with open(name_index_path, 'wb') as wp:
        wp.write(json.dumps(exists_index, ensure_ascii=False).encode('UTF-8'))
58-
59-
60-
def checkout(image_path, exists_index, match_n=5):
    """Look up the indexed files that best match the image at ``image_path``.

    Args:
        image_path: Path of the query image, handed to ``ir_engine.get_fv``.
        exists_index: List of indexed file paths; the ids returned by the
            engine are used as positions in this list.
        match_n: Number of matches requested from ``ir_engine.match``.

    Returns:
        A list of ``(score, path)`` tuples, one per id returned by the engine.
    """
    query = ir_engine.get_fv(image_path)
    scores, labels = ir_engine.match(query, match_n)
    hits = []
    for pos in range(len(labels)):
        hits.append((scores[pos], exists_index[labels[pos]]))
    return hits
64-
65-
66-
def get_duplicate(exists_index, threshold):
    """Yield pairs of indexed files whose match score reaches ``threshold``.

    For every position in ``exists_index`` the stored vector is fetched from
    the engine index (positions that raise ``RuntimeError`` are skipped) and
    matched against the index. The number of requested matches starts at 5
    and grows by a factor of 1.5 for as long as the last returned score is
    still above ``threshold``.

    Args:
        exists_index: List of file paths; list position is the label.
        threshold: Scores below this value are not reported.

    Yields:
        ``(path, other_path, score)`` tuples. A match is not reported when it
        is the query itself or when the other file has already been reported
        as a query.
    """
    seen = set()
    for query_id in tqdm(range(len(exists_index)), ascii=True):
        wanted = 5
        try:
            vector = ir_engine.hnsw_index.get_items([query_id])[0]
        except RuntimeError:
            continue
        scores, neighbours = ir_engine.match(vector, wanted)
        # Widen the search until the weakest hit falls to the threshold or below.
        while scores[-1] > threshold:
            wanted = round(wanted * 1.5)
            scores, neighbours = ir_engine.match(vector, wanted)
        for pos in range(len(neighbours)):
            other = neighbours[pos]
            if other == query_id or scores[pos] < threshold or other in seen:
                continue
            seen.add(query_id)
            yield (exists_index[query_id], exists_index[other], scores[pos])
7+
# Tombstone written into the file index in place of a path whose file no
# longer exists. Positions are never removed because they double as labels.
NOTEXISTS = 'NOTEXISTS'


class Utils:
    """Keep an on-disk image file index and an EfficientIR index in step.

    Two JSON files are maintained:

    * ``exists_index_path`` -- a list of image paths. The position of a path
      in the list is the label handed to the engine, so entries are only ever
      appended or overwritten with ``NOTEXISTS``.
    * ``metainfo_path`` -- a list parallel to the file index holding
      ``[size, mtime]`` per entry (or ``null`` when no metadata could be
      collected), used to detect files that changed since the last run.
    """

    def __init__(self, config):
        """Store the index paths and build the engine from ``config``.

        Args:
            config: Mapping providing the keys ``metainfo_path``,
                ``exists_index_path``, ``img_size``, ``index_capacity``,
                ``index_path`` and ``model_path``. The last four are passed
                positionally to ``EfficientIR``.
        """
        self.metainfo_path = config['metainfo_path']
        self.exists_index_path = config['exists_index_path']
        self.ir_engine = EfficientIR(
            config['img_size'],
            config['index_capacity'],
            config['index_path'],
            config['model_path'],
        )

    @staticmethod
    def _read_json(path):
        """Return the decoded JSON document stored at ``path``."""
        with open(path, 'rb') as fp:
            return json.loads(fp.read())

    @staticmethod
    def _read_json_list(path):
        """Return the JSON document at ``path``, or ``[]`` if it is missing."""
        if os.path.exists(path):
            return Utils._read_json(path)
        return []

    @staticmethod
    def _write_json(path, payload):
        """Write ``payload`` to ``path`` as UTF-8 JSON without ASCII escaping."""
        with open(path, 'wb') as wp:
            wp.write(json.dumps(payload, ensure_ascii=False).encode('UTF-8'))

    def get_exists_index(self):
        """Load and return the file index.

        Raises:
            OSError: If the index file cannot be opened.
            ValueError: If the file does not contain valid JSON.
        """
        return self._read_json(self.exists_index_path)

    def get_file_list(self, target_dir):
        """Return the paths of all image files found under ``target_dir``.

        The tree is walked recursively; a file is kept when its lower-cased
        name ends with one of the accepted image extensions.
        """
        accepted_exts = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif', '.webp')
        return [
            os.path.join(root, name)
            for root, _dirs, files in os.walk(target_dir)
            for name in files
            if name.lower().endswith(accepted_exts)
        ]

    def index_target_dir(self, target_dir):
        """Merge ``target_dir`` into the file index and find files to (re)index.

        Both JSON files are loaded when present, new paths are appended to
        the file index, and the size and modification time of every live
        entry are compared with the stored metadata. Both files are written
        back before returning.

        Args:
            target_dir: Directory to scan for image files.

        Returns:
            A list of ``(label, path)`` tuples, in ascending label order, for
            entries that are new or whose size or mtime changed.
        """
        # Load the existing file index and metadata index if they exist.
        exists_index = self._read_json_list(self.exists_index_path)
        metainfo = self._read_json_list(self.metainfo_path)

        # Update the file index. A set keeps the merge linear in the number
        # of files instead of scanning the list for every discovered path.
        known = set(exists_index)
        for path in self.get_file_list(target_dir):
            if path not in known:
                known.add(path)
                exists_index.append(path)

        # Update the metadata index and collect the labels needing features.
        need_index = []
        for i, path in enumerate(exists_index):
            current = None
            # Compare by equality: a real path that merely contains the
            # sentinel text must still be indexed.
            if path != NOTEXISTS:
                try:
                    st = os.stat(path)
                    current = [st.st_size, st.st_mtime]
                except OSError:
                    # The file vanished but has not been tombstoned yet;
                    # leave it for remove_nonexists instead of crashing.
                    current = None
            if i >= len(metainfo):
                # Always append -- a placeholder for entries without
                # metadata -- so that metainfo[i] keeps describing
                # exists_index[i] for every later entry.
                metainfo.append(current)
                if current is not None:
                    need_index.append(i)
                continue
            if current is not None and metainfo[i] != current:
                metainfo[i] = current
                need_index.append(i)

        # Persist both indexes.
        self._write_json(self.exists_index_path, exists_index)
        self._write_json(self.metainfo_path, metainfo)
        return [(i, exists_index[i]) for i in need_index]

    def update_ir_index(self, need_index):
        """Compute and store features for the given entries, then save.

        Args:
            need_index: Iterable of ``(label, path)`` tuples, as returned by
                ``index_target_dir``. Paths for which ``get_fv`` returns
                ``None`` are skipped.
        """
        for idx, fpath in tqdm(need_index, ascii=True, desc='更新索引记录'):
            fv = self.ir_engine.get_fv(fpath)
            if fv is None:
                continue
            self.ir_engine.add_fv(fv, idx)
        self.ir_engine.save_index()

    def remove_nonexists(self):
        """Tombstone file-index entries whose file no longer exists.

        A missing file has its entry replaced by ``NOTEXISTS`` and its label
        marked as deleted in the engine index. The file index is rewritten
        afterwards.
        """
        exists_index = self._read_json_list(self.exists_index_path)
        for idx in tqdm(range(len(exists_index)), ascii=True, desc='删除不存在文件'):
            path = exists_index[idx]
            # Entries tombstoned by an earlier run must not be deleted again.
            if path == NOTEXISTS or os.path.exists(path):
                continue
            exists_index[idx] = NOTEXISTS
            try:
                self.ir_engine.hnsw_index.mark_deleted(idx)
            except RuntimeError:
                # NOTE(review): presumably raised when the label was never
                # added (update_ir_index skips files without a feature
                # vector) or is already deleted -- confirm against the index
                # implementation. get_duplicate treats RuntimeError from the
                # index the same way. The tombstone must still be saved.
                pass
        self._write_json(self.exists_index_path, exists_index)

    def checkout(self, image_path, exists_index, match_n=5):
        """Look up the indexed files that best match ``image_path``.

        Args:
            image_path: Path of the query image.
            exists_index: File index; ids returned by the engine are used as
                positions in this list.
            match_n: Number of matches requested from the engine.

        Returns:
            A list of ``(score, path)`` tuples.
        """
        fv = self.ir_engine.get_fv(image_path)
        sim, ids = self.ir_engine.match(fv, match_n)
        return [(score, exists_index[label]) for score, label in zip(sim, ids)]

    def get_duplicate(self, exists_index, threshold, same_folder):
        """Yield pairs of indexed files whose match score reaches ``threshold``.

        Args:
            exists_index: File index; list position is the label.
            threshold: Scores below this value are not reported.
            same_folder: When true, only pairs located in the same directory
                are yielded.

        Yields:
            ``(path_a, path_b, score)`` tuples. A match is skipped when it is
            the query itself or when the other file has already served as a
            reported query.
        """
        total = len(exists_index)
        matched = set()
        for idx in tqdm(range(total), ascii=True, desc='检索重复图像中'):
            match_n = 5
            try:
                fv = self.ir_engine.hnsw_index.get_items([idx])[0]
            except RuntimeError:
                # No vector is stored for this label; nothing to compare.
                continue
            sim, ids = self.ir_engine.match(fv, match_n)
            # Widen the search while even the weakest hit is above the
            # threshold. Asking for more neighbours than there are entries
            # cannot produce new hits, so the growth is capped there.
            while sim[-1] > threshold and match_n < total:
                match_n = min(round(match_n * 1.5), total)
                sim, ids = self.ir_engine.match(fv, match_n)
            for i in range(len(ids)):
                if ids[i] == idx:
                    continue
                if sim[i] < threshold:
                    continue
                if ids[i] in matched:
                    continue
                # Recorded before the folder filter below, so the query
                # counts as matched even when this pair is filtered out.
                matched.add(idx)
                path_a = exists_index[idx]
                path_b = exists_index[ids[i]]
                if same_folder:
                    if os.path.dirname(path_a) != os.path.dirname(path_b):
                        continue
                yield (path_a, path_b, sim[i])

0 commit comments

Comments
 (0)