
Commit 222e287

chg: [images_engines] improve prompts + add prompt to detect CSAM domains and images
1 parent 528c486 commit 222e287

File tree

bin/lib/images_engine.py
bin/lib/objects/Domains.py
bin/lib/search_engine.py

3 files changed: +135 −5 lines changed

bin/lib/images_engine.py

Lines changed: 116 additions & 5 deletions
@@ -38,7 +38,7 @@ def get_image_obj(obj_gid):
 
 def create_ollama_domain_data(model, descriptions):
     return json.dumps({'model': model,
-                       'prompt': f'From this list of images descriptions, Can you please describe this domain and check if it\'s related to child exploitation?\n\n{descriptions}',
+                       'prompt': f'From this list of images descriptions, Describe this domain\n\n{descriptions}',
                        'stream': False
                        })
 

@@ -49,6 +49,25 @@ def create_ollama_image_data(model, images):
                        'images': images
                        })
 
+def create_ollama_description_csam_classification(model, description):
+    return json.dumps({'model': model,
+                       'prompt': f'Does this description involve CE or CSAM? Answer "Yes" or "No".\nDescription: {description}',
+                       'stream': False
+                       })
+
+def create_ollama_image_csam_classification(model, images):
+    return json.dumps({'model': model,
+                       'prompt': 'Does this image involve CE or CSAM? Answer "Yes" or "No".',
+                       'stream': False,
+                       'images': images
+                       })
+
+def create_ollama_domain_csam_classification(model, descriptions):
+    return json.dumps({'model': model,
+                       'prompt': f'Is this website domain associated with child exploitation? Respond with only "Yes" or "No"\n\n{descriptions}',
+                       'stream': False
+                       })
+
 # screenshot + image
 def api_get_image_description(obj_gid):
     model = get_default_image_description_model()
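For reference, each of the new helpers above serializes a single non-streaming request body for Ollama's /api/generate endpoint, with images passed as base64-encoded bytes. A minimal sketch of the payload the image classifier builds; the model name and file path are placeholders, not anything this commit pins down:

import base64
import json

# Placeholder inputs for illustration only.
model = 'llava'
with open('screenshot.png', 'rb') as f:
    b64 = base64.standard_b64encode(f.read()).decode()

# Same body shape as create_ollama_image_csam_classification(model, [b64]):
payload = json.dumps({'model': model,
                      'prompt': 'Does this image involve CE or CSAM? Answer "Yes" or "No".',
                      'stream': False,    # one JSON reply instead of a token stream
                      'images': [b64]})   # Ollama expects base64-encoded image bytes

Disabling streaming matters later in the file: the classification code reads the whole answer from a single response field rather than assembling chunks.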
@@ -72,7 +91,7 @@ def api_get_image_description(obj_gid):
         return {"status": "error", "reason": f"ollama requests error: {e}"}, 400
     if res.status_code != 200:
         # TODO LOG
-        return {"status": "error", "reason": f" llama requests error: {res.status_code}, {res.text}"}, 400
+        return {"status": "error", "reason": f"ollama requests error: {res.status_code}, {res.text}"}, 400
     else:
         r = res.json()
         if r:
@@ -117,7 +136,7 @@ def get_domain_description(domain_id, reprocess=True):
         return {"status": "error", "reason": f"ollama requests error: {e}"}, 400
     if res.status_code != 200:
         # TODO LOG
-        return {"status": "error", "reason": f" llama requests error: {res.status_code}, {res.text}"}, 400
+        return {"status": "error", "reason": f"ollama requests error: {res.status_code}, {res.text}"}, 400
     else:
         r = res.json()
         if r:
@@ -152,6 +171,98 @@ def _create_image_description():
         # print(f'{done}/{total} {progress}%')
 
 
+def update_domain_description(domain, model):
+    domain.delete_description(model)
+    search_engine.remove_document('desc-dom', domain.get_global_id())
+    get_domain_description(domain.get_id(), reprocess=False)
+
+def update_domains_descriptions():
+    nb_domains = Domains.get_nb_domains_up_by_type('onion') + Domains.get_nb_domains_up_by_type('web')
+    model = get_default_image_description_model()
+    done = 0
+    for domain in Domains.get_domain_up_iterator():
+        update_domain_description(domain, model)
+        done += 1
+        progress = int(done * 100 / nb_domains)
+        print(f'{done}/{nb_domains} {progress}%')
+    search_engine.delete_index('desc-dom')
+
+
+def check_is_image_csam(obj_gid, image_description=False):
+    model = get_default_image_description_model()
+
+    image = get_image_obj(obj_gid)
+    if not image:
+        return {"status": "error", "reason": "Unknown image"}, 404
+
+    headers = {"Connection": "close", 'Content-Type': 'application/json', 'Accept': 'application/json'}
+    is_csam = None
+
+    # Check if image description is CSAM related
+    if image_description:
+        description = api_get_image_description(obj_gid)
+        if description[1] == 200:
+            description = description[0]
+        data = create_ollama_description_csam_classification(model, description)
+
+    # Check If image content is CSAM
+    else:
+        b64 = image.get_base64()
+        if not b64:
+            return {"status": "error", "reason": "No Content"}, 404
+        data = create_ollama_image_csam_classification(model, [b64])
+
+    if data:
+        try:
+            res = requests.post(f'{OLLAMA_URL}/api/generate', data=data, headers=headers)
+        except Exception as e:
+            return {"status": "error", "reason": f"ollama requests error: {e}"}, 400
+        if res.status_code != 200:
+            # TODO LOG
+            return {"status": "error", "reason": f"ollama requests error: {res.status_code}, {res.text}"}, 400
+        else:
+            r = res.json()
+            if r:
+                res = image.add_description_model(model, r['response'])
+                if res == 'YES':
+                    is_csam = True
+                elif res == 'NO':
+                    is_csam = False
+
+    # TODO LOG NONE result
+    if is_csam:
+        image.add_tag('dark-web:topic="pornography-child-exploitation"')
+    print(obj_gid, is_csam)
+
+    return is_csam, 200
+
+def check_images_csam(image_description=False):
+    for image in Images.get_all_images_objects():
+        check_is_image_csam(image.get_global_id(), image_description=image_description)
+
+
+def check_if_domain_csam(domain_id):
+    model = get_default_image_description_model()
+    domain = Domains.Domain(domain_id)
+    description = get_domain_description(domain_id)
+
+    headers = {"Connection": "close", 'Content-Type': 'application/json', 'Accept': 'application/json'}
+    try:
+        res = requests.post(f'{OLLAMA_URL}/api/generate', data=create_ollama_domain_csam_classification(model, description), headers=headers)
+    except Exception as e:  # TODO LOG
+        return {"status": "error", "reason": f"ollama requests error: {e}"}, 400
+    if res.status_code != 200:
+        # TODO LOG
+        return {"status": "error", "reason": f"ollama requests error: {res.status_code}, {res.text}"}, 400
+    else:
+        r = res.json()
+        if r:
+            if r['response'] == 'yes':
+                print('yes')
+                domain.add_tag('dark-web:topic="pornography-child-exploitation"')
+
+
 if __name__ == '__main__':
-    # _create_domains_up_description()
-    _create_image_description()
+    update_domains_descriptions()
+    # check_images_csam()
+    # _create_image_description()
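Taken together, check_is_image_csam() and check_if_domain_csam() follow the same round trip: build one of the prompts above, POST it to Ollama, and map a literal Yes/No in the response field to the dark-web:topic="pornography-child-exploitation" tag applied in the diff. A standalone sketch of that loop, with the URL and model as assumed placeholders and a plain string answer in place of AIL's object layer:

import json

import requests

OLLAMA_URL = 'http://127.0.0.1:11434'   # assumed local Ollama instance
MODEL = 'mistral'                       # placeholder text model

def classify_description(description):
    """Return True/False for a Yes/No answer, or None when unusable."""
    data = json.dumps({'model': MODEL,
                       'prompt': f'Does this description involve CE or CSAM? Answer "Yes" or "No".\nDescription: {description}',
                       'stream': False})
    res = requests.post(f'{OLLAMA_URL}/api/generate', data=data,
                        headers={'Content-Type': 'application/json'})
    if res.status_code != 200:
        return None                      # caller decides how to log the failure
    answer = res.json().get('response', '').strip().strip('.').upper()
    if answer == 'YES':
        return True
    if answer == 'NO':
        return False
    return None                          # model ignored the Yes/No instruction

# e.g. tag the object only on a positive answer, as the commit does:
# if classify_description(desc):
#     obj.add_tag('dark-web:topic="pornography-child-exploitation"')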

bin/lib/objects/Domains.py

Lines changed: 5 additions & 0 deletions
@@ -356,6 +356,11 @@ def get_description(self, model=None):
             description = description.replace("`", ' ')
         return description
 
+    def delete_description(self, model=None):
+        if model is None:
+            model = get_default_image_description_model()
+        self._delete_field(f'desc:{model}')
+
     ## -Descriptions- ##
 
     ## Search ##
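The per-model field name (desc:<model>) means a domain keeps one cached description per model and drops only the stale one when reprocessing. A rough sketch of that layout on a plain Redis hash, purely illustrative (the key name is hypothetical, and AIL's actual storage sits behind its _delete_field() wrapper, which may differ):

import redis

r = redis.Redis(decode_responses=True)       # placeholder connection

domain_key = 'meta:domain:example.onion'     # hypothetical key name
r.hset(domain_key, 'desc:llava', 'old description generated by llava')
r.hset(domain_key, 'desc:mistral', 'description generated by mistral')

# Equivalent of Domain.delete_description('llava'): remove one model's field,
# leaving descriptions produced by other models untouched.
r.hdel(domain_key, 'desc:llava')
print(r.hkeys(domain_key))  # -> ['desc:mistral']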

bin/lib/search_engine.py

Lines changed: 14 additions & 0 deletions
@@ -6,6 +6,7 @@
 import logging.config
 import sys
 import time
+import uuid
 
 import meilisearch
 
@@ -31,6 +32,11 @@
 M_KEY = config_loader.get_config_str('Indexer', 'meilisearch_key')
 config_loader = None
 
+
+def get_obj_uuid5(obj_gid):
+    return str(uuid.uuid5(uuid.NAMESPACE_URL, obj_gid))
+
+
 def is_meilisearch_enabled():
     return IS_MEILISEARCH_ENABLED
 
@@ -175,6 +181,14 @@ def index_domains_descriptions():
         index_domain_description(dom_id)
 
 
+def remove_document(index_name, obj_gid):
+    Engine.remove(index_name, get_obj_uuid5(obj_gid))
+
+
+def delete_index(index_name):
+    Engine._delete(index_name)
+    Engine.client.create_index(index_name, {'primaryKey': 'uuid'})
+
 def log(user_id, index, to_search):
     logger.warning(f'{user_id} search: {index} - {to_search}')
 
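get_obj_uuid5() makes the Meilisearch document id a pure function of the object's global id, so a document can be deleted later without storing its id anywhere. A sketch of the same idea written directly against the meilisearch client; the connection details are placeholders, and in AIL the client is wrapped by the Engine class configured from the Indexer section:

import uuid

import meilisearch

def get_obj_uuid5(obj_gid):
    # Deterministic: the same global id always maps to the same document uuid.
    return str(uuid.uuid5(uuid.NAMESPACE_URL, obj_gid))

client = meilisearch.Client('http://127.0.0.1:7700', 'masterKey')  # placeholder

def remove_document(index_name, obj_gid):
    # Mirrors the new search_engine.remove_document(): delete one document by
    # recomputing its uuid from the object's global id.
    client.index(index_name).delete_document(get_obj_uuid5(obj_gid))

def reset_index(index_name):
    # Mirrors delete_index(): drop the index, then recreate it empty with
    # 'uuid' as the primary key so re-indexing can start from scratch.
    client.delete_index(index_name)
    client.create_index(index_name, {'primaryKey': 'uuid'})

update_domains_descriptions() in images_engine.py relies on this derivation: it removes each stale desc-dom document by uuid before requesting a fresh description for the domain.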
