Skip to content

Commit c793a16

Browse files
committed
Add perceptual hashing (phash) support for image similarity detection
1 parent eb205e4 commit c793a16

File tree

16 files changed

+1881
-4
lines changed

16 files changed

+1881
-4
lines changed

bin/lib/ail_core.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,15 @@
1818

1919
AIL_OBJECTS = {'author', 'barcode', 'chat', 'chat-subchannel', 'chat-thread', 'cookie-name', 'cve', 'cryptocurrency',
2020
'decoded', 'domain', 'dom-hash', 'etag', 'favicon', 'file-name', 'gtracker', 'hhhash', 'ip',
21-
'item', 'image', 'mail', 'message', 'ocr', 'pdf', 'pgp', 'qrcode', 'ssh-key', 'screenshot', 'title',
21+
'item', 'image', 'mail', 'message', 'ocr', 'pdf', 'phash', 'pgp', 'qrcode', 'ssh-key', 'screenshot', 'title',
2222
'user-account', 'username'}
2323

2424
AIL_OBJECTS_WITH_SUBTYPES = {'chat', 'chat-subchannel', 'cryptocurrency', 'pgp', 'username', 'user-account'}
2525

2626
# TODO by object TYPE ???? correlation
2727
AIL_OBJECTS_CORRELATIONS_DEFAULT = {'author', 'barcode', 'chat', 'chat-subchannel', 'chat-thread', 'cve', 'cryptocurrency',
2828
'decoded', 'domain', 'dom-hash', 'favicon', 'file-name', 'gtracker', 'item',
29-
'image', 'ip', 'mail', 'message', 'ocr', 'pdf', 'pgp', 'qrcode', 'screenshot',
29+
'image', 'ip', 'mail', 'message', 'ocr', 'pdf', 'phash', 'pgp', 'qrcode', 'screenshot',
3030
'ssh-key', 'title', 'user-account', 'username'}
3131

3232
AIL_OBJS_QUEUES = {'barcode', 'decoded', 'file-name', 'image', 'item', 'message', 'ocr', 'pgp', 'qrcode', 'screenshot', 'title'} # ADD TAGS ???

bin/lib/correlations_engine.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,16 +57,17 @@
5757
"file-name": ["chat", "item", "message", "pdf"],
5858
"gtracker": ["domain", "item"],
5959
"hhhash": ["domain"],
60-
"image": ["barcode", "chat", "chat-subchannel", "chat-thread", "message", "ocr", "qrcode", "user-account"], # TODO subchannel + threads ????
60+
"image": ["barcode", "chat", "chat-subchannel", "chat-thread", "message", "ocr", "phash", "qrcode", "user-account", "image", "screenshot"], # TODO subchannel + threads ????
6161
"ip": ["ssh-key"],
6262
"item": ["cve", "cryptocurrency", "decoded", "domain", "dom-hash", "favicon", "file-name", "gtracker", "mail", "message", "pdf", "pgp", "screenshot", "title", "username"], # chat ???
6363
"mail": ["domain", "item", "message"], # chat ??
6464
"message": ["barcode", "chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "domain", "file-name", "image", "item", "mail", "ocr", "pdf", "pgp", "user-account"],
6565
"ocr": ["chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "image", "message", "pgp", "user-account"],
6666
"pdf": ["author", "chat", "file-name", "item", "message"],
67+
"phash": ["image", "phash"],
6768
"pgp": ["chat", "domain", "item", "message", "ocr"],
6869
"qrcode": ["chat", "cve", "cryptocurrency", "decoded", "domain", "image", "message", "screenshot"], # "chat-subchannel", "chat-thread" ?????
69-
"screenshot": ["barcode", "domain", "item", "qrcode"],
70+
"screenshot": ["barcode", "domain", "item", "qrcode", "image"],
7071
"ssh-key": ["domain", "ip"],
7172
"title": ["domain", "item"],
7273
"user-account": ["chat", "chat-subchannel", "chat-thread", "image", "message", "ocr", "username"],

bin/lib/objects/Images.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,15 @@
1212
from flask import url_for
1313
from pymisp import MISPObject
1414

15+
try:
16+
from PIL import Image as PILImage
17+
from PIL.ExifTags import TAGS
18+
import imagehash
19+
IMAGEHASH_AVAILABLE = True
20+
except ImportError:
21+
IMAGEHASH_AVAILABLE = False
22+
TAGS = None
23+
1524
sys.path.append(os.environ['AIL_BIN'])
1625
##################################
1726
# Import Project packages
@@ -116,6 +125,66 @@ def get_description(self, model=None):
116125
description = description.replace("`", ' ')
117126
return description
118127

128+
def calculate_phash(self):
    """Compute the perceptual hash (pHash) of this image's file.

    Returns:
        The string form of ``imagehash.phash`` for the file located at
        ``self.get_filepath()``, or None when the optional imaging
        libraries could not be imported, the object does not exist, or
        any error is raised while opening or hashing the file.
    """
    # Both preconditions short-circuit exactly as two separate guards would.
    if not (IMAGEHASH_AVAILABLE and self.exists()):
        return None

    try:
        with PILImage.open(self.get_filepath()) as pil_img:
            digest = str(imagehash.phash(pil_img))
    except Exception as e:
        # Best-effort: report the failure and signal "no hash" to the caller.
        self.logger.warning(f"Failed to calculate phash for image {self.id}: {e}")
        return None
    return digest
144+
145+
def get_phash(self):
    """Return the perceptual hash, computing and storing it when missing.

    The value is read from the object's 'phash' field. When that field is
    empty, ``calculate_phash()`` is called and a truthy result is written
    back to the same field before being returned.

    Returns:
        The stored or freshly calculated phash, or the falsy result of
        ``calculate_phash()`` when no hash could be produced.
    """
    cached = self._get_field('phash')
    if cached:
        return cached

    computed = self.calculate_phash()
    if not computed:
        # Nothing to persist; hand back the falsy result unchanged.
        return computed

    self._set_field('phash', computed)
    return computed
156+
157+
def set_phash(self, phash_value):
    """Write ``phash_value`` to the object's 'phash' field.

    Args:
        phash_value: The phash string to store. Falsy values (None, empty
            string) are ignored and leave the field untouched.
    """
    if not phash_value:
        return
    self._set_field('phash', phash_value)
161+
162+
def compare_phash(self, other_phash):
    """
    Measure how far this image's phash is from another phash.

    Both values are parsed with ``imagehash.hex_to_hash`` and subtracted,
    which yields their Hamming distance.

    Args:
        other_phash: Another phash value (hex string) to compare with

    Returns:
        int: Hamming distance (0-64 for the default 64-bit phash), or None
        if the imaging libraries are unavailable, either phash is missing,
        or the comparison raises.
    """
    if not IMAGEHASH_AVAILABLE:
        return None

    # get_phash() may calculate and store the hash, so it runs before the
    # emptiness check on other_phash.
    own_phash = self.get_phash()
    if not (own_phash and other_phash):
        return None

    try:
        own_hash = imagehash.hex_to_hash(own_phash)
        other_hash = imagehash.hex_to_hash(other_phash)
        distance = own_hash - other_hash  # Hamming distance
    except Exception as e:
        self.logger.warning(f"Failed to compare phash for image {self.id}: {e}")
        return None
    return distance
187+
119188
def get_search_document(self):
120189
global_id = self.get_global_id()
121190
content = self.get_description()

bin/lib/objects/Phashs.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
#!/usr/bin/env python3
2+
# -*-coding:UTF-8 -*
3+
4+
import os
5+
import sys
6+
7+
from flask import url_for
8+
from pymisp import MISPObject
9+
10+
sys.path.append(os.environ['AIL_BIN'])
11+
##################################
12+
# Import Project packages
13+
##################################
14+
from lib.ConfigLoader import ConfigLoader
15+
from lib.objects.abstract_daterange_object import AbstractDaterangeObject, AbstractDaterangeObjects
16+
17+
# Load the settings this module needs once, at import time.
config_loader = ConfigLoader()
# NOTE(review): r_objects is not referenced by the code visible in this file - confirm it is needed.
r_objects = config_loader.get_db_conn('Kvrocks_Objects')
# Base URL used to build links when no Flask context is available.
baseurl = config_loader.get_config_str('Notifications', 'ail_domain')
# Drop the loader reference once the values have been read.
config_loader = None
21+
22+
23+
class Phash(AbstractDaterangeObject):
    """
    AIL Phash Object.

    Represents a perceptual hash value for images. The object id is
    exported as the phash value (see get_misp_object).
    """

    def __init__(self, id):
        super(Phash, self).__init__('phash', id)

    def delete(self):
        """Placeholder: deletion is not implemented and this is a no-op."""
        # TODO: Implement delete functionality
        pass

    def get_link(self, flask_context=False):
        """Return the URL of this object's correlation page.

        Args:
            flask_context: When True, build the URL with ``url_for``;
                otherwise build an absolute URL from the configured base URL.
        """
        if flask_context:
            url = url_for('correlation.show_correlation', type=self.type, id=self.id)
        else:
            url = f'{baseurl}/correlation/show?type={self.type}&id={self.id}'
        return url

    def get_svg_icon(self):
        """Return the icon description used for correlation graph visualization."""
        return {'style': 'fas', 'icon': '\uf1c0', 'color': '#E1F5DF', 'radius': 5}

    def get_misp_object(self):
        """Build a MISP object named 'phash' describing this object.

        The object carries first/last seen dates when available and a single
        'phash' attribute holding this object's id, tagged with every tag
        returned by ``get_tags()``.

        Returns:
            MISPObject: The populated MISP object.
        """
        obj = MISPObject('phash')
        first_seen = self.get_first_seen()
        last_seen = self.get_last_seen()
        if first_seen:
            obj.first_seen = first_seen
        if last_seen:
            obj.last_seen = last_seen
        if not first_seen or not last_seen:
            # Export still proceeds; the missing date is only reported.
            self.logger.warning(
                f'Export error, None seen {self.type}:{self.subtype}:{self.id}, first={first_seen}, last={last_seen}')

        # Only one attribute is exported (no tool attribute).
        phash_attr = obj.add_attribute('phash', value=self.get_id())
        for tag in self.get_tags():
            phash_attr.add_tag(tag)
        return obj

    def get_nb_seen(self):
        """Return the number of 'image' correlations of this phash."""
        return self.get_nb_correlation('image')

    def get_meta(self, options=None):
        """Return this object's metadata dictionary.

        Args:
            options: Optional set of option names forwarded to ``_get_meta``.
                None (the default) is treated as an empty set, so no mutable
                default object is shared between calls.

        Returns:
            dict: The result of ``_get_meta`` extended with 'id' and 'tags'.
        """
        if options is None:
            options = set()
        meta = self._get_meta(options=options)
        meta['id'] = self.id
        meta['tags'] = self.get_tags(r_list=True)
        return meta

    def create(self, _first_seen=None, _last_seen=None):
        """Create the object by delegating to ``_create()``.

        Args:
            _first_seen: Accepted for signature compatibility; currently unused.
            _last_seen: Accepted for signature compatibility; currently unused.
        """
        self._create()
78+
79+
80+
def create(phash_value, obj_id=None):
    """
    Return the Phash object for a hash value, creating it when absent.

    Args:
        phash_value: The phash string value
        obj_id: Optional phash ID; when None, phash_value is used as the ID

    Returns:
        Phash object
    """
    phash_obj = Phash(phash_value if obj_id is None else obj_id)
    if not phash_obj.exists():
        phash_obj.create()
    return phash_obj
97+
98+
99+
class Phashs(AbstractDaterangeObjects):
    """
    Collection-level accessor for Phash objects.
    """

    def __init__(self):
        super().__init__('phash', Phash)

    def get_name(self):
        """Return the display name of this object collection."""
        return 'Phashs'

    def get_icon(self):
        """Return the icon description for this object collection."""
        return dict(fa='fa-solid', icon='image')

    def get_link(self, flask_context=False):
        """Return the URL of the phash listing page.

        Args:
            flask_context: When True, resolve the URL through ``url_for``;
                otherwise build it from the configured base URL.
        """
        if not flask_context:
            return f'{baseurl}/objects/phashes'
        return url_for('objects_phash.objects_phashes')

    def sanitize_id_to_search(self, name_to_search):
        """Return the search term unchanged."""
        return name_to_search
121+

bin/lib/objects/Screenshots.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,13 @@
1111
from flask import url_for
1212
from pymisp import MISPObject
1313

14+
try:
15+
from PIL import Image as PILImage
16+
import imagehash
17+
IMAGEHASH_AVAILABLE = True
18+
except ImportError:
19+
IMAGEHASH_AVAILABLE = False
20+
1421
sys.path.append(os.environ['AIL_BIN'])
1522
##################################
1623
# Import Project packages
@@ -114,6 +121,40 @@ def get_description(self, model=None):
114121
model = get_default_image_description_model()
115122
return self._get_field(f'desc:{model}')
116123

124+
def calculate_phash(self):
    """Calculate the perceptual hash (pHash) for the screenshot.

    Returns:
        The string form of ``imagehash.phash`` for the file located at
        ``self.get_filepath()``, or None when the optional imaging
        libraries could not be imported, the object does not exist, or
        opening/hashing the file raises.
    """
    if not IMAGEHASH_AVAILABLE:
        return None

    if not self.exists():
        return None

    try:
        filepath = self.get_filepath()
        with PILImage.open(filepath) as img:
            return str(imagehash.phash(img))
    except Exception as e:
        # Stay best-effort (never propagate), but report the failure instead
        # of dropping it silently.
        import logging
        # Fall back to a module logger in case the object exposes no logger.
        logger = getattr(self, 'logger', None) or logging.getLogger(__name__)
        logger.warning(f"Failed to calculate phash for screenshot {self.id}: {e}")
        return None
140+
141+
def get_phash(self):
    """Return the screenshot's perceptual hash, calculating it on demand.

    The 'phash' field is consulted first. When it is empty the hash is
    calculated with ``calculate_phash()`` and, if that yields a truthy
    value, stored in the 'phash' field.

    Returns:
        The stored or newly calculated phash, or the falsy result of
        ``calculate_phash()`` when no hash is available.
    """
    value = self._get_field('phash')
    if not value:
        # Not stored yet: calculate, and persist only a usable result.
        value = self.calculate_phash()
        if value:
            self._set_field('phash', value)
    return value
152+
153+
def set_phash(self, phash_value):
    """Write ``phash_value`` to the screenshot's 'phash' field.

    Args:
        phash_value: The phash string to store. Falsy values are ignored
            and leave the field untouched.
    """
    if not phash_value:
        return
    self._set_field('phash', phash_value)
157+
117158
def get_search_document(self):
118159
global_id = self.get_global_id()
119160
content = self.get_description()

bin/lib/objects/ail_objects.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
from lib.objects import Messages
4646
from lib.objects import Ocrs
4747
from lib.objects import PDFs
48+
from lib.objects import Phashs
4849
from lib.objects import Pgps
4950
from lib.objects import QrCodes
5051
from lib.objects import Screenshots
@@ -81,6 +82,7 @@
8182
'message': {'obj': Messages.Message, 'objs': None}, #############################################################
8283
'ocr': {'obj': Ocrs.Ocr, 'objs': Ocrs.Ocrs},
8384
'pdf': {'obj': PDFs.PDF, 'objs': PDFs.PDFs},
85+
'phash': {'obj': Phashs.Phash, 'objs': Phashs.Phashs},
8486
'pgp': {'obj': Pgps.Pgp, 'objs': Pgps.Pgps},
8587
'qrcode': {'obj': QrCodes.Qrcode, 'objs': QrCodes.Qrcodes},
8688
'screenshot': {'obj': Screenshots.Screenshot, 'objs': None}, ####################################################################################################

bin/modules/ImagePhash.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#!/usr/bin/env python3
2+
# -*-coding:UTF-8 -*
3+
"""
4+
The ImagePhash Module
5+
======================
6+
7+
Calculates perceptual hash (phash) for images when they are imported.
8+
Creates Phash objects and correlates them with Images.
9+
"""
10+
11+
##################################
12+
# Import External packages
13+
##################################
14+
import os
15+
import sys
16+
17+
sys.path.append(os.environ['AIL_BIN'])
18+
##################################
19+
# Import Project packages
20+
##################################
21+
from modules.abstract_module import AbstractModule
22+
from lib.objects import Images
23+
from lib.objects import Phashs
24+
25+
26+
class Phash(AbstractModule):
    """
    Phash module for AIL framework.

    Calculates the perceptual hash of the object being processed, stores it
    on that object and links it to a Phash object.
    """

    def __init__(self):
        super().__init__()

        # Waiting time in seconds between two processed messages
        self.pending_seconds = 1

        # Send module state to logs
        self.logger.info(f'Module {self.module_name} initialized')

    def compute(self, message):
        """Hash the current object and register the resulting Phash.

        Args:
            message: Passed through as the date when adding the image to the
                Phash object and as the message queued for 'PhashCorrelation'.
                (Presumably a date string - confirm against the sender.)

        Returns:
            None in every case; a warning is logged when no hash is produced.
        """
        image = self.get_obj()

        digest = image.calculate_phash()
        if not digest:
            self.logger.warning(f'Failed to calculate phash for image {image.id}')
            return None

        # Keep the hash on the image itself for quick access
        image.set_phash(digest)

        # Create or fetch the Phash object, then attach the image to it
        phash_obj = Phashs.create(digest)
        phash_obj.add(message, image)

        self.logger.debug(f'Created Phash object {digest} for image {image.id}')

        # Hand the Phash object over for correlation processing
        self.add_message_to_queue(obj=phash_obj, queue='PhashCorrelation', message=message)
64+
65+
66+
if __name__ == '__main__':
    # Script entry point: build the module and start it.
    phash_module = Phash()
    phash_module.run()
70+

0 commit comments

Comments
 (0)