77 Optional ,
88)
99
10+ from slips_files .common .data_structures .trie import Trie
11+
12+ # for future developers, remember to call _invalidate_trie_cache() on every
13+ # change to the self.constants.IOC_DOMAINS key, or slips will keep using a
14+ # stale cache to look up malicious domains
15+
1016
1117class IoCHandler :
1218 """
@@ -17,6 +23,38 @@ class IoCHandler:
1723
1824 name = "DB"
1925
26+ def __init__ (self ):
27+ # used for faster domain lookups
28+ self .trie = None
29+ self .is_trie_cached = False
30+
def _build_trie(self):
    """
    Read all the domains stored in the IOC_DOMAINS key and index them
    in a new trie, then mark the trie as cached.
    """
    self.trie = Trie()
    stored_domains: Dict[str, str] = self.rcache.hgetall(
        self.constants.IOC_DOMAINS
    )
    # each stored value is a serialized dict, something like this
    # {"description": "['hack''malware''phishing']",
    # "source": "OCD-Datalake-russia-ukraine_IOCs-ALL.csv",
    # "threat_level": "medium",
    # "tags": ["Russia-UkraineIoCs"]}
    for blacklisted_domain, serialized_info in stored_domains.items():
        # keep the parsed info in the trie, not the raw string
        parsed_info = json.loads(serialized_info)
        self.trie.insert(blacklisted_domain, parsed_info)
    self.is_trie_cached = True
49+
50+ def _invalidate_trie_cache (self ):
51+ """
52+ Invalidate the trie cache.
53+ used whenever IOC_DOMAINS key is updated.
54+ """
55+ self .trie = None
56+ self .is_trie_cached = False
57+
2058 def set_loaded_ti_files (self , number_of_loaded_files : int ):
2159 """
2260 Stores the number of successfully loaded TI files
@@ -43,6 +81,7 @@ def delete_feed_entries(self, url: str):
4381 if feed_to_delete in domain_description ["source" ]:
4482 # this entry has the given feed as source, delete it
4583 self .rcache .hdel (self .constants .IOC_DOMAINS , domain )
84+ self ._invalidate_trie_cache ()
4685
4786 # get all IPs that are read from TI files in our db
4887 ioc_ips = self .rcache .hgetall (self .constants .IOC_IPS )
@@ -139,6 +178,7 @@ def delete_domains_from_ioc_domains(self, domains: List[str]):
139178 Delete old domains from IoC
140179 """
141180 self .rcache .hdel (self .constants .IOC_DOMAINS , * domains )
181+ self ._invalidate_trie_cache ()
142182
143183 def add_ips_to_ioc (self , ips_and_description : Dict [str , str ]) -> None :
144184 """
@@ -164,6 +204,7 @@ def add_domains_to_ioc(self, domains_and_description: dict) -> None:
164204 self .rcache .hmset (
165205 self .constants .IOC_DOMAINS , domains_and_description
166206 )
207+ self ._invalidate_trie_cache ()
167208
168209 def add_ip_range_to_ioc (self , malicious_ip_ranges : dict ) -> None :
169210 """
@@ -239,43 +280,53 @@ def is_blacklisted_ssl(self, sha1):
239280 info = self .rcache .hmget (self .constants .IOC_SSL , sha1 )[0 ]
240281 return False if info is None else info
241282
283+ def _match_exact_domain (self , domain : str ) -> Optional [Dict [str , str ]]:
284+ """checks if the given domain is blacklisted.
285+ checks only the exact given domain, no subdomains"""
286+ domain_description = self .rcache .hget (
287+ self .constants .IOC_DOMAINS , domain
288+ )
289+ if not domain_description :
290+ return
291+ return json .loads (domain_description )
292+
293+ def _match_subdomain (self , domain : str ) -> Optional [Dict [str , str ]]:
294+ """
295+ Checks if we have any blacklisted domain that is a part of the
296+ given domain
297+ Uses a cached trie for optimization.
298+ """
299+ # the goal here is we dont retrieve that huge amount of domains
300+ # from the db on every domain lookup
301+ # so we retrieve once, put em in a trie (aka cache them in memory),
302+ # keep using them from that data structure until a new domain is
303+ # added to the db, when that happens we invalidate the cache,
304+ # rebuild the trie, and keep using it from there.
305+ if not self .is_trie_cached :
306+ self ._build_trie ()
307+
308+ found , domain_info = self .trie .search (domain )
309+ if found :
310+ return domain_info
311+
def is_blacklisted_domain(
    self, domain: str
) -> Tuple[Union[Dict[str, str], bool], bool]:
    """
    Check if the given domain is blacklisted, either as is or because
    a blacklisted domain is a part of it.
    The exact domain is tried first, the subdomain lookup is only done
    when there is no exact match.
    always returns a tuple (description, is_subdomain)
    description: info of the blacklisted domain we matched, or False
        if nothing matched
    is_subdomain: False if we matched exactly the given domain (or
        nothing at all), True if we only matched through the
        subdomain lookup
    """
    if match := self._match_exact_domain(domain):
        # exact hit, so this is not a subdomain match
        return match, False

    if match := self._match_subdomain(domain):
        # only the subdomain lookup matched
        return match, True

    # callers unpack two values, so a miss is a tuple too
    return False, False
279330
280331 def get_all_blacklisted_ip_ranges (self ) -> dict :
281332 """
0 commit comments