stratosphereips · AlyaGomaa · Nov 18, 2024 · Nov 18, 2024 · Nov 18, 2024 · Nov 18, 2024
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -16,11 +16,12 @@ repos:
       hooks:
         - id: trailing-whitespace
         - id: check-added-large-files
+          # bundled list of known FP MD5 hashes, exempt from the size check
+          exclude: ^config/local_ti_files/known_fp_md5_hashes\.csv$
         - id: check-docstring-first
         - id: check-merge-conflict
         - id: end-of-file-fixer
         - id: detect-private-key
-          exclude: .*dataset/.*|
+          exclude: .*dataset/.* |
                 (?x)(
                      ^config/$|
                       .*test.* |

diff --git a/config/local_ti_files/known_fp_md5_hashes.csv b/config/local_ti_files/known_fp_md5_hashes.csv
diff --git a/docs/detection_modules.md b/docs/detection_modules.md
@@ -449,6 +449,14 @@ Example:
     "6734f37431670b3ab4292b8f60f29984", "high", "Trickbot Malwar"
 
 
+### Whitelisting known FP hashes
+
+To avoid false positive "Malicious downloaded file" detections, Slips checks whether the MD5 hash of each downloaded file is in a list of known false positive (FP) hashes before looking it up online.
+
+The list of known FP MD5 hashes is at config/local_ti_files/known_fp_md5_hashes.csv. This list is taken from https://github.com/Neo23x0/ti-falsepositives/tree/master
+
+If the hash is part of that list, Slips treats the file as benign and doesn't look it up online.
+
 ### Adding your own remote feed
 
 

diff --git a/modules/threat_intelligence/threat_intelligence.py b/modules/threat_intelligence/threat_intelligence.py
@@ -62,7 +62,7 @@ def init(self):
         self.urlhaus = URLhaus(self.db)
         self.spamhaus = Spamhaus(self.db)
         self.pending_queries = multiprocessing.Queue()
-        self.calls_thread = threading.Thread(
+        self.pending_circllu_calls_thread = threading.Thread(
             target=self.handle_pending_queries, daemon=True
         )
         self.circllu = Circllu(self.db, self.pending_queries)
@@ -559,6 +559,23 @@ def set_evidence_malicious_domain(
     def is_valid_threat_level(self, threat_level):
         return threat_level in utils.threat_levels
 
+    def parse_known_fp_hashes(self, fullpath: str):
+        """
+        Parses the local file of known false positive (FP) MD5 hashes and
+        stores them in the database.
+
+        Each non-comment line is expected to hold a description followed by
+        one or more hashes, all separated by ", ".
+
+        Parameters:
+            fullpath (str): Path to the known FP hashes file.
+        """
+        fp_hashes = {}
+        with open(fullpath) as fps:
+            for line in fps:
+                # drop the trailing newline, otherwise the last hash of
+                # every line is stored as "<md5>\n" and never matches
+                line = line.strip()
+                # skip blank lines and comments
+                if not line or line.startswith("#"):
+                    continue
+
+                # the first field is the description, the rest are hashes
+                description, *hashes = line.split(", ")
+                for md5 in hashes:
+                    md5 = md5.strip()
+                    if md5:
+                        fp_hashes[md5] = description
+
+        self.db.store_known_fp_md5_hashes(fp_hashes)
+
     def parse_local_ti_file(self, ti_file_path: str) -> bool:
         """Parses a local threat intelligence (TI) file to extract
          and store various indicators of compromise (IoCs), including IP
@@ -824,34 +841,20 @@ def parse_ja3_file(self, path):
         return True
 
     def parse_jarm_file(self, path):
-        """Parses a file containing JARM hashes, their associated threat levels, and
-        descriptions, then stores this information in the database. The file is expected
-        to follow a specific format where each line contains a JARM hash, its threat
-        level, and a descriptive text, separated by commas.
+        """
+        Parses a file of JARM hashes with their threat levels and descriptions, then stores the data in the database.
 
         Parameters:
-            path (str): The absolute path to the local file containing
-            JARM hashes.
-
+        path (str): Absolute path to the JARM hash file.
         Returns:
-            bool: Always returns True to indicate the method has executed.
-             This behavior could be modified in the future to reflect the
-              success status of parsing and database storage operations.
 
-        This method processes each line of the provided file, skipping any
-        lines that are commented out or improperly formatted. It validates
-         the threat level of each JARM hash against a predefined list of
-         valid levels,
-         defaulting to 'medium' if the provided level is not recognized.
+        bool: Always True, indicating execution success (may change in the future).
+        Details:
 
-        Side Effects:
-            - Populates the database with new JARM hash records extracted
-            from the provided file. Existing records for a JARM hash are
-            not explicitly handled in this method, so duplicate entries
-            could occur if not managed elsewhere.
-            - Logs the progress of reading the file, including a message
-            indicating the start of the process and any errors related to
-            invalid line formats.
+        Processes each line, skipping comments and invalid formats.
+        Validates threat levels, defaulting to 'medium' if unrecognized.
+        Populates the database with parsed JARM hash records (duplicates are not handled).
+        Logs progress, including errors for invalid lines.
         """
         filename = os.path.basename(path)
         jarm_dict = {}
@@ -898,7 +901,6 @@ def parse_jarm_file(self, path):
                         "threat_level": threat_level,
                     }
                 )
-        # Add all loaded JARM to the database
         self.db.add_jarm_to_IoC(jarm_dict)
         return True
 
@@ -1407,6 +1409,11 @@ def is_malicious_hash(self, flow_info: dict):
             # .. }
             return
 
+        if self.db.is_known_fp_md5_hash(flow_info["flow"]["md5"]):
+            # this is a known FP https://github.com/Neo23x0/ti-falsepositives/tree/master
+            # it's benign, so don't look it up
+            return
+
         if blacklist_details := self.search_online_for_hash(flow_info):
             # the md5 appeared in a blacklist
             # update the blacklist_details dict with uid,
@@ -1706,16 +1713,14 @@ def update_local_file(self, filename):
              of the TI file.
         """
         fullpath = os.path.join(self.path_to_local_ti_files, filename)
+        parsers = {
+            "own_malicious_iocs.csv": self.parse_local_ti_file,
+            "own_malicious_JA3.csv": self.parse_ja3_file,
+            "own_malicious_JARM.csv": self.parse_jarm_file,
+            "known_fp_md5_hashes.csv": self.parse_known_fp_hashes,
+        }
         if filehash := self.should_update_local_ti_file(fullpath):
-            if "JA3" in filename:
-                # Load updated data to the database
-                self.parse_ja3_file(fullpath)
-            elif "JARM" in filename:
-                # Load updated data to the database
-                self.parse_jarm_file(fullpath)
-            else:
-                # Load updated data to the database
-                self.parse_local_ti_file(fullpath)
+            parsers[filename](fullpath)
             # Store the new etag and time of file in the database
             malicious_file_info = {"hash": filehash}
             self.db.set_ti_feed_info(filename, malicious_file_info)
@@ -1767,11 +1772,12 @@ def pre_main(self):
             "own_malicious_iocs.csv",
             "own_malicious_JA3.csv",
             "own_malicious_JARM.csv",
+            "known_fp_md5_hashes.csv",
         )
         for local_file in local_files:
             self.update_local_file(local_file)
 
-        self.calls_thread.start()
+        self.pending_circllu_calls_thread.start()
 
     def main(self):
         # The channel can receive an IP address or a domain name

diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
@@ -86,6 +86,12 @@ def get_ip_info(self, *args, **kwargs):
     def set_new_ip(self, *args, **kwargs):
         return self.rdb.set_new_ip(*args, **kwargs)
 
+    def store_known_fp_md5_hashes(self, *args, **kwargs):
+        """Delegates to store_known_fp_md5_hashes() of self.rdb."""
+        store = self.rdb.store_known_fp_md5_hashes
+        return store(*args, **kwargs)
+
+    def is_known_fp_md5_hash(self, *args, **kwargs):
+        """Delegates to is_known_fp_md5_hash() of self.rdb."""
+        lookup = self.rdb.is_known_fp_md5_hash
+        return lookup(*args, **kwargs)
+
     def ask_for_ip_info(self, *args, **kwargs):
         return self.rdb.ask_for_ip_info(*args, **kwargs)
 

diff --git a/slips_files/core/database/redis_db/constants.py b/slips_files/core/database/redis_db/constants.py
@@ -18,6 +18,7 @@ class Constants:
     DOMAINS_INFO = "DomainsInfo"
     IPS_INFO = "IPsInfo"
     PROCESSED_FLOWS = "processed_flows_so_far"
+    KNOWN_FPS = "known_fps"
 
 
 class Channels:

diff --git a/slips_files/core/database/redis_db/ioc_handler.py b/slips_files/core/database/redis_db/ioc_handler.py
@@ -4,6 +4,7 @@
     List,
     Tuple,
     Union,
+    Optional,
 )
 
 
@@ -119,6 +120,14 @@ def set_ti_feed_info(self, file, data):
         data = json.dumps(data)
         self.rcache.hset(self.constants.TI_FILES_INFO, file, data)
 
+    def store_known_fp_md5_hashes(self, fps: Dict[str, str]):
+        """
+        Stores the known false positive MD5 hashes in the cache db
+        :param fps: maps each MD5 hash to the description of the known FP
+        """
+        if not fps:
+            # redis rejects an empty mapping, and there's nothing to store
+            return
+        # hmset() is deprecated in redis-py, hset(mapping=...) replaces it
+        self.rcache.hset(self.constants.KNOWN_FPS, mapping=fps)
+
+    def is_known_fp_md5_hash(self, hash: str) -> Optional[str]:
+        """
+        returns the description of the given hash if it is a known FP,
+        and None if it is not
+        """
+        # hmget() returns a list ([None] on a miss) which is always truthy,
+        # hget() returns the stored value itself or None
+        return self.rcache.hget(self.constants.KNOWN_FPS, hash)
+
     def delete_ips_from_IoC_ips(self, ips: List[str]):
         """
         Delete the given IPs from IoC

diff --git a/tests/test_threat_intelligence.py b/tests/test_threat_intelligence.py
@@ -944,7 +944,7 @@ def test_pre_main(mocker):
     threatintel = ModuleFactory().create_threatintel_obj()
     mocker.patch.object(threatintel, "update_local_file")
     threatintel.pre_main()
-    assert threatintel.update_local_file.call_count == 3
+    assert threatintel.update_local_file.call_count == 4
 
 
 @pytest.mark.parametrize(
@@ -1178,7 +1178,7 @@ def test_is_malicious_hash(
     recording evidence of malicious file hashes.
     """
     threatintel = ModuleFactory().create_threatintel_obj()
-
+    threatintel.db.is_known_fp_md5_hash.return_value = False
     mock_search_online_for_hash = mocker.patch.object(
         threatintel, "search_online_for_hash"
     )
@@ -1197,11 +1197,19 @@ def test_is_malicious_hash(
         "twid": "timewindow1",
     }
     mock_search_online_for_hash.return_value = search_online_result
+
     threatintel.is_malicious_hash(flow_info)
 
     assert threatintel.db.set_evidence.called == expected_set_evidence_call
 
 
+def test_is_malicious_hash_known_fp_md5(mocker):
+    """
+    a hash that the db reports as a known FP should never be looked up
+    online, and no evidence should be set for it
+    """
+    threatintel = ModuleFactory().create_threatintel_obj()
+    threatintel.db.is_known_fp_md5_hash.return_value = True
+    mock_search_online_for_hash = mocker.patch.object(
+        threatintel, "search_online_for_hash"
+    )
+    flow = {"flow": {"md5": "c0eec84d09bbb7f4cd1a8896f9dff718"}}
+
+    assert threatintel.is_malicious_hash(flow) is None
+    # returning None alone proves nothing, make sure the lookup was skipped
+    mock_search_online_for_hash.assert_not_called()
+    threatintel.db.set_evidence.assert_not_called()
+
+
 @pytest.mark.parametrize(
     "url, result, is_malicious",
     [