@@ -234,39 +234,37 @@ def deduplicate_object(ioc, obj, expiry, expiry_max, threshold, source_name, con
234234 }
235235 )
236236 return
237- else :
238- # T1 Not Found Protocol: We have a possibly unique object
239- ioc .getLogger ().debug ("Type1 Match not found; Beginning type 2 processing" )
240- # Create a T1
241- T1ObjectId = DeDupCollection .insert_one ({
242- 'expiry' : Deduplication .generate_expiry_time (int (expiry )),
243- 'grease_internal_configuration' : configuration_name ,
244- 'max_expiry' : Deduplication .generate_max_expiry_time (int (expiry_max )),
245- 'type' : 1 ,
246- 'score' : 1 ,
247- 'source' : str (source_name ),
248- 'hash' : Deduplication .generate_hash_from_obj (t1test )
249- }).inserted_id
250- # Begin T2 Deduplication
251- compositeScore = Deduplication .object_field_score (
252- collection , ioc , source_name , configuration_name , obj , str (T1ObjectId ), expiry , expiry_max , field_set
237+ # T1 Not Found Protocol: We have a possibly unique object
238+ ioc .getLogger ().debug ("Type1 Match not found; Beginning type 2 processing" )
239+ # Create a T1
240+ T1ObjectId = DeDupCollection .insert_one ({
241+ 'expiry' : Deduplication .generate_expiry_time (int (expiry )),
242+ 'grease_internal_configuration' : configuration_name ,
243+ 'max_expiry' : Deduplication .generate_max_expiry_time (int (expiry_max )),
244+ 'type' : 1 ,
245+ 'score' : 1 ,
246+ 'source' : str (source_name ),
247+ 'hash' : Deduplication .generate_hash_from_obj (t1test )
248+ }).inserted_id
249+ # Begin T2 Deduplication
250+ compositeScore = Deduplication .object_field_score (
251+ collection , ioc , source_name , configuration_name , obj , str (T1ObjectId ), expiry , expiry_max , field_set
252+ )
253+ if compositeScore < threshold :
254+ # unique obj
255+ ioc .getLogger ().trace (
256+ "Unique object! Composite score was: [{0}] threashold: [{1}]" .format (compositeScore , threshold ),
257+ verbose = True
253258 )
254- if compositeScore < threshold :
255- # unique obj
256- ioc .getLogger ().trace (
257- "Unique object! Composite score was: [{0}] threashold: [{1}]" .format (compositeScore , threshold ),
258- verbose = True
259- )
260- final .append (obj )
261- return
262- else :
263- # likely duplicate value
264- ioc .getLogger ().trace (
265- "Object surpassed threshold, suspected to be duplicate! "
266- "Composite score was: [{0}] threashold: [{1}]" .format (compositeScore , threshold ),
267- verbose = True
268- )
269- return
259+ final .append (obj )
260+ return
261+ # likely duplicate value
262+ ioc .getLogger ().trace (
263+ "Object surpassed threshold, suspected to be duplicate! "
264+ "Composite score was: [{0}] threashold: [{1}]" .format (compositeScore , threshold ),
265+ verbose = True
266+ )
267+ return
270268
271269 @staticmethod
272270 def object_field_score (collection , ioc , source_name , configuration_name , obj , objectId , expiry , max_expiry , field_set = None ):
@@ -361,8 +359,46 @@ def object_field_score(collection, ioc, source_name, configuration_name, obj, ob
361359 continue
362360 if len (field_scores ) is 0 :
363361 return 0.0
364- else :
365- return float (sum (field_scores ) / float (len (field_scores )))
362+ return float (sum (field_scores ) / float (len (field_scores )))
363+
@staticmethod
def make_hashable(obj):
    """Flatten a dictionary into a sorted tuple of its key/value pairs

    Nested containers are converted by `Deduplication.make_hashable_helper`; this method gathers
    the items of that result into a tuple.

    Args:
        obj (dict): A dictionary

    Returns:
        tuple: a sorted, flattened tuple of the dictionary's key value pairs

    Example:
        {
            "a": ["test1", "test2"],
            "b": [{"test2": 21}, {"test1": 1}, {"test7": 3}],
            "c": "test"
        }
        becomes...
        (('a', ('test1', 'test2')),
         ('b', ((('test1', 1),), (('test2', 21),), (('test7', 3),))),
         ('c', 'test'))

    """
    # tuple() walks the helper's result item by item, exactly like an append loop would
    return tuple(Deduplication.make_hashable_helper(obj))
388+
@staticmethod
def make_hashable_helper(obj):
    """Recursively turn containers into sorted tuples

    Lists, tuples, sets and frozensets become a sorted tuple of their converted elements.
    Dictionaries become a sorted tuple of ``(key, converted value)`` pairs. Any other value is
    returned unchanged.

    Elements are sorted by their natural ordering whenever they are mutually comparable. On
    Python 3 `sorted()` raises TypeError for elements that are not (for example ``[1, "a"]``,
    ``[None, 3]`` or ``[{"k": 1}, {"k": "x"}]``); in that case the elements are ordered by their
    `repr()` instead, so heterogeneous data still converts and the result does not depend on the
    order the elements arrived in.

    Args:
        obj (object): The value to convert

    Returns:
        object: A sorted tuple when `obj` is a dict, list, tuple, set or frozenset; otherwise `obj`

    """
    def _convert(value):
        # Build the converted children first so a TypeError caught below can only come from sorting
        if isinstance(value, dict):
            items = [(key, _convert(item)) for key, item in value.items()]
        elif isinstance(value, (tuple, list, set, frozenset)):
            items = [_convert(item) for item in value]
        else:
            return value
        try:
            return tuple(sorted(items))
        except TypeError:
            # Elements are not mutually orderable; repr() gives a total, deterministic ordering
            return tuple(sorted(items, key=repr))

    return _convert(obj)
366402
367403 @staticmethod
368404 def generate_hash_from_obj (obj ):
@@ -375,7 +411,7 @@ def generate_hash_from_obj(obj):
375411 str: Object Hash
376412
377413 """
378- return hashlib .sha256 (str ( obj ).encode ('utf-8' )).hexdigest ()
414+ return hashlib .sha256 (repr ( Deduplication . make_hashable ( obj ) ).encode ('utf-8' )).hexdigest ()
379415
380416 @staticmethod
381417 def generate_expiry_time (hours ):
0 commit comments