1+ import numpy as np
2+ import math
13import datetime
24from deepdiff .deephash import DeepHash
35from deepdiff .helper import (
@@ -31,7 +33,7 @@ def _get_rough_distance(self):
3133 """
3234
3335 _distance = get_numeric_types_distance (
34- self .t1 , self .t2 , max_ = self .cutoff_distance_for_pairs )
36+ self .t1 , self .t2 , max_ = self .cutoff_distance_for_pairs , use_log_scale = self . use_log_scale , log_scale_similarity_threshold = self . log_scale_similarity_threshold )
3537
3638 if _distance is not not_found :
3739 return _distance
@@ -122,7 +124,10 @@ def _precalculate_numpy_arrays_distance(
122124
123125 distances = _get_numpy_array_distance (
124126 pairs_transposed [0 ], pairs_transposed [1 ],
125- max_ = self .cutoff_distance_for_pairs )
127+ max_ = self .cutoff_distance_for_pairs ,
128+ use_log_scale = self .use_log_scale ,
129+ log_scale_similarity_threshold = self .log_scale_similarity_threshold ,
130+ )
126131
127132 i = 0
128133 for added_hash in hashes_added :
@@ -186,14 +191,19 @@ def _get_item_length(item, parents_ids=frozenset([])):
186191 return length
187192
188193
189- def _get_numbers_distance (num1 , num2 , max_ = 1 ):
194+ def _get_numbers_distance (num1 , num2 , max_ = 1 , use_log_scale = False , log_scale_similarity_threshold = 0.1 ):
190195 """
191196 Get the distance of 2 numbers. The output is a number between 0 to the max.
192197 The reason is the
193198 When max is returned means the 2 numbers are really far, and 0 means they are equal.
194199 """
195200 if num1 == num2 :
196201 return 0
202+ if use_log_scale :
203+ distance = logarithmic_distance (num1 , num2 )
204+ if distance < logarithmic_distance :
205+ return 0
206+ return distance
197207 if not isinstance (num1 , float ):
198208 num1 = float (num1 )
199209 if not isinstance (num2 , float ):
@@ -218,8 +228,42 @@ def _numpy_div(a, b, replace_inf_with=1):
218228 result [a == b ] = 0
219229 return result
220230
# Small positive offset added to magnitudes before taking a logarithm, so
# that an input of exactly zero never reaches log(0).
MATH_LOG_OFFSET = 1e-10


def numpy_apply_log_keep_sign(array, offset=MATH_LOG_OFFSET):
    """
    Map every element of ``array`` to a signed logarithm.

    For each element ``x`` the result has the magnitude of
    ``log(abs(x) + offset)`` and the sign of ``x``.

    NOTE: ``np.copysign`` keeps only the magnitude of its first argument,
    so when ``abs(x) + offset`` is below 1 (negative logarithm) the sign of
    the logarithm itself is discarded and replaced by the sign of ``x``.

    Parameters
    ----------
    array : value accepted by ``np.abs`` / ``np.copysign``
        The numbers to transform.
    offset : float
        Added to the absolute values before the logarithm is taken.
        Defaults to ``MATH_LOG_OFFSET``.

    Returns
    -------
    The element-wise signed logarithm, as produced by ``np.copysign``.
    """
    shifted_magnitude = np.abs(array) + offset
    return np.copysign(np.log(shifted_magnitude), array)
245+
221246
222- def _get_numpy_array_distance (num1 , num2 , max_ = 1 ):
def logarithmic_similarity(a: numbers, b: numbers, threshold: float = 0.1):
    """
    Tell whether two numbers are close to each other on a logarithmic scale.

    Returns True when ``logarithmic_distance(a, b)`` is strictly smaller
    than ``threshold``, False otherwise.

    Rough meaning of some thresholds:
      - 0.05 translates to about 5.1% difference.
      - 0.1 translates to about 10.5% difference.
      - 0.5 translates to about 65% difference.
    """
    log_gap = logarithmic_distance(a, b)
    return log_gap < threshold
254+
255+
def logarithmic_distance(a: numbers, b: numbers):
    """
    Return the absolute difference between the signed logarithms of a and b.

    Both inputs are first converted with ``float()``. Each value ``x`` is
    then mapped to a number whose magnitude is that of
    ``log(abs(x) + MATH_LOG_OFFSET)`` and whose sign is the sign of ``x``;
    the offset keeps an input of zero away from ``log(0)``.

    NOTE: ``math.copysign`` keeps only the magnitude of the logarithm, so a
    negative logarithm (``abs(x) + MATH_LOG_OFFSET`` below 1) loses its own
    sign and takes the sign of ``x`` instead.
    """
    signed_logs = [
        math.copysign(math.log(abs(value) + MATH_LOG_OFFSET), value)
        for value in (float(a), float(b))
    ]
    return abs(signed_logs[0] - signed_logs[1])
264+
265+
266+ def _get_numpy_array_distance (num1 , num2 , max_ = 1 , use_log_scale = False , log_scale_similarity_threshold = 0.1 ):
223267 """
224268 Get the distance of 2 numbers. The output is a number between 0 to the max.
225269 The reason is the
@@ -229,24 +273,32 @@ def _get_numpy_array_distance(num1, num2, max_=1):
229273 # getting the pairs of items during the ingore_order=True
230274 # calculations, we need to make the divisor of comparison very big
231275 # so that any 2 numbers can be chosen as pairs.
276+ if use_log_scale :
277+ num1 = numpy_apply_log_keep_sign (num1 )
278+ num2 = numpy_apply_log_keep_sign (num2 )
279+
232280 divisor = (num1 + num2 ) / max_
233281 result = _numpy_div ((num1 - num2 ), divisor , replace_inf_with = max_ )
234- return np .clip (np .absolute (result ), 0 , max_ )
282+
283+ distance_array = np .clip (np .absolute (result ), 0 , max_ )
284+ if use_log_scale :
285+ distance_array [distance_array < log_scale_similarity_threshold ] = 0
286+ return distance_array
235287
236288
def _get_datetime_distance(date1, date2, max_, use_log_scale=False, log_scale_similarity_threshold=0.1):
    """
    Distance between two objects exposing ``.timestamp()``.

    The two timestamps are handed to ``_get_numbers_distance`` together
    with ``max_`` and its result is returned unchanged.

    ``use_log_scale`` and ``log_scale_similarity_threshold`` are accepted so
    that this function can be called with the same positional arguments as
    the other distance functions. They default to the same values as in
    ``_get_numbers_distance`` so that three-argument callers keep working.
    """
    # NOTE(review): the log-scale options are not forwarded, so the linear
    # distance is always used here -- confirm this is intentional.
    return _get_numbers_distance(date1.timestamp(), date2.timestamp(), max_)
239291
240292
def _get_date_distance(date1, date2, max_, use_log_scale=False, log_scale_similarity_threshold=0.1):
    """
    Distance between two objects exposing ``.toordinal()``.

    The two ordinals are handed to ``_get_numbers_distance`` together with
    ``max_`` and its result is returned unchanged.

    ``use_log_scale`` and ``log_scale_similarity_threshold`` are accepted so
    that this function can be called with the same positional arguments as
    the other distance functions. They default to the same values as in
    ``_get_numbers_distance`` so that three-argument callers keep working.
    """
    # NOTE(review): the log-scale options are not forwarded, so the linear
    # distance is always used here -- confirm this is intentional.
    return _get_numbers_distance(date1.toordinal(), date2.toordinal(), max_)
243295
244296
def _get_timedelta_distance(timedelta1, timedelta2, max_, use_log_scale=False, log_scale_similarity_threshold=0.1):
    """
    Distance between two objects exposing ``.total_seconds()``.

    The two second counts are handed to ``_get_numbers_distance`` together
    with ``max_`` and its result is returned unchanged.

    ``use_log_scale`` and ``log_scale_similarity_threshold`` are accepted so
    that this function can be called with the same positional arguments as
    the other distance functions. They default to the same values as in
    ``_get_numbers_distance`` so that three-argument callers keep working.
    """
    # NOTE(review): the log-scale options are not forwarded, so the linear
    # distance is always used here -- confirm this is intentional.
    return _get_numbers_distance(timedelta1.total_seconds(), timedelta2.total_seconds(), max_)
247299
248300
def _get_time_distance(time1, time2, max_, use_log_scale=False, log_scale_similarity_threshold=0.1):
    """
    Distance between two time values, compared via ``time_to_seconds``.

    Both arguments are converted with ``time_to_seconds`` and the results
    are handed to ``_get_numbers_distance`` together with ``max_``; its
    result is returned unchanged.

    ``use_log_scale`` and ``log_scale_similarity_threshold`` are accepted so
    that this function can be called with the same positional arguments as
    the other distance functions. They default to the same values as in
    ``_get_numbers_distance`` so that three-argument callers keep working.
    """
    # NOTE(review): the log-scale options are not forwarded, so the linear
    # distance is always used here -- confirm this is intentional.
    return _get_numbers_distance(time_to_seconds(time1), time_to_seconds(time2), max_)
251303
252304
@@ -259,8 +311,8 @@ def _get_time_distance(time1, time2, max_):
259311]
260312
261313
def get_numeric_types_distance(num1, num2, max_, use_log_scale=False, log_scale_similarity_threshold=0.1):
    """
    Dispatch to the distance function registered for the type of both inputs.

    ``TYPES_TO_DIST_FUNC`` is walked in order as (type, function) pairs.
    The first function whose type matches BOTH ``num1`` and ``num2`` (via
    ``isinstance``) is called with
    ``(num1, num2, max_, use_log_scale, log_scale_similarity_threshold)``
    and its result is returned.

    Returns ``not_found`` when no registered type matches both inputs.
    """
    for kind, distance_func in TYPES_TO_DIST_FUNC:
        both_match = isinstance(num1, kind) and isinstance(num2, kind)
        if not both_match:
            continue
        return distance_func(num1, num2, max_, use_log_scale, log_scale_similarity_threshold)
    return not_found
0 commit comments