@@ -354,9 +354,10 @@ def _conditional_join_type_check(
354
354
f"'{ right_column .name } ' has { right_column .dtype } type."
355
355
)
356
356
357
- if (op in less_than_join_types .union (greater_than_join_types )) & (
358
- (is_string_dtype (left_column ) | is_categorical_dtype (left_column ))
359
- ):
357
+ number_or_date = is_numeric_dtype (left_column ) or is_datetime64_dtype (
358
+ left_column
359
+ )
360
+ if (op != _JoinOperator .STRICTLY_EQUAL .value ) & (not number_or_date ):
360
361
raise ValueError (
361
362
"non-equi joins are supported "
362
363
"only for datetime and numeric dtypes. "
@@ -490,12 +491,12 @@ def _less_than_indices(
490
491
if left .min () > right .max ():
491
492
return None
492
493
493
- any_nulls = pd .isna (left )
494
+ any_nulls = left .isna ()
494
495
if any_nulls .all ():
495
496
return None
496
497
if any_nulls .any ():
497
498
left = left [~ any_nulls ]
498
- any_nulls = pd .isna (right )
499
+ any_nulls = right .isna ()
499
500
if any_nulls .all ():
500
501
return None
501
502
if any_nulls .any ():
@@ -597,12 +598,12 @@ def _greater_than_indices(
597
598
if left .max () < right .min ():
598
599
return None
599
600
600
- any_nulls = pd .isna (left )
601
+ any_nulls = left .isna ()
601
602
if any_nulls .all ():
602
603
return None
603
604
if any_nulls .any ():
604
605
left = left [~ any_nulls ]
605
- any_nulls = pd .isna (right )
606
+ any_nulls = right .isna ()
606
607
if any_nulls .all ():
607
608
return None
608
609
if any_nulls .any ():
@@ -1129,10 +1130,10 @@ def _range_indices(
1129
1130
# get rid of any nulls
1130
1131
# this is helpful as we can convert extension arrays to numpy arrays safely
1131
1132
# and simplify the search logic below
1132
- any_nulls = pd . isna ( df [left_on ])
1133
+ any_nulls = df [left_on ]. isna ( )
1133
1134
if any_nulls .any ():
1134
1135
left_c = left_c [~ any_nulls ]
1135
- any_nulls = pd . isna ( right [right_on ])
1136
+ any_nulls = right [right_on ]. isna ( )
1136
1137
if any_nulls .any ():
1137
1138
right_c = right_c [~ any_nulls ]
1138
1139
@@ -1160,16 +1161,26 @@ def _range_indices(
1160
1161
right_c = right_c ._values
1161
1162
left_c , right_c = _convert_to_numpy_array (left_c , right_c )
1162
1163
op = operator_map [op ]
1163
- pos = np .empty (left_c .size , dtype = np .intp )
1164
-
1165
- # better served in a compiled environment
1166
- # where we can break early
1167
- # parallelise the operation, as well as
1168
- # avoid the restrictive fixed size approach of numpy
1169
- # which isnt particularly helpful in a for loop
1170
- for ind in range (left_c .size ):
1171
- out = op (left_c [ind ], right_c )
1172
- pos [ind ] = np .argmax (out )
1164
+ pos = np .copy (search_indices )
1165
+ counter = np .arange (left_c .size )
1166
+
1167
+ # better than np.outer memory wise?
1168
+ # using this for loop instead of np.outer
1169
+ # allows us to break early and reduce the
1170
+ # number of cartesian checks
1171
+ # since as we iterate, we reduce the size of left_c
1172
+ # speed wise, np.outer will be faster
1173
+ # alternatively, the user can just use the numba option
1174
+ # for more performance
1175
+ for ind in range (right_c .size ):
1176
+ if not counter .size :
1177
+ break
1178
+ keep_rows = op (left_c , right_c [ind ])
1179
+ if not keep_rows .any ():
1180
+ continue
1181
+ pos [counter [keep_rows ]] = ind
1182
+ counter = counter [~ keep_rows ]
1183
+ left_c = left_c [~ keep_rows ]
1173
1184
1174
1185
# no point searching within (a, b)
1175
1186
# if a == b
@@ -1261,10 +1272,10 @@ def _create_frame(
1261
1272
"""
1262
1273
Create final dataframe
1263
1274
"""
1264
- if df_columns :
1275
+ if df_columns is not None :
1265
1276
df = _cond_join_select_columns (df_columns , df )
1266
1277
1267
- if right_columns :
1278
+ if right_columns is not None :
1268
1279
right = _cond_join_select_columns (right_columns , right )
1269
1280
1270
1281
if set (df .columns ).intersection (right .columns ):
0 commit comments