@@ -1065,58 +1065,20 @@ def hist( # noqa: PLR0915
10651065
10661066 def _hist_from_bin_count (bin_count : int ): # type: ignore[no-untyped-def] # noqa: ANN202
10671067 d = pc .min_max (self .native )
1068- lower , upper = d ["min" ], d ["max" ]
1069- pa_float = pa .type_for_alias ("float" )
1068+ lower , upper = d ["min" ].as_py (), d ["max" ].as_py ()
10701069 if lower == upper :
1071- range_ : pa .Scalar [Any ] = lit (1.0 )
1072- mid = lit (0.5 )
1073- width = pc .divide (range_ , lit (bin_count ))
1074- lower = pc .subtract (lower , mid )
1075- upper = pc .add (upper , mid )
1076- else :
1077- range_ = pc .subtract (upper , lower )
1078- width = pc .divide (pc .cast (range_ , pa_float ), lit (float (bin_count )))
1079-
1080- bin_proportions = pc .divide (pc .subtract (self .native , lower ), width )
1081- bin_indices = pc .floor (bin_proportions )
1082-
1083- # shift bins so they are right-closed
1084- bin_indices = pc .if_else (
1085- pc .and_ (
1086- pc .equal (bin_indices , bin_proportions ),
1087- pc .greater (bin_indices , lit (0 )),
1088- ),
1089- pc .subtract (bin_indices , lit (1 )),
1090- bin_indices ,
1091- )
1092- possible = pa .Table .from_arrays (
1093- [pa .Array .from_pandas (np .arange (bin_count , dtype = "int64" ))], ["values" ]
1094- )
1095- counts = ( # count bin id occurrences
1096- pa .Table .from_arrays (
1097- pc .value_counts (bin_indices ).flatten (),
1098- names = ["values" , "counts" ],
1099- )
1100- # nan values are implicitly dropped in value_counts
1101- .filter (~ pc .field ("values" ).is_nan ())
1102- .cast (pa .schema ([("values" , pa .int64 ()), ("counts" , pa .int64 ())]))
1103- # align bin ids to all possible bin ids (populate in missing bins)
1104- .join (possible , keys = "values" , join_type = "right outer" )
1105- .sort_by ("values" )
1106- )
1107- # empty bin intervals should have a 0 count
1108- counts_coalesce = cast (
1109- "ArrowArray" , pc .coalesce (counts .column ("counts" ), lit (0 ))
1110- )
1111- counts = counts .set_column (0 , "counts" , counts_coalesce )
1112-
1113- # extract left/right side of the intervals
1114- bin_left = pc .add (lower , pc .multiply (counts .column ("values" ), width ))
1115- bin_right = pc .add (bin_left , width )
1116- return counts .column ("counts" ), bin_right
1070+ lower -= 0.5
1071+ upper += 0.5
1072+ bins = np .linspace (lower , upper , bin_count + 1 )
1073+ return _hist_from_bins (bins )
11171074
11181075 def _hist_from_bins (bins : Sequence [int | float ]): # type: ignore[no-untyped-def] # noqa: ANN202
11191076 bin_indices = np .searchsorted (bins , self .native , side = "left" )
1077+ bin_indices = pc .if_else ( # lowest bin is inclusive
1078+ pc .equal (self .native , lit (bins [0 ])), 1 , bin_indices
1079+ )
1080+
1081+ # align unique categories and counts appropriately
11201082 obs_cats , obs_counts = np .unique (bin_indices , return_counts = True )
11211083 obj_cats = np .arange (1 , len (bins ))
11221084 counts = np .zeros_like (obj_cats )
@@ -1125,15 +1087,51 @@ def _hist_from_bins(bins: Sequence[int | float]): # type: ignore[no-untyped-def
11251087 bin_right = bins [1 :]
11261088 return counts , bin_right
11271089
1090+ counts : Sequence [int | float ] | np .typing .ArrayLike
1091+ bin_right : Sequence [int | float ] | np .typing .ArrayLike
1092+
1093+ data_count = pc .sum (
1094+ pc .invert (pc .or_ (pc .is_nan (self .native ), pc .is_null (self .native ))).cast (
1095+ pa .uint8 ()
1096+ ),
1097+ min_count = 0 ,
1098+ )
11281099 if bins is not None :
11291100 if len (bins ) < 2 :
11301101 counts , bin_right = [], []
1102+
1103+ elif data_count == pa .scalar (0 , type = pa .uint64 ()): # type:ignore[comparison-overlap]
1104+ counts = np .zeros (len (bins ) - 1 )
1105+ bin_right = bins [1 :]
1106+
1107+ elif len (bins ) == 2 :
1108+ counts = [
1109+ pc .sum (
1110+ pc .and_ (
1111+ pc .greater_equal (self .native , lit (float (bins [0 ]))),
1112+ pc .less_equal (self .native , lit (float (bins [1 ]))),
1113+ ).cast (pa .uint8 ())
1114+ )
1115+ ]
1116+ bin_right = [bins [- 1 ]]
11311117 else :
11321118 counts , bin_right = _hist_from_bins (bins )
11331119
11341120 elif bin_count is not None :
11351121 if bin_count == 0 :
11361122 counts , bin_right = [], []
1123+ elif data_count == pa .scalar (0 , type = pa .uint64 ()): # type:ignore[comparison-overlap]
1124+ counts , bin_right = (
1125+ np .zeros (bin_count ),
1126+ np .linspace (0 , 1 , bin_count + 1 )[1 :],
1127+ )
1128+ elif bin_count == 1 :
1129+ d = pc .min_max (self .native )
1130+ lower , upper = d ["min" ], d ["max" ]
1131+ if lower == upper :
1132+ counts , bin_right = [data_count ], [pc .add (upper , pa .scalar (0.5 ))]
1133+ else :
1134+ counts , bin_right = [data_count ], [upper ]
11371135 else :
11381136 counts , bin_right = _hist_from_bin_count (bin_count )
11391137
0 commit comments