@@ -18,12 +18,22 @@ pub struct NpyAnalysis {
/// An enum to hold statistics for each supported element type (boolean, integer, floating-point, and string).
1919#[ derive( Debug ) ]
2020pub enum ValueStats {
21+ BOOL {
22+ count : usize ,
23+ unique_values : Vec < bool > ,
24+ } ,
2125 I64 {
2226 count : usize ,
2327 unique_values : Vec < i64 > ,
2428 min : i64 ,
2529 max : i64 ,
2630 } ,
31+ U64 {
32+ count : usize ,
33+ unique_values : Vec < u64 > ,
34+ min : u64 ,
35+ max : u64 ,
36+ } ,
2737 F16 {
2838 count : usize ,
2939 unique_values : Vec < half:: f16 > ,
@@ -42,6 +52,10 @@ pub enum ValueStats {
4252 min : f64 ,
4353 max : f64 ,
4454 } ,
55+ String {
56+ count : usize ,
57+ unique_values : Vec < String > ,
58+ } ,
4559}
4660
4761/// Analyzes the NPY file and returns a struct with the results.
@@ -61,15 +75,24 @@ pub fn analyze_npy(file_path: &str) -> Result<NpyAnalysis, Box<dyn std::error::E
6175 let dtype_str = format ! ( "{:?}{}" , plain. type_char( ) , bits) ;
6276
6377 let stats = match ( plain. type_char ( ) , plain. size_field ( ) ) {
78+ ( npyz:: TypeChar :: Bool , _size) => value_stats_for_bool_type ( npy) ?,
79+
6480 ( npyz:: TypeChar :: Int , 1 ) => value_stats_for_int_type :: < i8 > ( npy) ?,
6581 ( npyz:: TypeChar :: Int , 2 ) => value_stats_for_int_type :: < i16 > ( npy) ?,
6682 ( npyz:: TypeChar :: Int , 4 ) => value_stats_for_int_type :: < i32 > ( npy) ?,
6783 ( npyz:: TypeChar :: Int , 8 ) => value_stats_for_int_type :: < i64 > ( npy) ?,
6884
85+ ( npyz:: TypeChar :: Uint , 1 ) => value_stats_for_uint_type :: < u8 > ( npy) ?,
86+ ( npyz:: TypeChar :: Uint , 2 ) => value_stats_for_uint_type :: < u16 > ( npy) ?,
87+ ( npyz:: TypeChar :: Uint , 4 ) => value_stats_for_uint_type :: < u32 > ( npy) ?,
88+ ( npyz:: TypeChar :: Uint , 8 ) => value_stats_for_uint_type :: < u64 > ( npy) ?,
89+
6990 ( npyz:: TypeChar :: Float , 2 ) => value_stats_for_float16_type ( npy) ?,
7091 ( npyz:: TypeChar :: Float , 4 ) => value_stats_for_float32_type ( npy) ?,
7192 ( npyz:: TypeChar :: Float , 8 ) => value_stats_for_float64_type ( npy) ?,
7293
94+ ( npyz:: TypeChar :: ByteStr , _size) => value_stats_for_string_type ( npy) ?,
95+
7396 _ => None , // Unsupported type for detailed stats
7497 } ;
7598 ( dtype_str, stats)
@@ -86,6 +109,31 @@ pub fn analyze_npy(file_path: &str) -> Result<NpyAnalysis, Box<dyn std::error::E
86109 } )
87110}
88111
112+ /// Helper function to compute statistics for bool type.
113+ fn value_stats_for_bool_type (
114+ npy : npyz:: NpyFile < & [ u8 ] > ,
115+ ) -> Result < Option < ValueStats > , Box < dyn Error > > {
116+ let data: Vec < _ > = npy. data :: < bool > ( ) ?. collect :: < Result < _ , _ > > ( ) ?;
117+ if data. is_empty ( ) {
118+ Ok ( None )
119+ } else {
120+ let count = data. len ( ) ;
121+ let has_true = data. iter ( ) . any ( |& x| x) ;
122+ let has_false = data. iter ( ) . any ( |& x| !x) ;
123+ let unique_values = match ( has_true, has_false) {
124+ ( true , true ) => vec ! [ true , false ] ,
125+ ( true , false ) => vec ! [ true ] ,
126+ ( false , true ) => vec ! [ false ] ,
127+ ( false , false ) => vec ! [ ] , // This case should not happen due to is_empty check
128+ } ;
129+
130+ Ok ( Some ( ValueStats :: BOOL {
131+ count,
132+ unique_values,
133+ } ) )
134+ }
135+ }
136+
89137/// Helper function to compute statistics for integer types.
90138fn value_stats_for_int_type < T > (
91139 npy : npyz:: NpyFile < & [ u8 ] > ,
@@ -117,6 +165,37 @@ where
117165 }
118166}
119167
168+ /// Helper function to compute statistics for unsigned integer types.
169+ fn value_stats_for_uint_type < T > (
170+ npy : npyz:: NpyFile < & [ u8 ] > ,
171+ ) -> Result < Option < ValueStats > , Box < dyn Error > >
172+ where
173+ T : Eq + Hash + Ord + Copy + Into < u64 > ,
174+ T : Deserialize ,
175+ {
176+ let data: Vec < T > = npy. data :: < T > ( ) ?. collect :: < Result < _ , _ > > ( ) ?;
177+ if data. is_empty ( ) {
178+ Ok ( None )
179+ } else {
180+ let count = data. len ( ) ;
181+ let mut unique_numbers: Vec < _ > = HashSet :: < T > :: from_iter ( data) . into_iter ( ) . collect ( ) ;
182+ unique_numbers. sort_unstable ( ) ;
183+
184+ Ok ( Some ( ValueStats :: U64 {
185+ count,
186+ min : ( * unique_numbers
187+ . first ( )
188+ . expect ( "unique_numbers should not be empty after non-empty data" ) )
189+ . into ( ) ,
190+ max : ( * unique_numbers
191+ . last ( )
192+ . expect ( "unique_numbers should not be empty after non-empty data" ) )
193+ . into ( ) ,
194+ unique_values : unique_numbers. iter ( ) . map ( |& x| x. into ( ) ) . collect ( ) ,
195+ } ) )
196+ }
197+ }
198+
120199/// Helper function to compute statistics for f16 type.
121200fn value_stats_for_float16_type (
122201 npy : npyz:: NpyFile < & [ u8 ] > ,
@@ -213,3 +292,22 @@ where
213292 unique_vec. sort ( ) ;
214293 unique_vec. into_iter ( ) . map ( |ordered| ordered. 0 ) . collect ( )
215294}
295+
296+ /// Helper function to compute statistics for string type.
297+ fn value_stats_for_string_type (
298+ npy : npyz:: NpyFile < & [ u8 ] > ,
299+ ) -> Result < Option < ValueStats > , Box < dyn Error > > {
300+ let data: Vec < _ > = npy. data :: < String > ( ) ?. collect :: < Result < _ , _ > > ( ) ?;
301+ if data. is_empty ( ) {
302+ Ok ( None )
303+ } else {
304+ let count = data. len ( ) ;
305+ let mut unique_values: Vec < _ > = HashSet :: < String > :: from_iter ( data) . into_iter ( ) . collect ( ) ;
306+ unique_values. sort ( ) ;
307+
308+ Ok ( Some ( ValueStats :: String {
309+ count,
310+ unique_values,
311+ } ) )
312+ }
313+ }
0 commit comments