Skip to content

Commit 13998e6

Browse files
Merge pull request #14 from peterfication/add-uint-support
Add uint, bool and string support
2 parents 8214ae4 + 218e914 commit 13998e6

File tree

11 files changed

+332
-6
lines changed

11 files changed

+332
-6
lines changed

CHANGELOG.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,13 @@
22

33
## [Unreleased]
44

5+
- Add support for string
6+
- Add support for bool
7+
- Add support for uint8, uint16, uint32 and uint64
8+
59
## [0.1.3] - 2025-09-13
610

7-
- Add support for all int8, int16, int32, float16 and float32
11+
- Add support for int8, int16, int32, float16 and float32
812

913
## [0.1.2] - 2025-09-12
1014

scripts/npy_definitions.json

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,84 @@
11
[
2+
{
3+
"file_path": "assets/bool.npy",
4+
"dtype": "bool",
5+
"values": [
6+
[
7+
true,
8+
true,
9+
false
10+
],
11+
[
12+
false,
13+
true,
14+
true
15+
]
16+
]
17+
},
18+
{
19+
"file_path": "assets/uint8.npy",
20+
"dtype": "uint8",
21+
"values": [
22+
[
23+
1,
24+
4,
25+
3
26+
],
27+
[
28+
8,
29+
22,
30+
12
31+
]
32+
]
33+
},
34+
{
35+
"file_path": "assets/uint16.npy",
36+
"dtype": "uint16",
37+
"values": [
38+
[
39+
1,
40+
4,
41+
3
42+
],
43+
[
44+
8,
45+
22,
46+
12
47+
]
48+
]
49+
},
50+
{
51+
"file_path": "assets/uint32.npy",
52+
"dtype": "uint32",
53+
"values": [
54+
[
55+
1,
56+
4,
57+
3
58+
],
59+
[
60+
8,
61+
22,
62+
12
63+
]
64+
]
65+
},
66+
{
67+
"file_path": "assets/uint64.npy",
68+
"dtype": "uint64",
69+
"values": [
70+
[
71+
1,
72+
4,
73+
3
74+
],
75+
[
76+
8,
77+
22,
78+
12
79+
]
80+
]
81+
},
282
{
383
"file_path": "assets/int8.npy",
484
"dtype": "int8",
@@ -110,5 +190,21 @@
110190
12.6
111191
]
112192
]
193+
},
194+
{
195+
"file_path": "assets/string.npy",
196+
"dtype": "S",
197+
"values": [
198+
[
199+
"cat",
200+
"dog",
201+
"cat"
202+
],
203+
[
204+
"mouse",
205+
"mouse",
206+
"horse"
207+
]
208+
]
113209
}
114210
]

src/analyze.rs

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,22 @@ pub struct NpyAnalysis {
1818
/// An enum to hold statistics for different supported numeric types.
1919
#[derive(Debug)]
2020
pub enum ValueStats {
21+
BOOL {
22+
count: usize,
23+
unique_values: Vec<bool>,
24+
},
2125
I64 {
2226
count: usize,
2327
unique_values: Vec<i64>,
2428
min: i64,
2529
max: i64,
2630
},
31+
U64 {
32+
count: usize,
33+
unique_values: Vec<u64>,
34+
min: u64,
35+
max: u64,
36+
},
2737
F16 {
2838
count: usize,
2939
unique_values: Vec<half::f16>,
@@ -42,6 +52,10 @@ pub enum ValueStats {
4252
min: f64,
4353
max: f64,
4454
},
55+
String {
56+
count: usize,
57+
unique_values: Vec<String>,
58+
},
4559
}
4660

4761
/// Analyzes the NPY file and returns a struct with the results.
@@ -61,15 +75,24 @@ pub fn analyze_npy(file_path: &str) -> Result<NpyAnalysis, Box<dyn std::error::E
6175
let dtype_str = format!("{:?}{}", plain.type_char(), bits);
6276

6377
let stats = match (plain.type_char(), plain.size_field()) {
78+
(npyz::TypeChar::Bool, _size) => value_stats_for_bool_type(npy)?,
79+
6480
(npyz::TypeChar::Int, 1) => value_stats_for_int_type::<i8>(npy)?,
6581
(npyz::TypeChar::Int, 2) => value_stats_for_int_type::<i16>(npy)?,
6682
(npyz::TypeChar::Int, 4) => value_stats_for_int_type::<i32>(npy)?,
6783
(npyz::TypeChar::Int, 8) => value_stats_for_int_type::<i64>(npy)?,
6884

85+
(npyz::TypeChar::Uint, 1) => value_stats_for_uint_type::<u8>(npy)?,
86+
(npyz::TypeChar::Uint, 2) => value_stats_for_uint_type::<u16>(npy)?,
87+
(npyz::TypeChar::Uint, 4) => value_stats_for_uint_type::<u32>(npy)?,
88+
(npyz::TypeChar::Uint, 8) => value_stats_for_uint_type::<u64>(npy)?,
89+
6990
(npyz::TypeChar::Float, 2) => value_stats_for_float16_type(npy)?,
7091
(npyz::TypeChar::Float, 4) => value_stats_for_float32_type(npy)?,
7192
(npyz::TypeChar::Float, 8) => value_stats_for_float64_type(npy)?,
7293

94+
(npyz::TypeChar::ByteStr, _size) => value_stats_for_string_type(npy)?,
95+
7396
_ => None, // Unsupported type for detailed stats
7497
};
7598
(dtype_str, stats)
@@ -86,6 +109,31 @@ pub fn analyze_npy(file_path: &str) -> Result<NpyAnalysis, Box<dyn std::error::E
86109
})
87110
}
88111

112+
/// Helper function to compute statistics for bool type.
113+
fn value_stats_for_bool_type(
114+
npy: npyz::NpyFile<&[u8]>,
115+
) -> Result<Option<ValueStats>, Box<dyn Error>> {
116+
let data: Vec<_> = npy.data::<bool>()?.collect::<Result<_, _>>()?;
117+
if data.is_empty() {
118+
Ok(None)
119+
} else {
120+
let count = data.len();
121+
let has_true = data.iter().any(|&x| x);
122+
let has_false = data.iter().any(|&x| !x);
123+
let unique_values = match (has_true, has_false) {
124+
(true, true) => vec![true, false],
125+
(true, false) => vec![true],
126+
(false, true) => vec![false],
127+
(false, false) => vec![], // This case should not happen due to is_empty check
128+
};
129+
130+
Ok(Some(ValueStats::BOOL {
131+
count,
132+
unique_values,
133+
}))
134+
}
135+
}
136+
89137
/// Helper function to compute statistics for integer types.
90138
fn value_stats_for_int_type<T>(
91139
npy: npyz::NpyFile<&[u8]>,
@@ -117,6 +165,37 @@ where
117165
}
118166
}
119167

168+
/// Helper function to compute statistics for unsigned integer types.
169+
fn value_stats_for_uint_type<T>(
170+
npy: npyz::NpyFile<&[u8]>,
171+
) -> Result<Option<ValueStats>, Box<dyn Error>>
172+
where
173+
T: Eq + Hash + Ord + Copy + Into<u64>,
174+
T: Deserialize,
175+
{
176+
let data: Vec<T> = npy.data::<T>()?.collect::<Result<_, _>>()?;
177+
if data.is_empty() {
178+
Ok(None)
179+
} else {
180+
let count = data.len();
181+
let mut unique_numbers: Vec<_> = HashSet::<T>::from_iter(data).into_iter().collect();
182+
unique_numbers.sort_unstable();
183+
184+
Ok(Some(ValueStats::U64 {
185+
count,
186+
min: (*unique_numbers
187+
.first()
188+
.expect("unique_numbers should not be empty after non-empty data"))
189+
.into(),
190+
max: (*unique_numbers
191+
.last()
192+
.expect("unique_numbers should not be empty after non-empty data"))
193+
.into(),
194+
unique_values: unique_numbers.iter().map(|&x| x.into()).collect(),
195+
}))
196+
}
197+
}
198+
120199
/// Helper function to compute statistics for f16 type.
121200
fn value_stats_for_float16_type(
122201
npy: npyz::NpyFile<&[u8]>,
@@ -213,3 +292,22 @@ where
213292
unique_vec.sort();
214293
unique_vec.into_iter().map(|ordered| ordered.0).collect()
215294
}
295+
296+
/// Helper function to compute statistics for string type.
297+
fn value_stats_for_string_type(
298+
npy: npyz::NpyFile<&[u8]>,
299+
) -> Result<Option<ValueStats>, Box<dyn Error>> {
300+
let data: Vec<_> = npy.data::<String>()?.collect::<Result<_, _>>()?;
301+
if data.is_empty() {
302+
Ok(None)
303+
} else {
304+
let count = data.len();
305+
let mut unique_values: Vec<_> = HashSet::<String>::from_iter(data).into_iter().collect();
306+
unique_values.sort();
307+
308+
Ok(Some(ValueStats::String {
309+
count,
310+
unique_values,
311+
}))
312+
}
313+
}

src/present.rs

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,37 +19,57 @@ pub fn present_analysis(file_path: &str, analysis: &NpyAnalysis) {
1919
println!("----------------------------------------");
2020

2121
match &analysis.stats {
22+
Some(ValueStats::BOOL {
23+
count,
24+
unique_values,
25+
}) => {
26+
print_stats(count, unique_values);
27+
}
2228
Some(ValueStats::I64 {
2329
count,
2430
unique_values,
2531
min,
2632
max,
2733
}) => {
28-
print_stats(count, unique_values, min, max);
34+
print_stats_numeric(count, unique_values, min, max);
35+
}
36+
Some(ValueStats::U64 {
37+
count,
38+
unique_values,
39+
min,
40+
max,
41+
}) => {
42+
print_stats_numeric(count, unique_values, min, max);
2943
}
3044
Some(ValueStats::F16 {
3145
count,
3246
unique_values,
3347
min,
3448
max,
3549
}) => {
36-
print_stats(count, unique_values, min, max);
50+
print_stats_numeric(count, unique_values, min, max);
3751
}
3852
Some(ValueStats::F32 {
3953
count,
4054
unique_values,
4155
min,
4256
max,
4357
}) => {
44-
print_stats(count, unique_values, min, max);
58+
print_stats_numeric(count, unique_values, min, max);
4559
}
4660
Some(ValueStats::F64 {
4761
count,
4862
unique_values,
4963
min,
5064
max,
5165
}) => {
52-
print_stats(count, unique_values, min, max);
66+
print_stats_numeric(count, unique_values, min, max);
67+
}
68+
Some(ValueStats::String {
69+
count,
70+
unique_values,
71+
}) => {
72+
print_stats(count, unique_values);
5373
}
5474
None => {
5575
println!(
@@ -60,7 +80,17 @@ pub fn present_analysis(file_path: &str, analysis: &NpyAnalysis) {
6080
}
6181
}
6282

63-
fn print_stats<T, U>(count: &usize, unique_values: &U, min: &T, max: &T)
83+
/// Prints the summary for value types that have no min/max: the total number
/// of values, the number of unique values, and the unique values themselves
/// (formatted with `Debug`).
fn print_stats<T, U>(count: &usize, unique_values: &U)
where
    T: std::fmt::Debug + std::fmt::Display,
    U: std::fmt::Debug + std::ops::Deref<Target = [T]>,
{
    // `len` is reached through `Deref` to the underlying slice.
    let unique_count = unique_values.len();

    println!("Number of values: {count}");
    println!("Number of unique values: {}", unique_count);
    println!("Unique values: {unique_values:?}");
}
92+
93+
fn print_stats_numeric<T, U>(count: &usize, unique_values: &U, min: &T, max: &T)
6494
where
6595
T: std::fmt::Debug + std::fmt::Display,
6696
U: std::fmt::Debug + std::ops::Deref<Target = [T]>,

tests/cli.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,19 @@ fn cli_run_with_file(file_path: &str, snapshot_name: &str) {
2424
#[test]
2525
fn cli_run_all_types() {
2626
let files = [
27+
"assets/bool.npy",
28+
"assets/uint8.npy",
29+
"assets/uint16.npy",
30+
"assets/uint32.npy",
31+
"assets/uint64.npy",
2732
"assets/int8.npy",
2833
"assets/int16.npy",
2934
"assets/int32.npy",
3035
"assets/int64.npy",
3136
"assets/float16.npy",
3237
"assets/float32.npy",
3338
"assets/float64.npy",
39+
"assets/string.npy",
3440
];
3541
for file in files {
3642
let snapshot_name = format!(
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
---
2+
source: tests/cli.rs
3+
expression: output
4+
---
5+
Peek into assets/bool.npy
6+
----------------------------------------
7+
Dimensions: 2
8+
Shape: [2, 3]
9+
Type: Bool8
10+
Bytes: 134 B
11+
----------------------------------------
12+
Number of values: 6
13+
Number of unique values: 2
14+
Unique values: [true, false]

0 commit comments

Comments
 (0)