Skip to content

Commit b2f771d

Browse files
Merge pull request #871 from medialab/cmd-bisect
Cmd `xan bisect`
2 parents 606d83b + 6d5b6c0 commit b2f771d

File tree

10 files changed

+742
-7
lines changed

10 files changed

+742
-7
lines changed

Cargo.lock

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ serde = { version = "1", features = ["rc"] }
119119
serde_derive = "1"
120120
serde_json = { version = "1.0", features = ["preserve_order"] }
121121
shlex = "1.3.0"
122-
simd-csv = "0.10.2"
122+
simd-csv = "0.11.1"
123123
simd-json = "0.14.3"
124124
sprintf = "0.4.1"
125125
tar = { version = "0.4.44", default-features = false }

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -563,6 +563,7 @@ To access the expression language's [cheatsheet](./docs/moonblade/cheatsheet.md)
563563
- [**slice**](./docs/cmd/slice.md): Slice rows of CSV file
564564
- [**top**](./docs/cmd/top.md): Find top rows of a CSV file according to some column
565565
- [**sample**](./docs/cmd/sample.md): Randomly sample CSV data
566+
- [**bisect**](./docs/cmd/bisect.md): Binary search on sorted CSV data
566567
567568
*Sort & deduplicate*
568569

docs/cmd/bisect.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
<!-- Generated -->
2+
# xan bisect
3+
4+
```txt
5+
Search for rows where the value in <column> matches <value> using binary search.
6+
It is assumed that the INPUT IS SORTED according to the specified column.
7+
The rows are assumed to be sorted in ascending lexicographic
8+
order by default, but you can specify numeric ordering using the -N or --numeric
9+
flag. You can also reverse the order using the -R/--reverse flag.
10+
11+
Usage:
12+
xan bisect [options] [--] <column> <value> <input>
13+
xan bisect --help
14+
15+
bisect options:
16+
-N, --numeric Compare according to the numerical value of cells
17+
instead of the default lexicographic order.
18+
-R, --reverse Reverse sort order, i.e. descending order.
19+
20+
Common options:
21+
-h, --help Display this message
22+
-o, --output <file> Write output to <file> instead of stdout.
23+
-n, --no-headers When set, the first row will not be evaled
24+
as headers.
25+
-d, --delimiter <arg> The field delimiter for reading CSV data.
26+
Must be a single character.
27+
```

src/cmd/bisect.rs

Lines changed: 328 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,328 @@
1+
use std::cmp::Ordering;
2+
use std::io::SeekFrom;
3+
4+
use simd_csv::ByteRecord;
5+
6+
use crate::cmd::sort::{compare_num, parse_num, Number};
7+
use crate::config::{Config, Delimiter};
8+
use crate::select::SelectedColumns;
9+
use crate::util;
10+
use crate::CliResult;
11+
12+
#[derive(Clone, PartialEq, Debug)]
13+
enum Value {
14+
Number(Number),
15+
String(Vec<u8>),
16+
}
17+
18+
impl Eq for Value {}
19+
20+
impl PartialOrd for Value {
21+
#[inline]
22+
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
23+
Some(self.cmp(other))
24+
}
25+
}
26+
27+
impl Ord for Value {
28+
#[inline]
29+
fn cmp(&self, other: &Self) -> Ordering {
30+
match (self, other) {
31+
(Self::Number(n1), Self::Number(n2)) => compare_num(*n1, *n2),
32+
(Self::String(s1), Self::String(s2)) => s1.cmp(s2),
33+
_ => panic!("Cannot compare different value types"),
34+
}
35+
}
36+
}
37+
38+
impl std::fmt::Display for Value {
39+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
40+
match self {
41+
Self::Number(n) => match n {
42+
Number::Int(i) => write!(f, "{}", i),
43+
Number::Float(fl) => write!(f, "{}", fl),
44+
},
45+
Self::String(s) => write!(f, "{}", std::str::from_utf8(s).unwrap()),
46+
}
47+
}
48+
}
49+
50+
impl Value {
51+
fn new_string(s: &[u8]) -> Self {
52+
Self::String(s.to_vec())
53+
}
54+
55+
fn new_number(s: &[u8]) -> Result<Self, String> {
56+
match parse_num(s) {
57+
Some(n) => Ok(Self::Number(n)),
58+
None => Err(format!(
59+
"Failed to parse {} as a number!",
60+
std::str::from_utf8(s).unwrap()
61+
)),
62+
}
63+
}
64+
}
65+
66+
// Help text of the command, also used to parse the command line.
//
// TODO: -E/--exclude is not strictly equivalent to an upper_bound bisection.
static USAGE: &str = r#"
Search for rows where the value in <column> matches <value> using binary search,
and flush all records after the target value.
The default behavior is similar to a lower_bound bisection, but you can exclude
records (similar to upper_bound) with the target value using the -E/--exclude
flag. It is assumed that the INPUT IS SORTED according to the specified column.
The rows are assumed to be sorted in ascending lexicographic order by default,
but you can specify numeric ordering using the -N or --numeric flag. You can
also reverse the order using the -R/--reverse flag.
Use the -S/--search flag to only flush records matching the target value instead
of all records after it.

Usage:
    xan bisect [options] [--] <column> <value> <input>
    xan bisect --help

bisect options:
    -E, --exclude          When set, the records with the target value will be
                           excluded from the output. By default, they are
                           included. Cannot be used with -S/--search.
    -N, --numeric          Compare according to the numerical value of cells
                           instead of the default lexicographic order.
    -R, --reverse          Reverse sort order, i.e. descending order.
    -S, --search           Perform a search on the target value instead of
                           flushing all records after the value (included).
                           Cannot be used with -E/--exclude nor -e/--end.
    -e, --end <end-value>  When set, the records after the target value will be
                           flushed until <end-value> is reached (included).
                           By default, all records after the target value are
                           flushed. Cannot be used with -S/--search.
    -v, --verbose          Print debug information about the search to stderr.

Common options:
    -h, --help             Display this message
    -o, --output <file>    Write output to <file> instead of stdout.
    -n, --no-headers       When set, the first row will not be evaled
                           as headers.
    -d, --delimiter <arg>  The field delimiter for reading CSV data.
                           Must be a single character.
"#;
107+
108+
#[derive(Deserialize, Debug)]
109+
struct Args {
110+
arg_column: SelectedColumns,
111+
arg_value: String,
112+
arg_input: String,
113+
flag_exclude: bool,
114+
flag_numeric: bool,
115+
flag_reverse: bool,
116+
flag_search: bool,
117+
flag_end_value: Option<String>,
118+
flag_output: Option<String>,
119+
flag_no_headers: bool,
120+
flag_delimiter: Option<Delimiter>,
121+
flag_verbose: bool,
122+
}
123+
124+
impl Args {
125+
#[inline]
126+
fn get_value_from_bytes(&self, bytes: &[u8]) -> Result<Value, String> {
127+
if self.flag_numeric {
128+
Value::new_number(bytes)
129+
} else {
130+
Ok(Value::new_string(bytes))
131+
}
132+
}
133+
134+
#[inline]
135+
fn cmp(&self, v1: &Value, v2: &Value) -> Ordering {
136+
let ordering = v1.cmp(v2);
137+
138+
if self.flag_reverse {
139+
ordering.reverse()
140+
} else {
141+
ordering
142+
}
143+
}
144+
}
145+
146+
pub fn run(argv: &[&str]) -> CliResult<()> {
147+
let args: Args = util::get_args(USAGE, argv)?;
148+
149+
if args.flag_exclude && args.flag_search {
150+
Err("The -E/--exclude and -S/--search flags cannot be used together")?;
151+
}
152+
153+
if args.flag_search && args.flag_end_value.is_some() {
154+
Err("The -S/--search and -e/--end flags cannot be used together")?;
155+
}
156+
157+
macro_rules! log {
158+
($($arg:tt)*) => {
159+
if args.flag_verbose {
160+
eprintln!($($arg)*);
161+
}
162+
};
163+
}
164+
165+
let searched_value = args.get_value_from_bytes(args.arg_value.as_bytes())?;
166+
167+
let rconf = Config::new(&Some(args.arg_input.clone()))
168+
.no_headers(args.flag_no_headers)
169+
.select(args.arg_column.clone())
170+
.delimiter(args.flag_delimiter);
171+
172+
let mut seeker = rconf.simd_seeker()?.ok_or("File cannot be seeked!")?;
173+
let column_index = rconf.single_selection(seeker.byte_headers())?;
174+
175+
let mut wtr = Config::new(&args.flag_output).simd_writer()?;
176+
177+
if !rconf.no_headers {
178+
wtr.write_byte_record(seeker.byte_headers())?;
179+
}
180+
181+
let first_record = match seeker.first_byte_record()? {
182+
Some(r) => r,
183+
None => {
184+
// NOTE: file is empty!
185+
return Ok(());
186+
}
187+
};
188+
189+
let last_record = seeker.last_byte_record()?.unwrap();
190+
191+
let first_value = args.get_value_from_bytes(&first_record[column_index])?;
192+
let last_value = args.get_value_from_bytes(&last_record[column_index])?;
193+
194+
let mut lo = seeker.first_record_position();
195+
let mut hi = seeker.stream_len();
196+
197+
log!("lo byte: {}", lo);
198+
log!("hi byte: {}", hi);
199+
200+
// File does not seem to be correctly sorted
201+
if args.cmp(&first_value, &last_value).is_gt() {
202+
Err(format!(
203+
"input is not sorted in specified order!\nSee first and last values: {} and {}",
204+
first_value, last_value
205+
))?;
206+
}
207+
208+
// Searched value is more than last value: we can stop right now
209+
if args.cmp(&searched_value, &last_value).is_gt() {
210+
log!("early exit: search value is after last value!");
211+
return Ok(());
212+
}
213+
214+
// Searched value is less than first value or equal
215+
let mut skip_search = false;
216+
217+
if args.cmp(&searched_value, &first_value).is_le() {
218+
log!("skipping search: search value is before first value!");
219+
skip_search = true;
220+
}
221+
222+
// `bisect_left`
223+
// while lo < hi:
224+
// mid = (lo+hi)//2
225+
// if a[mid] < x: lo = mid+1
226+
// else: hi = mid
227+
// return lo
228+
229+
let mut jumps: usize = 0;
230+
231+
if !skip_search {
232+
while lo < hi {
233+
let mid = (lo + hi) / 2;
234+
log!("\nmid byte: {}", mid);
235+
236+
jumps += 1;
237+
238+
match seeker.find_record_after(mid)? {
239+
Some((pos, record)) => {
240+
log!("successful jump n°{} to: {} (+{})", jumps, pos, pos - mid);
241+
242+
let value = args.get_value_from_bytes(&record[column_index])?;
243+
244+
log!("found value: {}", value);
245+
246+
match args.cmp(&value, &searched_value) {
247+
Ordering::Less => {
248+
lo = mid + 1;
249+
log!("new lo (going right): {}", lo);
250+
}
251+
_ => {
252+
hi = mid;
253+
log!("new hi (going left): {}", hi);
254+
}
255+
}
256+
257+
// Is there enough space for next jump to make sense?
258+
let next_mid = (lo + hi) / 2;
259+
260+
if next_mid.abs_diff(mid) <= seeker.lookahead_len() * 2 {
261+
break;
262+
}
263+
}
264+
None => {
265+
Err(format!(
266+
"Seeker's lookahead failed (len: {}, pos: {})!",
267+
seeker.lookahead_len(),
268+
mid
269+
))?;
270+
}
271+
}
272+
}
273+
}
274+
275+
log!("\nfinal lo: {}", lo);
276+
log!(
277+
"made {} jumps vs. expected log(n) {}",
278+
jumps,
279+
(seeker.approx_count() as f64).log2().ceil() as usize
280+
);
281+
282+
let final_pos = if skip_search {
283+
seeker.first_record_position()
284+
} else {
285+
seeker.find_record_after(lo)?.unwrap().0
286+
};
287+
288+
let mut reader = seeker.into_reader_at_position(SeekFrom::Start(final_pos))?;
289+
290+
let mut record = ByteRecord::new();
291+
let mut skipped: usize = 0;
292+
let mut logged_skipped: bool = false;
293+
294+
while reader.read_byte_record(&mut record)? {
295+
let value = args.get_value_from_bytes(&record[column_index])?;
296+
297+
match args.cmp(&value, &searched_value) {
298+
Ordering::Less => {
299+
skipped += 1;
300+
}
301+
Ordering::Equal => {
302+
if !args.flag_exclude {
303+
if !logged_skipped {
304+
log!("skipped records before finding: {}", skipped);
305+
logged_skipped = true;
306+
}
307+
wtr.write_byte_record(&record)?;
308+
} else {
309+
skipped += 1;
310+
}
311+
}
312+
Ordering::Greater => {
313+
if args.flag_exclude && !logged_skipped {
314+
log!("skipped records before finding: {}", skipped);
315+
logged_skipped = true;
316+
}
317+
318+
if args.flag_search {
319+
break;
320+
} else {
321+
wtr.write_byte_record(&record)?;
322+
}
323+
}
324+
}
325+
}
326+
327+
Ok(wtr.flush()?)
328+
}

src/cmd/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
pub mod agg;
22
pub mod behead;
33
pub mod bins;
4+
pub mod bisect;
45
pub mod blank;
56
pub mod cat;
67
pub mod cluster;

0 commit comments

Comments
 (0)