|
| 1 | +use std::cmp::Ordering; |
| 2 | +use std::io::SeekFrom; |
| 3 | + |
| 4 | +use simd_csv::ByteRecord; |
| 5 | + |
| 6 | +use crate::cmd::sort::{compare_num, parse_num, Number}; |
| 7 | +use crate::config::{Config, Delimiter}; |
| 8 | +use crate::select::SelectedColumns; |
| 9 | +use crate::util; |
| 10 | +use crate::CliResult; |
| 11 | + |
| 12 | +#[derive(Clone, PartialEq, Debug)] |
| 13 | +enum Value { |
| 14 | + Number(Number), |
| 15 | + String(Vec<u8>), |
| 16 | +} |
| 17 | + |
| 18 | +impl Eq for Value {} |
| 19 | + |
| 20 | +impl PartialOrd for Value { |
| 21 | + #[inline] |
| 22 | + fn partial_cmp(&self, other: &Self) -> Option<Ordering> { |
| 23 | + Some(self.cmp(other)) |
| 24 | + } |
| 25 | +} |
| 26 | + |
| 27 | +impl Ord for Value { |
| 28 | + #[inline] |
| 29 | + fn cmp(&self, other: &Self) -> Ordering { |
| 30 | + match (self, other) { |
| 31 | + (Self::Number(n1), Self::Number(n2)) => compare_num(*n1, *n2), |
| 32 | + (Self::String(s1), Self::String(s2)) => s1.cmp(s2), |
| 33 | + _ => panic!("Cannot compare different value types"), |
| 34 | + } |
| 35 | + } |
| 36 | +} |
| 37 | + |
| 38 | +impl std::fmt::Display for Value { |
| 39 | + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |
| 40 | + match self { |
| 41 | + Self::Number(n) => match n { |
| 42 | + Number::Int(i) => write!(f, "{}", i), |
| 43 | + Number::Float(fl) => write!(f, "{}", fl), |
| 44 | + }, |
| 45 | + Self::String(s) => write!(f, "{}", std::str::from_utf8(s).unwrap()), |
| 46 | + } |
| 47 | + } |
| 48 | +} |
| 49 | + |
| 50 | +impl Value { |
| 51 | + fn new_string(s: &[u8]) -> Self { |
| 52 | + Self::String(s.to_vec()) |
| 53 | + } |
| 54 | + |
| 55 | + fn new_number(s: &[u8]) -> Result<Self, String> { |
| 56 | + match parse_num(s) { |
| 57 | + Some(n) => Ok(Self::Number(n)), |
| 58 | + None => Err(format!( |
| 59 | + "Failed to parse {} as a number!", |
| 60 | + std::str::from_utf8(s).unwrap() |
| 61 | + )), |
| 62 | + } |
| 63 | + } |
| 64 | +} |
| 65 | + |
// Help text of the `bisect` command, also handed to `util::get_args` to parse
// the command line.
//
// NOTE: no line of the free-form description should start with a `-`, so that
// it cannot be mistaken for an option description.
//
// TODO: -E/--exclude is not a true upper_bound bisection yet: the search is
// still a lower_bound one and records equal to the target value are then
// skipped one by one while reading.
static USAGE: &str = r#"
Search for rows where the value in <column> matches <value> using binary search,
and flush all records after the target value.
The default behavior is similar to a lower_bound bisection, but you can exclude
records (equivalent to upper_bound) with the target value using the -E/--exclude
flag. It is assumed that the INPUT IS SORTED according to the specified column.
The ordering of the rows is assumed to be sorted according to ascending
lexicographic order per default, but you can specify numeric ordering using
the -N or --numeric flag. You can also reverse the order using the -R/--reverse
flag. Use the -S/--search flag to only flush records matching the target value
instead of all records after it.

Usage:
    xan bisect [options] [--] <column> <value> <input>
    xan bisect --help

bisect options:
    -E, --exclude          When set, the records with the target value will be
                           excluded from the output. By default, they are
                           included. Cannot be used with -S/--search.
    -N, --numeric          Compare according to the numerical value of cells
                           instead of the default lexicographic order.
    -R, --reverse          Reverse sort order, i.e. descending order.
    -S, --search           Perform a search on the target value instead of
                           flushing all records after the value (included).
                           Cannot be used with -E/--exclude nor -e/--end.
    -e, --end <end-value>  When set, the records after the target value will be
                           flushed until <end-value> is reached (included).
                           By default, all records after the target value are
                           flushed. Cannot be used with -S/--search.
    -v, --verbose          Print debugging information about the bisection
                           (byte offsets, jumps, skipped records) to stderr.

Common options:
    -h, --help             Display this message
    -o, --output <file>    Write output to <file> instead of stdout.
    -n, --no-headers       When set, the first row will not be evaled
                           as headers.
    -d, --delimiter <arg>  The field delimiter for reading CSV data.
                           Must be a single character.
"#;
| 107 | + |
| 108 | +#[derive(Deserialize, Debug)] |
| 109 | +struct Args { |
| 110 | + arg_column: SelectedColumns, |
| 111 | + arg_value: String, |
| 112 | + arg_input: String, |
| 113 | + flag_exclude: bool, |
| 114 | + flag_numeric: bool, |
| 115 | + flag_reverse: bool, |
| 116 | + flag_search: bool, |
| 117 | + flag_end_value: Option<String>, |
| 118 | + flag_output: Option<String>, |
| 119 | + flag_no_headers: bool, |
| 120 | + flag_delimiter: Option<Delimiter>, |
| 121 | + flag_verbose: bool, |
| 122 | +} |
| 123 | + |
| 124 | +impl Args { |
| 125 | + #[inline] |
| 126 | + fn get_value_from_bytes(&self, bytes: &[u8]) -> Result<Value, String> { |
| 127 | + if self.flag_numeric { |
| 128 | + Value::new_number(bytes) |
| 129 | + } else { |
| 130 | + Ok(Value::new_string(bytes)) |
| 131 | + } |
| 132 | + } |
| 133 | + |
| 134 | + #[inline] |
| 135 | + fn cmp(&self, v1: &Value, v2: &Value) -> Ordering { |
| 136 | + let ordering = v1.cmp(v2); |
| 137 | + |
| 138 | + if self.flag_reverse { |
| 139 | + ordering.reverse() |
| 140 | + } else { |
| 141 | + ordering |
| 142 | + } |
| 143 | + } |
| 144 | +} |
| 145 | + |
| 146 | +pub fn run(argv: &[&str]) -> CliResult<()> { |
| 147 | + let args: Args = util::get_args(USAGE, argv)?; |
| 148 | + |
| 149 | + if args.flag_exclude && args.flag_search { |
| 150 | + Err("The -E/--exclude and -S/--search flags cannot be used together")?; |
| 151 | + } |
| 152 | + |
| 153 | + if args.flag_search && args.flag_end_value.is_some() { |
| 154 | + Err("The -S/--search and -e/--end flags cannot be used together")?; |
| 155 | + } |
| 156 | + |
| 157 | + macro_rules! log { |
| 158 | + ($($arg:tt)*) => { |
| 159 | + if args.flag_verbose { |
| 160 | + eprintln!($($arg)*); |
| 161 | + } |
| 162 | + }; |
| 163 | + } |
| 164 | + |
| 165 | + let searched_value = args.get_value_from_bytes(args.arg_value.as_bytes())?; |
| 166 | + |
| 167 | + let rconf = Config::new(&Some(args.arg_input.clone())) |
| 168 | + .no_headers(args.flag_no_headers) |
| 169 | + .select(args.arg_column.clone()) |
| 170 | + .delimiter(args.flag_delimiter); |
| 171 | + |
| 172 | + let mut seeker = rconf.simd_seeker()?.ok_or("File cannot be seeked!")?; |
| 173 | + let column_index = rconf.single_selection(seeker.byte_headers())?; |
| 174 | + |
| 175 | + let mut wtr = Config::new(&args.flag_output).simd_writer()?; |
| 176 | + |
| 177 | + if !rconf.no_headers { |
| 178 | + wtr.write_byte_record(seeker.byte_headers())?; |
| 179 | + } |
| 180 | + |
| 181 | + let first_record = match seeker.first_byte_record()? { |
| 182 | + Some(r) => r, |
| 183 | + None => { |
| 184 | + // NOTE: file is empty! |
| 185 | + return Ok(()); |
| 186 | + } |
| 187 | + }; |
| 188 | + |
| 189 | + let last_record = seeker.last_byte_record()?.unwrap(); |
| 190 | + |
| 191 | + let first_value = args.get_value_from_bytes(&first_record[column_index])?; |
| 192 | + let last_value = args.get_value_from_bytes(&last_record[column_index])?; |
| 193 | + |
| 194 | + let mut lo = seeker.first_record_position(); |
| 195 | + let mut hi = seeker.stream_len(); |
| 196 | + |
| 197 | + log!("lo byte: {}", lo); |
| 198 | + log!("hi byte: {}", hi); |
| 199 | + |
| 200 | + // File does not seem to be correctly sorted |
| 201 | + if args.cmp(&first_value, &last_value).is_gt() { |
| 202 | + Err(format!( |
| 203 | + "input is not sorted in specified order!\nSee first and last values: {} and {}", |
| 204 | + first_value, last_value |
| 205 | + ))?; |
| 206 | + } |
| 207 | + |
| 208 | + // Searched value is more than last value: we can stop right now |
| 209 | + if args.cmp(&searched_value, &last_value).is_gt() { |
| 210 | + log!("early exit: search value is after last value!"); |
| 211 | + return Ok(()); |
| 212 | + } |
| 213 | + |
| 214 | + // Searched value is less than first value or equal |
| 215 | + let mut skip_search = false; |
| 216 | + |
| 217 | + if args.cmp(&searched_value, &first_value).is_le() { |
| 218 | + log!("skipping search: search value is before first value!"); |
| 219 | + skip_search = true; |
| 220 | + } |
| 221 | + |
| 222 | + // `bisect_left` |
| 223 | + // while lo < hi: |
| 224 | + // mid = (lo+hi)//2 |
| 225 | + // if a[mid] < x: lo = mid+1 |
| 226 | + // else: hi = mid |
| 227 | + // return lo |
| 228 | + |
| 229 | + let mut jumps: usize = 0; |
| 230 | + |
| 231 | + if !skip_search { |
| 232 | + while lo < hi { |
| 233 | + let mid = (lo + hi) / 2; |
| 234 | + log!("\nmid byte: {}", mid); |
| 235 | + |
| 236 | + jumps += 1; |
| 237 | + |
| 238 | + match seeker.find_record_after(mid)? { |
| 239 | + Some((pos, record)) => { |
| 240 | + log!("successful jump n°{} to: {} (+{})", jumps, pos, pos - mid); |
| 241 | + |
| 242 | + let value = args.get_value_from_bytes(&record[column_index])?; |
| 243 | + |
| 244 | + log!("found value: {}", value); |
| 245 | + |
| 246 | + match args.cmp(&value, &searched_value) { |
| 247 | + Ordering::Less => { |
| 248 | + lo = mid + 1; |
| 249 | + log!("new lo (going right): {}", lo); |
| 250 | + } |
| 251 | + _ => { |
| 252 | + hi = mid; |
| 253 | + log!("new hi (going left): {}", hi); |
| 254 | + } |
| 255 | + } |
| 256 | + |
| 257 | + // Is there enough space for next jump to make sense? |
| 258 | + let next_mid = (lo + hi) / 2; |
| 259 | + |
| 260 | + if next_mid.abs_diff(mid) <= seeker.lookahead_len() * 2 { |
| 261 | + break; |
| 262 | + } |
| 263 | + } |
| 264 | + None => { |
| 265 | + Err(format!( |
| 266 | + "Seeker's lookahead failed (len: {}, pos: {})!", |
| 267 | + seeker.lookahead_len(), |
| 268 | + mid |
| 269 | + ))?; |
| 270 | + } |
| 271 | + } |
| 272 | + } |
| 273 | + } |
| 274 | + |
| 275 | + log!("\nfinal lo: {}", lo); |
| 276 | + log!( |
| 277 | + "made {} jumps vs. expected log(n) {}", |
| 278 | + jumps, |
| 279 | + (seeker.approx_count() as f64).log2().ceil() as usize |
| 280 | + ); |
| 281 | + |
| 282 | + let final_pos = if skip_search { |
| 283 | + seeker.first_record_position() |
| 284 | + } else { |
| 285 | + seeker.find_record_after(lo)?.unwrap().0 |
| 286 | + }; |
| 287 | + |
| 288 | + let mut reader = seeker.into_reader_at_position(SeekFrom::Start(final_pos))?; |
| 289 | + |
| 290 | + let mut record = ByteRecord::new(); |
| 291 | + let mut skipped: usize = 0; |
| 292 | + let mut logged_skipped: bool = false; |
| 293 | + |
| 294 | + while reader.read_byte_record(&mut record)? { |
| 295 | + let value = args.get_value_from_bytes(&record[column_index])?; |
| 296 | + |
| 297 | + match args.cmp(&value, &searched_value) { |
| 298 | + Ordering::Less => { |
| 299 | + skipped += 1; |
| 300 | + } |
| 301 | + Ordering::Equal => { |
| 302 | + if !args.flag_exclude { |
| 303 | + if !logged_skipped { |
| 304 | + log!("skipped records before finding: {}", skipped); |
| 305 | + logged_skipped = true; |
| 306 | + } |
| 307 | + wtr.write_byte_record(&record)?; |
| 308 | + } else { |
| 309 | + skipped += 1; |
| 310 | + } |
| 311 | + } |
| 312 | + Ordering::Greater => { |
| 313 | + if args.flag_exclude && !logged_skipped { |
| 314 | + log!("skipped records before finding: {}", skipped); |
| 315 | + logged_skipped = true; |
| 316 | + } |
| 317 | + |
| 318 | + if args.flag_search { |
| 319 | + break; |
| 320 | + } else { |
| 321 | + wtr.write_byte_record(&record)?; |
| 322 | + } |
| 323 | + } |
| 324 | + } |
| 325 | + } |
| 326 | + |
| 327 | + Ok(wtr.flush()?) |
| 328 | +} |
0 commit comments