Skip to content

Commit 20124a0

Browse files
New work
1 parent c501872 commit 20124a0

File tree

2 files changed

+84
-76
lines changed

2 files changed

+84
-76
lines changed

src/reader.rs

Lines changed: 82 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ impl CsvReader {
3737
quotation_bitsets,
3838
}
3939
}
40-
4140
pub fn read(&mut self) -> Result<Vec<Row>> {
4241
// todo! what happens when a csv is greater than 128 bytes?
4342
// probably would want something like:
@@ -46,93 +45,89 @@ impl CsvReader {
4645

4746
let mut current_row = Vec::new();
4847

49-
let mut bit_cursor = 0;
48+
let cursor = 0;
5049

5150
for i in 0..self.quotation_bitsets.len() {
52-
/*
53-
Things we need to do:
54-
1. find which commas are truly field separators and which are not.
55-
2. BWO step 1, we'd need to identify quoted regions. CSV uses a double-quote convention
56-
*/
57-
58-
// removes all escaped quotations
59-
let valid_quotations = {
60-
let quotations = self.quotation_bitsets[i];
61-
62-
let escaped_quotations = quotations & (quotations << 1);
63-
let escaped_quotations = escaped_quotations | (escaped_quotations >> 1);
64-
65-
quotations & !escaped_quotations
66-
};
67-
68-
// masks all characters inside quotations
69-
let inside_quotations = parallel_prefix_xor_64(valid_quotations);
70-
71-
let mut valid_commas = self.comma_bitsets[i] & !inside_quotations;
72-
let mut valid_whitespace = self.whitespace_bitsets[i] & !inside_quotations;
51+
let valid_quotations = remove_escaped_quotations(self.quotation_bitsets[i]);
52+
let outside_quotations = !mark_inside_quotations(valid_quotations);
7353

74-
dbg!(valid_commas, valid_whitespace);
54+
let mut valid_commas = self.comma_bitsets[i] & outside_quotations;
55+
let mut valid_whitespace = self.whitespace_bitsets[i] & outside_quotations;
7556

76-
let mut first_comma = valid_commas.leading_zeros() as usize;
77-
let mut first_lf = valid_whitespace.leading_zeros() as usize;
78-
79-
if first_comma == 0 && first_lf == 0 {
80-
// congratulations, we just skipped 64 bytes.
57+
if valid_commas == 0 && valid_whitespace == 0 {
8158
continue;
8259
}
8360

84-
let mut start_field_cursor = 0;
85-
86-
while first_comma != 0 || first_lf != 0 {
87-
let end = first_comma.min(first_lf);
61+
let mut bitset_cursor = 0;
8862

89-
current_row.push(Range {
90-
start: bit_cursor + start_field_cursor,
91-
end: bit_cursor + end,
92-
});
63+
loop {
64+
let first_comma = valid_commas.leading_zeros() as usize;
65+
let first_whitespace = valid_whitespace.leading_zeros() as usize;
9366

94-
start_field_cursor = end + 1;
67+
let bits_traveled = first_comma.min(first_whitespace);
9568

96-
if first_comma < first_lf {
97-
valid_commas &= !(1u64 << (63 - first_comma));
98-
first_comma = valid_commas.leading_zeros() as usize;
99-
} else {
100-
rows.push(Row::from(current_row.clone()));
69+
if bits_traveled == 64 {
70+
break;
71+
}
10172

102-
valid_whitespace &= !(1u64 << (63 - first_lf));
103-
first_lf = valid_whitespace.leading_zeros() as usize;
73+
current_row.push(
74+
Range {
75+
start: cursor + bitset_cursor,
76+
end: cursor + bitset_cursor + bits_traveled
77+
}
78+
);
10479

105-
return Ok(rows);
80+
if first_whitespace < first_comma {
81+
rows.push(Row::from(current_row.clone()));
82+
current_row.clear();
83+
break;
10684
}
85+
86+
bitset_cursor += bits_traveled + 1;
87+
valid_commas <<= bits_traveled + 1;
88+
valid_whitespace <<= bits_traveled + 1;
10789
}
10890
}
10991

11092
Ok(rows)
11193
}
11294
}
11395

114-
fn build_u64(chunk: &[u8x16], broadcast: u8x16) -> u64 {
115-
let mut packed: u64 = 0;
116-
for (i, &c) in chunk.iter().enumerate() {
117-
let word = c.eq(broadcast).bitset() as u64;
118-
packed |= word << (48 - i * 16);
119-
}
96+
/// Strips escaped (doubled) quotation marks from a quote bitset, leaving only
/// the bits that act as real quote delimiters.
///
/// `q` is expected to carry one set bit per quotation-mark byte (presumably as
/// produced by `build_u64`; confirm against the caller). CSV escapes a literal
/// quote inside a quoted field by doubling it (`""`), so runs of adjacent set
/// bits are resolved by parity:
///
/// * a single set bit is a delimiter and is kept;
/// * an even-length run consists only of escape pairs and is cleared;
/// * an odd-length run (e.g. the `"""` that ends `"say ""hi"""`) is escape
///   pairs plus one delimiter, so exactly one bit of the run is kept.
///
/// The kept bit is the most significant bit of the run. Which bit of the run
/// survives only changes how the run's own quote bytes are classified by a
/// later prefix XOR; bytes outside the run are classified the same either way.
///
/// NOTE(review): each 64-bit word is processed on its own, so a run of quotes
/// that straddles a word boundary is not resolved correctly here.
fn remove_escaped_quotations(q: u64) -> u64 {
    let mut pending = q;
    let mut kept = 0u64;

    while pending != 0 {
        // Lowest remaining run: [start, end), where `end` may be 64.
        let start = pending.trailing_zeros();
        let len = (pending >> start).trailing_ones();
        let end = start + len;

        if len % 2 == 1 {
            kept |= 1u64 << (end - 1);
        }

        // `u64::MAX << 64` would overflow, so a run reaching bit 63 ends the scan.
        pending = if end == 64 {
            0
        } else {
            pending & (u64::MAX << end)
        };
    }

    kept
}
123102

103+
/// Builds the mask of bits that lie in between each pair of quote bits.
///
/// A prefix XOR is computed first: after the loop, every bit holds the XOR of
/// all bits of the input at or below its position. That value is then shifted
/// up by one, so for each quote pair the lower quote bit ends up as 0 and the
/// upper quote bit as 1 (e.g. `0b1000_1000` becomes `0b1111_0000`). The zeroed
/// quote is the one the caller treats as the closing quote; that is harmless
/// because the mask is only used to suppress commas and whitespace between a
/// quote pair.
#[inline]
fn mark_inside_quotations(mut x: u64) -> u64 {
    // Doubling the shift distance each step folds all 64 bits in six rounds.
    for shift in [1u32, 2, 4, 8, 16, 32] {
        x ^= x << shift;
    }

    x << 1
}
117+
118+
// todo, find a quicker way to do this
119+
#[inline]
120+
fn build_u64(chunk: &[u8x16], broadcast: u8x16) -> u64 {
121+
let mut packed: u64 = 0;
122+
for (i, &c) in chunk.iter().enumerate() {
123+
let word = c.eq(broadcast).bitset() as u64;
124+
packed |= word << (48 - i * 16);
125+
}
126+
127+
packed
134128
}
135129

130+
136131
#[cfg(test)]
137132
mod tests {
138133
use super::*;
@@ -261,28 +256,39 @@ mod tests {
261256
Ok(())
262257
}
263258

259+
// #[test]
260+
// fn read_taxi_zone_lookup() -> Result<()> {
261+
// let data = std::fs::read("taxi_zone_lookup.csv")?;
262+
// assert_eq!(
263+
// b"\"LocationID\",\"Borough\",\"Zone\",\"service_zone\"\r\n",
264+
// &data[..46]
265+
// );
266+
//
267+
// let rows = CsvReader::new(&data).read()?;
268+
// dbg!(&rows);
269+
//
270+
// for row in CsvReader::new(&data).read()? {
271+
// let fields = row
272+
// .fields()
273+
// .into_iter()
274+
// .map(|field_range| String::from_utf8(data[field_range.clone()].to_vec()).unwrap())
275+
// .collect::<Vec<_>>();
276+
//
277+
// dbg!(fields);
278+
// }
279+
//
280+
// Ok(())
281+
// }
282+
264283
#[test]
fn test_mark_inside_quotations() {
    // Each entry is (quote bitset, expected in-between mask).
    let cases: [(u64, u64); 3] = [
        (0b10001000, 0b11110000),
        (0b1001_1001, 0b1110_1110),
        (0b1100_1001, 0b1000_1110),
    ];

    for (quotes, expected) in cases {
        assert_eq!(mark_inside_quotations(quotes), expected);
    }
}
288294
}

src/u8x16.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ impl u8x16 {
4141
unsafe { vceqq_u8(self.0, other.0) }.into()
4242
}
4343

44+
// figure out a better way to do this
45+
// maybe just get the MSB
4446
pub fn bitset(self) -> u16 {
4547
let bs: [u8; 16] = self.into();
4648

0 commit comments

Comments
 (0)