@@ -37,7 +37,6 @@ impl CsvReader {
3737 quotation_bitsets,
3838 }
3939 }
40-
4140 pub fn read ( & mut self ) -> Result < Vec < Row > > {
4241 // todo! what happens when a csv is greater than 128 bytes?
4342 // probably would want something like:
@@ -46,93 +45,89 @@ impl CsvReader {
4645
4746 let mut current_row = Vec :: new ( ) ;
4847
49- let mut bit_cursor = 0 ;
48+ let cursor = 0 ;
5049
5150 for i in 0 ..self . quotation_bitsets . len ( ) {
52- /*
53- Things we need to do:
54- 1. find which commas are truly field separators and which are not.
55- 2. BWO step 1, we'd need to identify quoted regions. CSV uses a double-quote convention
56- */
57-
58- // removes all escaped quotations
59- let valid_quotations = {
60- let quotations = self . quotation_bitsets [ i] ;
61-
62- let escaped_quotations = quotations & ( quotations << 1 ) ;
63- let escaped_quotations = escaped_quotations | ( escaped_quotations >> 1 ) ;
64-
65- quotations & !escaped_quotations
66- } ;
67-
68- // masks all characters inside quotations
69- let inside_quotations = parallel_prefix_xor_64 ( valid_quotations) ;
70-
71- let mut valid_commas = self . comma_bitsets [ i] & !inside_quotations;
72- let mut valid_whitespace = self . whitespace_bitsets [ i] & !inside_quotations;
51+ let valid_quotations = remove_escaped_quotations ( self . quotation_bitsets [ i] ) ;
52+ let outside_quotations = !mark_inside_quotations ( valid_quotations) ;
7353
74- dbg ! ( valid_commas, valid_whitespace) ;
54+ let mut valid_commas = self . comma_bitsets [ i] & outside_quotations;
55+ let mut valid_whitespace = self . whitespace_bitsets [ i] & outside_quotations;
7556
76- let mut first_comma = valid_commas. leading_zeros ( ) as usize ;
77- let mut first_lf = valid_whitespace. leading_zeros ( ) as usize ;
78-
79- if first_comma == 0 && first_lf == 0 {
80- // congratulations, we just skipped 64 bytes.
57+ if valid_commas == 0 && valid_whitespace == 0 {
8158 continue ;
8259 }
8360
84- let mut start_field_cursor = 0 ;
85-
86- while first_comma != 0 || first_lf != 0 {
87- let end = first_comma. min ( first_lf) ;
61+ let mut bitset_cursor = 0 ;
8862
89- current_row. push ( Range {
90- start : bit_cursor + start_field_cursor,
91- end : bit_cursor + end,
92- } ) ;
63+ loop {
64+ let first_comma = valid_commas. leading_zeros ( ) as usize ;
65+ let first_whitespace = valid_whitespace. leading_zeros ( ) as usize ;
9366
94- start_field_cursor = end + 1 ;
67+ let bits_traveled = first_comma . min ( first_whitespace ) ;
9568
96- if first_comma < first_lf {
97- valid_commas &= !( 1u64 << ( 63 - first_comma) ) ;
98- first_comma = valid_commas. leading_zeros ( ) as usize ;
99- } else {
100- rows. push ( Row :: from ( current_row. clone ( ) ) ) ;
69+ if bits_traveled == 64 {
70+ break ;
71+ }
10172
102- valid_whitespace &= !( 1u64 << ( 63 - first_lf) ) ;
103- first_lf = valid_whitespace. leading_zeros ( ) as usize ;
73+ current_row. push (
74+ Range {
75+ start : cursor + bitset_cursor,
76+ end : cursor + bitset_cursor + bits_traveled
77+ }
78+ ) ;
10479
105- return Ok ( rows) ;
80+ if first_whitespace < first_comma {
81+ rows. push ( Row :: from ( current_row. clone ( ) ) ) ;
82+ current_row. clear ( ) ;
83+ break ;
10684 }
85+
86+ bitset_cursor += bits_traveled + 1 ;
87+ valid_commas <<= bits_traveled + 1 ;
88+ valid_whitespace <<= bits_traveled + 1 ;
10789 }
10890 }
10991
11092 Ok ( rows)
11193 }
11294}
11395
114- fn build_u64 ( chunk : & [ u8x16 ] , broadcast : u8x16 ) -> u64 {
115- let mut packed: u64 = 0 ;
116- for ( i, & c) in chunk. iter ( ) . enumerate ( ) {
117- let word = c. eq ( broadcast) . bitset ( ) as u64 ;
118- packed |= word << ( 48 - i * 16 ) ;
119- }
/// Drops escaped quotes from a quote bitset, keeping only quotes that open or close a field.
///
/// CSV escapes a quote inside a quoted field by doubling it (`""`), so a run of `n` adjacent
/// quotes toggles the quoted state `n % 2` times. Runs of even length are removed entirely;
/// runs of odd length are reduced to a single quote (the highest bit of the run). Which quote
/// of an odd run survives does not matter to callers that only mask non-quote characters,
/// because every position inside a run is itself a quote.
///
/// A pair-only formula would clear every quote in a run of three or more, so a field such as
/// `"""a,b"""` would lose its delimiters; counting run parity avoids that.
///
/// Runs that continue into a neighbouring 64-bit word are not visible here and are treated as
/// ending at the word boundary.
fn remove_escaped_quotations(q: u64) -> u64 {
    // Bit i is set when both q[i] and q[i - 1] are quotes.
    let pairs = q & (q << 1);

    // Fast path: no run of three or more quotes, so every adjacent pair is exactly one
    // escaped quote and can be cleared as a unit.
    if pairs & (pairs << 1) == 0 {
        return q & !(pairs | (pairs >> 1));
    }

    // Slow path: walk each run of consecutive quotes and keep one bit per odd-length run.
    let mut remaining = q;
    let mut kept = 0u64;
    while remaining != 0 {
        let start = remaining.trailing_zeros();
        let len = (remaining >> start).trailing_ones();

        if len % 2 == 1 {
            kept |= 1u64 << (start + len - 1);
        }

        // Clear the run just handled; a run that reaches bit 63 would need a shift by 64,
        // which `checked_shl` reports as `None`.
        remaining &= u64::MAX.checked_shl(start + len).unwrap_or(0);
    }

    kept
}
123102
/// Marks every bit that lies between a pair of quotes.
///
/// A prefix XOR (running parity from bit 0 upwards) is built with a log-step cascade of
/// shifted XORs, and the result is then shifted left by one. For each quote pair the output
/// has ones from the bit just above the lower quote up to and including the higher quote;
/// the lower quote itself is 0. With the most significant bit holding the first byte, the
/// lower quote is the closing one. This is fine because the mask is only used to hide commas
/// and whitespace between a quote pair, and those never share a position with a quote.
///
/// A quote with no partner in the same word marks every bit above it, up to bit 63.
#[inline]
fn mark_inside_quotations(mut x: u64) -> u64 {
    // After step k, bit i holds the XOR of the 2^k input bits ending at i.
    x ^= x << 1;
    x ^= x << 2;
    x ^= x << 4;
    x ^= x << 8;
    x ^= x << 16;
    x ^= x << 32;

    // Move the marked range up by one so the lower quote is excluded and the higher included.
    x << 1
}
117+
118+ // todo, find a quicker way to do this
119+ #[ inline]
120+ fn build_u64 ( chunk : & [ u8x16 ] , broadcast : u8x16 ) -> u64 {
121+ let mut packed: u64 = 0 ;
122+ for ( i, & c) in chunk. iter ( ) . enumerate ( ) {
123+ let word = c. eq ( broadcast) . bitset ( ) as u64 ;
124+ packed |= word << ( 48 - i * 16 ) ;
125+ }
126+
127+ packed
134128}
135129
130+
136131#[ cfg( test) ]
137132mod tests {
138133 use super :: * ;
@@ -261,28 +256,39 @@ mod tests {
261256 Ok ( ( ) )
262257 }
263258
259+ // #[test]
260+ // fn read_taxi_zone_lookup() -> Result<()> {
261+ // let data = std::fs::read("taxi_zone_lookup.csv")?;
262+ // assert_eq!(
263+ // b"\"LocationID\",\"Borough\",\"Zone\",\"service_zone\"\r\n",
264+ // &data[..46]
265+ // );
266+ //
267+ // let rows = CsvReader::new(&data).read()?;
268+ // dbg!(&rows);
269+ //
270+ // for row in CsvReader::new(&data).read()? {
271+ // let fields = row
272+ // .fields()
273+ // .into_iter()
274+ // .map(|field_range| String::from_utf8(data[field_range.clone()].to_vec()).unwrap())
275+ // .collect::<Vec<_>>();
276+ //
277+ // dbg!(fields);
278+ // }
279+ //
280+ // Ok(())
281+ // }
282+
#[test]
fn test_mark_inside_quotations() {
    // No quotes: nothing is marked.
    assert_eq!(mark_inside_quotations(0), 0);

    // A single pair marks the span up to and including the higher quote.
    let res = mark_inside_quotations(0b1000_1000);
    assert_eq!(res, 0b1111_0000);

    // Two adjacent pairs are marked independently.
    let res2 = mark_inside_quotations(0b1001_1001);
    assert_eq!(res2, 0b1110_1110);

    // Back-to-back quotes form a pair with nothing in between.
    let res4 = mark_inside_quotations(0b1100_1001);
    assert_eq!(res4, 0b1000_1110);

    // A quote without a partner in the word marks every bit above it.
    assert_eq!(mark_inside_quotations(0b1000), u64::MAX << 4);
}
288294}
0 commit comments