2424#include < string>
2525#include < vector>
2626
/**
 * @brief Returns a copy of @p s without leading and trailing whitespace.
 *
 * Whitespace is whatever std::isspace reports for the current C locale.
 * Interior whitespace is preserved.
 *
 * @param s The string to strip.
 * @return The stripped copy; empty if @p s is empty or consists only of whitespace.
 */
static std::string trim(const std::string &s) {
    // std::isspace requires a value representable as unsigned char.
    const auto isBlank = [](char ch) { return std::isspace(static_cast<unsigned char>(ch)) != 0; };

    std::string::const_iterator first = s.begin();
    std::string::const_iterator last = s.end();

    // Advance past the leading blanks.
    for (; first != last && isBlank(*first); ++first) {
    }
    // Retreat over the trailing blanks, never crossing `first`.
    for (; last != first && isBlank(*(last - 1)); --last) {
    }
    return std::string(first, last);
}
37+
2738int generality (ValueTypeCode type) { // similar to generality in TypeInferenceUtils.cpp but for ValueTypeCode
2839 switch (type) {
2940 case ValueTypeCode::SI8:
@@ -71,65 +82,77 @@ ValueTypeCode inferValueType(const char* line, size_t &pos, char delim) {
7182
7283// Function to infer the data type of string value
7384ValueTypeCode inferValueType (const std::string &valueStr) {
74- // Check if the string represents an integer
85+
86+ if (valueStr.empty ())
87+ return ValueTypeCode::STR;
88+
89+ std::string token;
90+ token = trim (valueStr);
91+ if (valueStr.front () == ' \" ' ) {
92+ if (valueStr.back () != ' \" ' )
93+ return ValueTypeCode::STR;
94+ // Remove the surrounding quotes.
95+ token = valueStr.substr (1 , valueStr.size () - 2 );
96+ if (token.size () == 16 )
97+ return ValueTypeCode::FIXEDSTR16;
98+ }
99+
100+ // Check if the string represents an integer.
75101 bool isInteger = true ;
76- for (char c : valueStr ) {
77- if (!isdigit (c) && c != ' -' && c != ' +' && c != ' ' ) {
102+ for (char c : token ) {
103+ if (!isdigit (c) && c != ' -' && c != ' +' && ! isspace (c) ) {
78104 isInteger = false ;
79105 break ;
80106 }
81107 }
82-
83108 if (isInteger) {
84109 try {
85- int64_t value = std::stoll (valueStr);
86- if (value >= std::numeric_limits<int8_t >::min () && value <= std::numeric_limits<int8_t >::max ()) {
87- return ValueTypeCode::SI8;
88- } else if (value >= 0 && value <= std::numeric_limits<uint8_t >::max ()) {
89- return ValueTypeCode::UI8;
90- } else if (value >= std::numeric_limits<int32_t >::min () && value <= std::numeric_limits<int32_t >::max ()) {
91- return ValueTypeCode::SI32;
92- } else if (value >= 0 && value <= std::numeric_limits<uint32_t >::max ()) {
93- return ValueTypeCode::UI32;
94- } else if (value >= std::numeric_limits<int64_t >::min () && value <= std::numeric_limits<int64_t >::max ()) {
95- return ValueTypeCode::SI64;
96- } else {
97- return ValueTypeCode::UI64;
110+ size_t pos;
111+ int64_t value = std::stoll (token, &pos);
112+ // ensure there were no extra characters that were silently ignored
113+ if (pos == token.size ()) {
114+ if (value >= std::numeric_limits<int8_t >::min () && value <= std::numeric_limits<int8_t >::max ())
115+ return ValueTypeCode::SI8;
116+ else if (value >= 0 && value <= std::numeric_limits<uint8_t >::max ())
117+ return ValueTypeCode::UI8;
118+ else if (value >= std::numeric_limits<int32_t >::min () && value <= std::numeric_limits<int32_t >::max ())
119+ return ValueTypeCode::SI32;
120+ else if (value >= 0 && value <= std::numeric_limits<uint32_t >::max ())
121+ return ValueTypeCode::UI32;
122+ else if (value >= std::numeric_limits<int64_t >::min () && value <= std::numeric_limits<int64_t >::max ())
123+ return ValueTypeCode::SI64;
124+ else
125+ return ValueTypeCode::UI64;
98126 }
99127 } catch (const std::invalid_argument &) {
100- // Continue to next check
128+ // Fall through to string.
101129 } catch (const std::out_of_range &) {
102130 return ValueTypeCode::UI64;
103131 }
104132 }
105-
106- // Check if the string represents a float
133+
134+ // Check if the string represents a float.
107135 try {
108- float fvalue = std::stof (valueStr);
109- if (fvalue >= std::numeric_limits<float >::lowest () && fvalue <= std::numeric_limits<float >::max ()) {
136+ size_t pos;
137+ float fvalue = std::stof (token, &pos);
138+ if (pos == token.size () &&
139+ fvalue >= std::numeric_limits<float >::lowest () && fvalue <= std::numeric_limits<float >::max ())
110140 return ValueTypeCode::F32;
111- }
112- } catch (const std::invalid_argument &) {
113- // Continue to next check
114- } catch (const std::out_of_range &) {
115- // Continue to next check
116- }
117-
118- // Check if the string represents a double
141+ } catch (const std::invalid_argument &) { }
142+ catch (const std::out_of_range &) { }
143+
144+ // Check if the string represents a double.
119145 try {
120- double dvalue = std::stod (valueStr);
121- if (dvalue >= std::numeric_limits<double >::lowest () && dvalue <= std::numeric_limits<double >::max ()) {
146+ size_t pos;
147+ double dvalue = std::stod (token, &pos);
148+ if (pos == token.size () &&
149+ dvalue >= std::numeric_limits<double >::lowest () && dvalue <= std::numeric_limits<double >::max ())
122150 return ValueTypeCode::F64;
123- }
124- } catch (const std::invalid_argument &) {
125- // Continue to next check
126- } catch (const std::out_of_range &) {
127- // Continue to next check
128- }
129-
130- if (valueStr.size () == 16 ) {
151+ } catch (const std::invalid_argument &) { }
152+ catch (const std::out_of_range &) { }
153+
154+ if (token.size () == 16 )
131155 return ValueTypeCode::FIXEDSTR16;
132- }
133156 return ValueTypeCode::STR;
134157}
135158
@@ -139,19 +162,28 @@ FileMetaData generateFileMetaData(const std::string &filename, char delim, size_
139162 if (!file.is_open ())
140163 throw std::runtime_error (" Cannot open file: " + filename);
141164 std::string line;
142- std::vector<ValueTypeCode> colTypes; // will be resized once we know numCols
165+ std::vector<ValueTypeCode> colTypes; // resized once we know numCols
143166 bool firstLine = true ;
144167 size_t row = 0 ;
145168 while (std::getline (file, line) && row < sampleRows) {
146169 // Discard empty rows.
147- if (isEmptyLine (line))
170+ if (isEmptyLine (line))
148171 continue ;
149-
172+
173+ // If a token may span multiple lines, join subsequent lines until quotes match.
174+ size_t quoteCount = std::count (line.begin (), line.end (), ' \" ' );
175+ while ((quoteCount % 2 ) != 0 ) {
176+ std::string nextLine;
177+ if (!std::getline (file, nextLine))
178+ break ;
179+ line += " \n " + nextLine;
180+ quoteCount = std::count (line.begin (), line.end (), ' \" ' );
181+ }
182+
150183 size_t pos = 0 ;
151184 size_t col = 0 ;
152- // On first row , determine number of columns.
185+ // On first non-empty line , determine number of columns.
153186 if (firstLine) {
154- // Count the number of delimiters + 1
155187 size_t ncols = 1 ;
156188 for (char c : line)
157189 if (c == delim)
@@ -162,6 +194,7 @@ FileMetaData generateFileMetaData(const std::string &filename, char delim, size_
162194 // Process each token.
163195 while (pos < line.size () && col < colTypes.size ()) {
164196 size_t tempPos = pos;
197+ // Extract token using the existing inferValueType helper.
165198 ValueTypeCode tokenType = inferValueType (line.c_str (), tempPos, delim);
166199 // Promote type if needed.
167200 if (generality (tokenType) > generality (colTypes[col]))
@@ -172,119 +205,25 @@ FileMetaData generateFileMetaData(const std::string &filename, char delim, size_
172205 row++;
173206 }
174207 file.close ();
208+
175209 std::vector<std::string> labels;
176- size_t numCols= colTypes.size ();
210+ size_t numCols = colTypes.size ();
177211 bool isSingleValueType = true ;
178212 ValueTypeCode firstValueType = colTypes[0 ];
179213 ValueTypeCode maxValueType = colTypes[0 ];
180214 for (size_t i = 0 ; i < numCols; i++) {
181215 labels.push_back (" col_" + std::to_string (i));
182- if (generality (colTypes[i]) > generality (maxValueType)) {
216+ if (generality (colTypes[i]) > generality (maxValueType))
183217 maxValueType = colTypes[i];
184- }
185- if (colTypes[i] != firstValueType) {
218+ if (colTypes[i] != firstValueType)
186219 isSingleValueType = false ;
187- }
188220 }
189221 if (isSingleValueType) {
190222 colTypes.clear ();
191223 labels.clear ();
192224 colTypes.push_back (maxValueType);
193225 }
194- if (isMatrix) {
226+ if (isMatrix)
195227 return FileMetaData (row, numCols, true , {maxValueType}, {});
196- }
197- FileMetaData fmd = FileMetaData (row, numCols, isSingleValueType, colTypes, labels);
198- return fmd;
199- }
200-
201- void readCsvLine (File* file, size_t row, char delim, size_t numCols, uint8_t **rawCols, ValueTypeCode* colTypes, bool genTypes = true ) {
202- size_t pos, col = 0 ;
203- ValueTypeCode colType = ValueTypeCode::INVALID;
204- while (1 ) {
205- if (colTypes != nullptr ){
206- colType = colTypes[col];
207- }else if (!genTypes){
208- throw std::runtime_error (" ReadCsvFile::apply: colTypes must be provided if genTypes is false" );
209- }else { // set colTypes to most specific value type possible
210- colTypes= new ValueTypeCode[numCols];
211- for (size_t i = 0 ; i < numCols; i++) {
212- colTypes[i] = ValueTypeCode::SI8;
213- }
214- }
215- if (genTypes){
216- colType = inferValueType (file->line , pos, delim);
217- if (generality (colType) > generality (colTypes[col])) {
218- colTypes[col] = colType;
219- }
220- }
221- switch (colTypes[col]) {
222- case ValueTypeCode::SI8:
223- int8_t val_si8;
224- convertCstr (file->line + pos, &val_si8);
225- reinterpret_cast <int8_t *>(rawCols[col])[row] = val_si8;
226- break ;
227- case ValueTypeCode::SI32:
228- int32_t val_si32;
229- convertCstr (file->line + pos, &val_si32);
230- reinterpret_cast <int32_t *>(rawCols[col])[row] = val_si32;
231- break ;
232- case ValueTypeCode::SI64:
233- int64_t val_si64;
234- convertCstr (file->line + pos, &val_si64);
235- reinterpret_cast <int64_t *>(rawCols[col])[row] = val_si64;
236- break ;
237- case ValueTypeCode::UI8:
238- uint8_t val_ui8;
239- convertCstr (file->line + pos, &val_ui8);
240- reinterpret_cast <uint8_t *>(rawCols[col])[row] = val_ui8;
241- break ;
242- case ValueTypeCode::UI32:
243- uint32_t val_ui32;
244- convertCstr (file->line + pos, &val_ui32);
245- reinterpret_cast <uint32_t *>(rawCols[col])[row] = val_ui32;
246- break ;
247- case ValueTypeCode::UI64:
248- uint64_t val_ui64;
249- convertCstr (file->line + pos, &val_ui64);
250- reinterpret_cast <uint64_t *>(rawCols[col])[row] = val_ui64;
251- break ;
252- case ValueTypeCode::F32:
253- float val_f32;
254- convertCstr (file->line + pos, &val_f32);
255- reinterpret_cast <float *>(rawCols[col])[row] = val_f32;
256- break ;
257- case ValueTypeCode::F64:
258- double val_f64;
259- convertCstr (file->line + pos, &val_f64);
260- reinterpret_cast <double *>(rawCols[col])[row] = val_f64;
261- break ;
262- case ValueTypeCode::STR: {
263- std::string val_str = " " ;
264- pos = setCString (file, pos, &val_str, delim);
265- reinterpret_cast <std::string *>(rawCols[col])[row] = val_str;
266- break ;
267- }
268- case ValueTypeCode::FIXEDSTR16: {
269- std::string val_str = " " ;
270- pos = setCString (file, pos, &val_str, delim);
271- reinterpret_cast <FixedStr16 *>(rawCols[col])[row] = FixedStr16 (val_str);
272- break ;
273- }
274- default :
275- throw std::runtime_error (" ReadCsvFile::apply: unknown value type code" );
276- }
277-
278- if (++col >= numCols) {
279- break ;
280- }
281-
282- // TODO We could even exploit the fact that the strtoX functions
283- // can return a pointer to the first character after the parsed
284- // input, then we wouldn't have to search for that ourselves,
285- // just would need to check if it is really the delimiter.
286- while (file->line [pos] != delim)
287- pos++;
288- pos++; // skip delimiter
289- }
228+ return FileMetaData (row, numCols, isSingleValueType, colTypes, labels);
290229}
0 commit comments