2424#include < string>
2525#include < vector>
2626
// Helper function to trim leading and trailing whitespace.
//
// Returns a copy of `s` without the whitespace characters (as classified by
// std::isspace) at its beginning and end. Whitespace inside the string is kept.
// An empty or whitespace-only input yields an empty string.
static std::string trim(const std::string &s) {
    // Advance past leading whitespace. The cast to unsigned char is required
    // because std::isspace is undefined for negative char values.
    size_t start = 0;
    while (start < s.size() && std::isspace(static_cast<unsigned char>(s[start])))
        start++;

    // Walk back over trailing whitespace, never crossing `start`.
    size_t end = s.size();
    while (end > start && std::isspace(static_cast<unsigned char>(s[end - 1])))
        end--;

    return s.substr(start, end - start);
}
37+
2738int generality (ValueTypeCode type) { // similar to generality in TypeInferenceUtils.cpp but for ValueTypeCode
2839 switch (type) {
2940 case ValueTypeCode::SI8:
@@ -51,12 +62,10 @@ int generality(ValueTypeCode type) { // similar to generality in TypeInferenceUt
5162
// Helper function to check if a line is empty or contains only whitespace.
//
// Returns true when every character of `line` is whitespace according to
// std::isspace; an empty string therefore also counts as an empty line.
bool isEmptyLine(const std::string &line) {
    // The lambda takes unsigned char so that std::isspace never sees a negative value.
    return std::all_of(line.begin(), line.end(), [](unsigned char c) { return std::isspace(c) != 0; });
}
5867
59- ValueTypeCode inferValueType (const char * line, size_t &pos, char delim) {
68+ ValueTypeCode inferValueType (const char * line, size_t &pos, char delim) {
6069 std::string field;
6170 // Extract field until delimiter
6271 while (line[pos] != delim && line[pos] != ' \0 ' ) {
@@ -71,65 +80,79 @@ ValueTypeCode inferValueType(const char* line, size_t &pos, char delim) {
7180
7281// Function to infer the data type of string value
7382ValueTypeCode inferValueType (const std::string &valueStr) {
74- // Check if the string represents an integer
83+
84+ if (valueStr.empty ())
85+ return ValueTypeCode::STR;
86+
87+ std::string token;
88+ token = trim (valueStr);
89+ if (valueStr.front () == ' \" ' ) {
90+ if (valueStr.back () != ' \" ' )
91+ return ValueTypeCode::STR;
92+ // Remove the surrounding quotes.
93+ token = valueStr.substr (1 , valueStr.size () - 2 );
94+ if (token.size () == 16 )
95+ return ValueTypeCode::FIXEDSTR16;
96+ }
97+
98+ // Check if the string represents an integer.
7599 bool isInteger = true ;
76- for (char c : valueStr ) {
77- if (!isdigit (c) && c != ' -' && c != ' +' && c != ' ' ) {
100+ for (char c : token ) {
101+ if (!isdigit (c) && c != ' -' && c != ' +' && ! isspace (c) ) {
78102 isInteger = false ;
79103 break ;
80104 }
81105 }
82-
83106 if (isInteger) {
84107 try {
85- int64_t value = std::stoll (valueStr);
86- if (value >= std::numeric_limits<int8_t >::min () && value <= std::numeric_limits<int8_t >::max ()) {
87- return ValueTypeCode::SI8;
88- } else if (value >= 0 && value <= std::numeric_limits<uint8_t >::max ()) {
89- return ValueTypeCode::UI8;
90- } else if (value >= std::numeric_limits<int32_t >::min () && value <= std::numeric_limits<int32_t >::max ()) {
91- return ValueTypeCode::SI32;
92- } else if (value >= 0 && value <= std::numeric_limits<uint32_t >::max ()) {
93- return ValueTypeCode::UI32;
94- } else if (value >= std::numeric_limits<int64_t >::min () && value <= std::numeric_limits<int64_t >::max ()) {
95- return ValueTypeCode::SI64;
96- } else {
97- return ValueTypeCode::UI64;
108+ size_t pos;
109+ int64_t value = std::stoll (token, &pos);
110+ // ensure there were no extra characters that were silently ignored
111+ if (pos == token.size ()) {
112+ if (value >= std::numeric_limits<int8_t >::min () && value <= std::numeric_limits<int8_t >::max ())
113+ return ValueTypeCode::SI8;
114+ else if (value >= 0 && value <= std::numeric_limits<uint8_t >::max ())
115+ return ValueTypeCode::UI8;
116+ else if (value >= std::numeric_limits<int32_t >::min () && value <= std::numeric_limits<int32_t >::max ())
117+ return ValueTypeCode::SI32;
118+ else if (value >= 0 && value <= std::numeric_limits<uint32_t >::max ())
119+ return ValueTypeCode::UI32;
120+ else if (value >= std::numeric_limits<int64_t >::min () && value <= std::numeric_limits<int64_t >::max ())
121+ return ValueTypeCode::SI64;
122+ else
123+ return ValueTypeCode::UI64;
98124 }
99125 } catch (const std::invalid_argument &) {
100- // Continue to next check
126+ // Fall through to string.
101127 } catch (const std::out_of_range &) {
102128 return ValueTypeCode::UI64;
103129 }
104130 }
105131
106- // Check if the string represents a float
132+ // Check if the string represents a float.
107133 try {
108- float fvalue = std::stof (valueStr);
109- if (fvalue >= std::numeric_limits<float >::lowest () && fvalue <= std::numeric_limits<float >::max ()) {
134+ size_t pos;
135+ float fvalue = std::stof (token, &pos);
136+ if (pos == token.size () && fvalue >= std::numeric_limits<float >::lowest () &&
137+ fvalue <= std::numeric_limits<float >::max ())
110138 return ValueTypeCode::F32;
111- }
112139 } catch (const std::invalid_argument &) {
113- // Continue to next check
114140 } catch (const std::out_of_range &) {
115- // Continue to next check
116141 }
117142
118- // Check if the string represents a double
143+ // Check if the string represents a double.
119144 try {
120- double dvalue = std::stod (valueStr);
121- if (dvalue >= std::numeric_limits<double >::lowest () && dvalue <= std::numeric_limits<double >::max ()) {
145+ size_t pos;
146+ double dvalue = std::stod (token, &pos);
147+ if (pos == token.size () && dvalue >= std::numeric_limits<double >::lowest () &&
148+ dvalue <= std::numeric_limits<double >::max ())
122149 return ValueTypeCode::F64;
123- }
124150 } catch (const std::invalid_argument &) {
125- // Continue to next check
126151 } catch (const std::out_of_range &) {
127- // Continue to next check
128152 }
129153
130- if (valueStr .size () == 16 ) {
154+ if (token .size () == 16 )
131155 return ValueTypeCode::FIXEDSTR16;
132- }
133156 return ValueTypeCode::STR;
134157}
135158
@@ -139,19 +162,28 @@ FileMetaData generateFileMetaData(const std::string &filename, char delim, size_
139162 if (!file.is_open ())
140163 throw std::runtime_error (" Cannot open file: " + filename);
141164 std::string line;
142- std::vector<ValueTypeCode> colTypes; // will be resized once we know numCols
165+ std::vector<ValueTypeCode> colTypes; // resized once we know numCols
143166 bool firstLine = true ;
144167 size_t row = 0 ;
145168 while (std::getline (file, line) && row < sampleRows) {
146169 // Discard empty rows.
147- if (isEmptyLine (line))
170+ if (isEmptyLine (line))
148171 continue ;
149172
173+ // If a token may span multiple lines, join subsequent lines until quotes match.
174+ size_t quoteCount = std::count (line.begin (), line.end (), ' \" ' );
175+ while ((quoteCount % 2 ) != 0 ) {
176+ std::string nextLine;
177+ if (!std::getline (file, nextLine))
178+ break ;
179+ line += " \n " + nextLine;
180+ quoteCount = std::count (line.begin (), line.end (), ' \" ' );
181+ }
182+
150183 size_t pos = 0 ;
151184 size_t col = 0 ;
152- // On first row , determine number of columns.
185+ // On first non-empty line , determine number of columns.
153186 if (firstLine) {
154- // Count the number of delimiters + 1
155187 size_t ncols = 1 ;
156188 for (char c : line)
157189 if (c == delim)
@@ -162,6 +194,7 @@ FileMetaData generateFileMetaData(const std::string &filename, char delim, size_
162194 // Process each token.
163195 while (pos < line.size () && col < colTypes.size ()) {
164196 size_t tempPos = pos;
197+ // Extract token using the existing inferValueType helper.
165198 ValueTypeCode tokenType = inferValueType (line.c_str (), tempPos, delim);
166199 // Promote type if needed.
167200 if (generality (tokenType) > generality (colTypes[col]))
@@ -172,119 +205,25 @@ FileMetaData generateFileMetaData(const std::string &filename, char delim, size_
172205 row++;
173206 }
174207 file.close ();
208+
175209 std::vector<std::string> labels;
176- size_t numCols= colTypes.size ();
210+ size_t numCols = colTypes.size ();
177211 bool isSingleValueType = true ;
178212 ValueTypeCode firstValueType = colTypes[0 ];
179213 ValueTypeCode maxValueType = colTypes[0 ];
180214 for (size_t i = 0 ; i < numCols; i++) {
181215 labels.push_back (" col_" + std::to_string (i));
182- if (generality (colTypes[i]) > generality (maxValueType)) {
216+ if (generality (colTypes[i]) > generality (maxValueType))
183217 maxValueType = colTypes[i];
184- }
185- if (colTypes[i] != firstValueType) {
218+ if (colTypes[i] != firstValueType)
186219 isSingleValueType = false ;
187- }
188220 }
189221 if (isSingleValueType) {
190222 colTypes.clear ();
191223 labels.clear ();
192224 colTypes.push_back (maxValueType);
193225 }
194- if (isMatrix) {
226+ if (isMatrix)
195227 return FileMetaData (row, numCols, true , {maxValueType}, {});
196- }
197- FileMetaData fmd = FileMetaData (row, numCols, isSingleValueType, colTypes, labels);
198- return fmd;
199- }
200-
201- void readCsvLine (File* file, size_t row, char delim, size_t numCols, uint8_t **rawCols, ValueTypeCode* colTypes, bool genTypes = true ) {
202- size_t pos, col = 0 ;
203- ValueTypeCode colType = ValueTypeCode::INVALID;
204- while (1 ) {
205- if (colTypes != nullptr ){
206- colType = colTypes[col];
207- }else if (!genTypes){
208- throw std::runtime_error (" ReadCsvFile::apply: colTypes must be provided if genTypes is false" );
209- }else { // set colTypes to most specific value type possible
210- colTypes= new ValueTypeCode[numCols];
211- for (size_t i = 0 ; i < numCols; i++) {
212- colTypes[i] = ValueTypeCode::SI8;
213- }
214- }
215- if (genTypes){
216- colType = inferValueType (file->line , pos, delim);
217- if (generality (colType) > generality (colTypes[col])) {
218- colTypes[col] = colType;
219- }
220- }
221- switch (colTypes[col]) {
222- case ValueTypeCode::SI8:
223- int8_t val_si8;
224- convertCstr (file->line + pos, &val_si8);
225- reinterpret_cast <int8_t *>(rawCols[col])[row] = val_si8;
226- break ;
227- case ValueTypeCode::SI32:
228- int32_t val_si32;
229- convertCstr (file->line + pos, &val_si32);
230- reinterpret_cast <int32_t *>(rawCols[col])[row] = val_si32;
231- break ;
232- case ValueTypeCode::SI64:
233- int64_t val_si64;
234- convertCstr (file->line + pos, &val_si64);
235- reinterpret_cast <int64_t *>(rawCols[col])[row] = val_si64;
236- break ;
237- case ValueTypeCode::UI8:
238- uint8_t val_ui8;
239- convertCstr (file->line + pos, &val_ui8);
240- reinterpret_cast <uint8_t *>(rawCols[col])[row] = val_ui8;
241- break ;
242- case ValueTypeCode::UI32:
243- uint32_t val_ui32;
244- convertCstr (file->line + pos, &val_ui32);
245- reinterpret_cast <uint32_t *>(rawCols[col])[row] = val_ui32;
246- break ;
247- case ValueTypeCode::UI64:
248- uint64_t val_ui64;
249- convertCstr (file->line + pos, &val_ui64);
250- reinterpret_cast <uint64_t *>(rawCols[col])[row] = val_ui64;
251- break ;
252- case ValueTypeCode::F32:
253- float val_f32;
254- convertCstr (file->line + pos, &val_f32);
255- reinterpret_cast <float *>(rawCols[col])[row] = val_f32;
256- break ;
257- case ValueTypeCode::F64:
258- double val_f64;
259- convertCstr (file->line + pos, &val_f64);
260- reinterpret_cast <double *>(rawCols[col])[row] = val_f64;
261- break ;
262- case ValueTypeCode::STR: {
263- std::string val_str = " " ;
264- pos = setCString (file, pos, &val_str, delim);
265- reinterpret_cast <std::string *>(rawCols[col])[row] = val_str;
266- break ;
267- }
268- case ValueTypeCode::FIXEDSTR16: {
269- std::string val_str = " " ;
270- pos = setCString (file, pos, &val_str, delim);
271- reinterpret_cast <FixedStr16 *>(rawCols[col])[row] = FixedStr16 (val_str);
272- break ;
273- }
274- default :
275- throw std::runtime_error (" ReadCsvFile::apply: unknown value type code" );
276- }
277-
278- if (++col >= numCols) {
279- break ;
280- }
281-
282- // TODO We could even exploit the fact that the strtoX functions
283- // can return a pointer to the first character after the parsed
284- // input, then we wouldn't have to search for that ourselves,
285- // just would need to check if it is really the delimiter.
286- while (file->line [pos] != delim)
287- pos++;
288- pos++; // skip delimiter
289- }
228+ return FileMetaData (row, numCols, isSingleValueType, colTypes, labels);
290229}
0 commit comments