2424#include <string>
2525#include <vector>
2626
// Helper function to trim leading and trailing whitespace.
//
// Returns a copy of `s` without the characters classified as whitespace by
// std::isspace at its beginning and at its end. Whitespace in the interior of
// the string is kept. The result is empty if `s` is empty or consists of
// whitespace only.
static std::string trim(const std::string &s) {
    size_t start = 0;
    // The cast to unsigned char is required: passing a negative char value to
    // std::isspace is undefined behaviour.
    while (start < s.size() && std::isspace(static_cast<unsigned char>(s[start])))
        start++;
    size_t end = s.size();
    // `end` is one past the last kept character; never move it below `start`.
    while (end > start && std::isspace(static_cast<unsigned char>(s[end - 1])))
        end--;
    return s.substr(start, end - start);
}
37+
2738int generality(ValueTypeCode type) { // similar to generality in TypeInferenceUtils.cpp but for ValueTypeCode
2839 switch (type) {
2940 case ValueTypeCode::SI8:
@@ -51,12 +62,10 @@ int generality(ValueTypeCode type) { // similar to generality in TypeInferenceUt
5162
// Helper function to check if a line is empty or contains only whitespace.
//
// Returns true for the empty string (std::all_of over an empty range is true)
// and for strings in which every character is whitespace according to
// std::isspace; returns false as soon as one non-whitespace character exists.
bool isEmptyLine(const std::string &line) {
    // The lambda takes unsigned char so that std::isspace never receives a
    // negative value, which would be undefined behaviour.
    return std::all_of(line.begin(), line.end(), [](unsigned char c) { return std::isspace(c) != 0; });
}
5867
59- ValueTypeCode inferValueType(const char* line, size_t &pos, char delim) {
68+ ValueTypeCode inferValueType(const char * line, size_t &pos, char delim) {
6069 std::string field;
6170 // Extract field until delimiter
6271 while (line[pos] != delim && line[pos] != '\0') {
@@ -71,65 +80,79 @@ ValueTypeCode inferValueType(const char* line, size_t &pos, char delim) {
7180
7281// Function to infer the data type of string value
7382ValueTypeCode inferValueType(const std::string &valueStr) {
74- // Check if the string represents an integer
83+
84+ if (valueStr.empty())
85+ return ValueTypeCode::STR;
86+
87+ std::string token;
88+ token = trim(valueStr);
89+ if (valueStr.front() == '\"') {
90+ if (valueStr.back() != '\"')
91+ return ValueTypeCode::STR;
92+ // Remove the surrounding quotes.
93+ token = valueStr.substr(1, valueStr.size() - 2);
94+ if (token.size() == 16)
95+ return ValueTypeCode::FIXEDSTR16;
96+ }
97+
98+ // Check if the string represents an integer.
7599 bool isInteger = true;
76- for (char c : valueStr ) {
77- if (!isdigit(c) && c != '-' && c != '+' && c != ' ' ) {
100+ for (char c : token ) {
101+ if (!isdigit(c) && c != '-' && c != '+' && !isspace(c) ) {
78102 isInteger = false;
79103 break;
80104 }
81105 }
82-
83106 if (isInteger) {
84107 try {
85- int64_t value = std::stoll(valueStr);
86- if (value >= std::numeric_limits<int8_t>::min() && value <= std::numeric_limits<int8_t>::max()) {
87- return ValueTypeCode::SI8;
88- } else if (value >= 0 && value <= std::numeric_limits<uint8_t>::max()) {
89- return ValueTypeCode::UI8;
90- } else if (value >= std::numeric_limits<int32_t>::min() && value <= std::numeric_limits<int32_t>::max()) {
91- return ValueTypeCode::SI32;
92- } else if (value >= 0 && value <= std::numeric_limits<uint32_t>::max()) {
93- return ValueTypeCode::UI32;
94- } else if (value >= std::numeric_limits<int64_t>::min() && value <= std::numeric_limits<int64_t>::max()) {
95- return ValueTypeCode::SI64;
96- } else {
97- return ValueTypeCode::UI64;
108+ size_t pos;
109+ int64_t value = std::stoll(token, &pos);
110+ // ensure there were no extra characters that were silently ignored
111+ if (pos == token.size()) {
112+ if (value >= std::numeric_limits<int8_t>::min() && value <= std::numeric_limits<int8_t>::max())
113+ return ValueTypeCode::SI8;
114+ else if (value >= 0 && value <= std::numeric_limits<uint8_t>::max())
115+ return ValueTypeCode::UI8;
116+ else if (value >= std::numeric_limits<int32_t>::min() && value <= std::numeric_limits<int32_t>::max())
117+ return ValueTypeCode::SI32;
118+ else if (value >= 0 && value <= std::numeric_limits<uint32_t>::max())
119+ return ValueTypeCode::UI32;
120+ else if (value >= std::numeric_limits<int64_t>::min() && value <= std::numeric_limits<int64_t>::max())
121+ return ValueTypeCode::SI64;
122+ else
123+ return ValueTypeCode::UI64;
98124 }
99125 } catch (const std::invalid_argument &) {
100- // Continue to next check
126+ // Fall through to string.
101127 } catch (const std::out_of_range &) {
102128 return ValueTypeCode::UI64;
103129 }
104130 }
105131
106- // Check if the string represents a float
132+ // Check if the string represents a float.
107133 try {
108- float fvalue = std::stof(valueStr);
109- if (fvalue >= std::numeric_limits<float>::lowest() && fvalue <= std::numeric_limits<float>::max()) {
134+ size_t pos;
135+ float fvalue = std::stof(token, &pos);
136+ if (pos == token.size() && fvalue >= std::numeric_limits<float>::lowest() &&
137+ fvalue <= std::numeric_limits<float>::max())
110138 return ValueTypeCode::F32;
111- }
112139 } catch (const std::invalid_argument &) {
113- // Continue to next check
114140 } catch (const std::out_of_range &) {
115- // Continue to next check
116141 }
117142
118- // Check if the string represents a double
143+ // Check if the string represents a double.
119144 try {
120- double dvalue = std::stod(valueStr);
121- if (dvalue >= std::numeric_limits<double>::lowest() && dvalue <= std::numeric_limits<double>::max()) {
145+ size_t pos;
146+ double dvalue = std::stod(token, &pos);
147+ if (pos == token.size() && dvalue >= std::numeric_limits<double>::lowest() &&
148+ dvalue <= std::numeric_limits<double>::max())
122149 return ValueTypeCode::F64;
123- }
124150 } catch (const std::invalid_argument &) {
125- // Continue to next check
126151 } catch (const std::out_of_range &) {
127- // Continue to next check
128152 }
129153
130- if (valueStr .size() == 16) {
154+ if (token .size() == 16)
131155 return ValueTypeCode::FIXEDSTR16;
132- }
133156 return ValueTypeCode::STR;
134157}
135158
@@ -139,19 +162,28 @@ FileMetaData generateFileMetaData(const std::string &filename, char delim, size_
139162 if (!file.is_open())
140163 throw std::runtime_error("Cannot open file: " + filename);
141164 std::string line;
142- std::vector<ValueTypeCode> colTypes; // will be resized once we know numCols
165+ std::vector<ValueTypeCode> colTypes; // resized once we know numCols
143166 bool firstLine = true;
144167 size_t row = 0;
145168 while (std::getline(file, line) && row < sampleRows) {
146169 // Discard empty rows.
147- if(isEmptyLine(line))
170+ if (isEmptyLine(line))
148171 continue;
149172
173+ // If a token may span multiple lines, join subsequent lines until quotes match.
174+ size_t quoteCount = std::count(line.begin(), line.end(), '\"');
175+ while ((quoteCount % 2) != 0) {
176+ std::string nextLine;
177+ if (!std::getline(file, nextLine))
178+ break;
179+ line += "\n" + nextLine;
180+ quoteCount = std::count(line.begin(), line.end(), '\"');
181+ }
182+
150183 size_t pos = 0;
151184 size_t col = 0;
152- // On first row , determine number of columns.
185+ // On first non-empty line , determine number of columns.
153186 if (firstLine) {
154- // Count the number of delimiters + 1
155187 size_t ncols = 1;
156188 for (char c : line)
157189 if (c == delim)
@@ -162,6 +194,7 @@ FileMetaData generateFileMetaData(const std::string &filename, char delim, size_
162194 // Process each token.
163195 while (pos < line.size() && col < colTypes.size()) {
164196 size_t tempPos = pos;
197+ // Extract token using the existing inferValueType helper.
165198 ValueTypeCode tokenType = inferValueType(line.c_str(), tempPos, delim);
166199 // Promote type if needed.
167200 if (generality(tokenType) > generality(colTypes[col]))
@@ -172,119 +205,25 @@ FileMetaData generateFileMetaData(const std::string &filename, char delim, size_
172205 row++;
173206 }
174207 file.close();
208+
175209 std::vector<std::string> labels;
176- size_t numCols= colTypes.size();
210+ size_t numCols = colTypes.size();
177211 bool isSingleValueType = true;
178212 ValueTypeCode firstValueType = colTypes[0];
179213 ValueTypeCode maxValueType = colTypes[0];
180214 for (size_t i = 0; i < numCols; i++) {
181215 labels.push_back("col_" + std::to_string(i));
182- if (generality(colTypes[i]) > generality(maxValueType)) {
216+ if (generality(colTypes[i]) > generality(maxValueType))
183217 maxValueType = colTypes[i];
184- }
185- if (colTypes[i] != firstValueType) {
218+ if (colTypes[i] != firstValueType)
186219 isSingleValueType = false;
187- }
188220 }
189221 if (isSingleValueType) {
190222 colTypes.clear();
191223 labels.clear();
192224 colTypes.push_back(maxValueType);
193225 }
194- if (isMatrix) {
226+ if (isMatrix)
195227 return FileMetaData(row, numCols, true, {maxValueType}, {});
196- }
197- FileMetaData fmd = FileMetaData(row, numCols, isSingleValueType, colTypes, labels);
198- return fmd;
199- }
200-
201- void readCsvLine(File* file, size_t row, char delim, size_t numCols, uint8_t **rawCols, ValueTypeCode* colTypes, bool genTypes = true) {
202- size_t pos, col = 0;
203- ValueTypeCode colType = ValueTypeCode::INVALID;
204- while (1) {
205- if (colTypes != nullptr){
206- colType = colTypes[col];
207- }else if(!genTypes){
208- throw std::runtime_error("ReadCsvFile::apply: colTypes must be provided if genTypes is false");
209- }else{ // set colTypes to most specific value type possible
210- colTypes= new ValueTypeCode[numCols];
211- for (size_t i = 0; i < numCols; i++) {
212- colTypes[i] = ValueTypeCode::SI8;
213- }
214- }
215- if (genTypes){
216- colType = inferValueType(file->line, pos, delim);
217- if (generality(colType) > generality(colTypes[col])) {
218- colTypes[col] = colType;
219- }
220- }
221- switch (colTypes[col]) {
222- case ValueTypeCode::SI8:
223- int8_t val_si8;
224- convertCstr(file->line + pos, &val_si8);
225- reinterpret_cast<int8_t *>(rawCols[col])[row] = val_si8;
226- break;
227- case ValueTypeCode::SI32:
228- int32_t val_si32;
229- convertCstr(file->line + pos, &val_si32);
230- reinterpret_cast<int32_t *>(rawCols[col])[row] = val_si32;
231- break;
232- case ValueTypeCode::SI64:
233- int64_t val_si64;
234- convertCstr(file->line + pos, &val_si64);
235- reinterpret_cast<int64_t *>(rawCols[col])[row] = val_si64;
236- break;
237- case ValueTypeCode::UI8:
238- uint8_t val_ui8;
239- convertCstr(file->line + pos, &val_ui8);
240- reinterpret_cast<uint8_t *>(rawCols[col])[row] = val_ui8;
241- break;
242- case ValueTypeCode::UI32:
243- uint32_t val_ui32;
244- convertCstr(file->line + pos, &val_ui32);
245- reinterpret_cast<uint32_t *>(rawCols[col])[row] = val_ui32;
246- break;
247- case ValueTypeCode::UI64:
248- uint64_t val_ui64;
249- convertCstr(file->line + pos, &val_ui64);
250- reinterpret_cast<uint64_t *>(rawCols[col])[row] = val_ui64;
251- break;
252- case ValueTypeCode::F32:
253- float val_f32;
254- convertCstr(file->line + pos, &val_f32);
255- reinterpret_cast<float *>(rawCols[col])[row] = val_f32;
256- break;
257- case ValueTypeCode::F64:
258- double val_f64;
259- convertCstr(file->line + pos, &val_f64);
260- reinterpret_cast<double *>(rawCols[col])[row] = val_f64;
261- break;
262- case ValueTypeCode::STR: {
263- std::string val_str = "";
264- pos = setCString(file, pos, &val_str, delim);
265- reinterpret_cast<std::string *>(rawCols[col])[row] = val_str;
266- break;
267- }
268- case ValueTypeCode::FIXEDSTR16: {
269- std::string val_str = "";
270- pos = setCString(file, pos, &val_str, delim);
271- reinterpret_cast<FixedStr16 *>(rawCols[col])[row] = FixedStr16(val_str);
272- break;
273- }
274- default:
275- throw std::runtime_error("ReadCsvFile::apply: unknown value type code");
276- }
277-
278- if (++col >= numCols) {
279- break;
280- }
281-
282- // TODO We could even exploit the fact that the strtoX functions
283- // can return a pointer to the first character after the parsed
284- // input, then we wouldn't have to search for that ourselves,
285- // just would need to check if it is really the delimiter.
286- while (file->line[pos] != delim)
287- pos++;
288- pos++; // skip delimiter
289- }
228+ return FileMetaData(row, numCols, isSingleValueType, colTypes, labels);
290229}
0 commit comments