Skip to content

Commit 68ba270

Browse files
added multi line support
1 parent 3aaee5b commit 68ba270

File tree

2 files changed

+81
-142
lines changed

2 files changed

+81
-142
lines changed

src/runtime/local/io/utils.cpp

Lines changed: 79 additions & 140 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,17 @@
2424
#include <string>
2525
#include <vector>
2626

27+
// Helper function to trim leading and trailing whitespace.
//
// Returns a copy of `s` without the whitespace characters (as classified by
// std::isspace) at its beginning and end. Interior whitespace is preserved.
// An empty or all-whitespace input yields an empty string.
static std::string trim(const std::string &s) {
    // Advance past leading whitespace. The cast to unsigned char is required
    // because std::isspace is undefined for negative char values.
    size_t start = 0;
    while (start < s.size() && std::isspace(static_cast<unsigned char>(s[start])))
        start++;

    // Walk back over trailing whitespace, never crossing `start`.
    size_t end = s.size();
    while (end > start && std::isspace(static_cast<unsigned char>(s[end - 1])))
        end--;

    return s.substr(start, end - start);
}
37+
2738
int generality(ValueTypeCode type) { // similar to generality in TypeInferenceUtils.cpp but for ValueTypeCode
2839
switch (type) {
2940
case ValueTypeCode::SI8:
@@ -51,12 +62,10 @@ int generality(ValueTypeCode type) { // similar to generality in TypeInferenceUt
5162

5263
// Helper function to check if a line is empty or contains only whitespace.
//
// Returns true for the empty string and for strings in which every character
// is classified as whitespace by std::isspace; false otherwise.
bool isEmptyLine(const std::string &line) {
    // The lambda takes unsigned char because std::isspace is undefined for
    // negative char values.
    return std::all_of(line.begin(), line.end(), [](unsigned char c) { return std::isspace(c) != 0; });
}
5867

59-
ValueTypeCode inferValueType(const char* line, size_t &pos, char delim) {
68+
ValueTypeCode inferValueType(const char *line, size_t &pos, char delim) {
6069
std::string field;
6170
// Extract field until delimiter
6271
while (line[pos] != delim && line[pos] != '\0') {
@@ -71,65 +80,79 @@ ValueTypeCode inferValueType(const char* line, size_t &pos, char delim) {
7180

7281
// Function to infer the data type of string value
7382
ValueTypeCode inferValueType(const std::string &valueStr) {
74-
// Check if the string represents an integer
83+
84+
if (valueStr.empty())
85+
return ValueTypeCode::STR;
86+
87+
std::string token;
88+
token = trim(valueStr);
89+
if (valueStr.front() == '\"') {
90+
if (valueStr.back() != '\"')
91+
return ValueTypeCode::STR;
92+
// Remove the surrounding quotes.
93+
token = valueStr.substr(1, valueStr.size() - 2);
94+
if (token.size() == 16)
95+
return ValueTypeCode::FIXEDSTR16;
96+
}
97+
98+
// Check if the string represents an integer.
7599
bool isInteger = true;
76-
for (char c : valueStr) {
77-
if (!isdigit(c) && c != '-' && c != '+' && c != ' ') {
100+
for (char c : token) {
101+
if (!isdigit(c) && c != '-' && c != '+' && !isspace(c)) {
78102
isInteger = false;
79103
break;
80104
}
81105
}
82-
83106
if (isInteger) {
84107
try {
85-
int64_t value = std::stoll(valueStr);
86-
if (value >= std::numeric_limits<int8_t>::min() && value <= std::numeric_limits<int8_t>::max()) {
87-
return ValueTypeCode::SI8;
88-
} else if (value >= 0 && value <= std::numeric_limits<uint8_t>::max()) {
89-
return ValueTypeCode::UI8;
90-
} else if (value >= std::numeric_limits<int32_t>::min() && value <= std::numeric_limits<int32_t>::max()) {
91-
return ValueTypeCode::SI32;
92-
} else if (value >= 0 && value <= std::numeric_limits<uint32_t>::max()) {
93-
return ValueTypeCode::UI32;
94-
} else if (value >= std::numeric_limits<int64_t>::min() && value <= std::numeric_limits<int64_t>::max()) {
95-
return ValueTypeCode::SI64;
96-
} else {
97-
return ValueTypeCode::UI64;
108+
size_t pos;
109+
int64_t value = std::stoll(token, &pos);
110+
// ensure there were no extra characters that were silently ignored
111+
if (pos == token.size()) {
112+
if (value >= std::numeric_limits<int8_t>::min() && value <= std::numeric_limits<int8_t>::max())
113+
return ValueTypeCode::SI8;
114+
else if (value >= 0 && value <= std::numeric_limits<uint8_t>::max())
115+
return ValueTypeCode::UI8;
116+
else if (value >= std::numeric_limits<int32_t>::min() && value <= std::numeric_limits<int32_t>::max())
117+
return ValueTypeCode::SI32;
118+
else if (value >= 0 && value <= std::numeric_limits<uint32_t>::max())
119+
return ValueTypeCode::UI32;
120+
else if (value >= std::numeric_limits<int64_t>::min() && value <= std::numeric_limits<int64_t>::max())
121+
return ValueTypeCode::SI64;
122+
else
123+
return ValueTypeCode::UI64;
98124
}
99125
} catch (const std::invalid_argument &) {
100-
// Continue to next check
126+
// Fall through to string.
101127
} catch (const std::out_of_range &) {
102128
return ValueTypeCode::UI64;
103129
}
104130
}
105131

106-
// Check if the string represents a float
132+
// Check if the string represents a float.
107133
try {
108-
float fvalue = std::stof(valueStr);
109-
if (fvalue >= std::numeric_limits<float>::lowest() && fvalue <= std::numeric_limits<float>::max()) {
134+
size_t pos;
135+
float fvalue = std::stof(token, &pos);
136+
if (pos == token.size() && fvalue >= std::numeric_limits<float>::lowest() &&
137+
fvalue <= std::numeric_limits<float>::max())
110138
return ValueTypeCode::F32;
111-
}
112139
} catch (const std::invalid_argument &) {
113-
// Continue to next check
114140
} catch (const std::out_of_range &) {
115-
// Continue to next check
116141
}
117142

118-
// Check if the string represents a double
143+
// Check if the string represents a double.
119144
try {
120-
double dvalue = std::stod(valueStr);
121-
if (dvalue >= std::numeric_limits<double>::lowest() && dvalue <= std::numeric_limits<double>::max()) {
145+
size_t pos;
146+
double dvalue = std::stod(token, &pos);
147+
if (pos == token.size() && dvalue >= std::numeric_limits<double>::lowest() &&
148+
dvalue <= std::numeric_limits<double>::max())
122149
return ValueTypeCode::F64;
123-
}
124150
} catch (const std::invalid_argument &) {
125-
// Continue to next check
126151
} catch (const std::out_of_range &) {
127-
// Continue to next check
128152
}
129153

130-
if (valueStr.size() == 16) {
154+
if (token.size() == 16)
131155
return ValueTypeCode::FIXEDSTR16;
132-
}
133156
return ValueTypeCode::STR;
134157
}
135158

@@ -139,19 +162,28 @@ FileMetaData generateFileMetaData(const std::string &filename, char delim, size_
139162
if (!file.is_open())
140163
throw std::runtime_error("Cannot open file: " + filename);
141164
std::string line;
142-
std::vector<ValueTypeCode> colTypes; // will be resized once we know numCols
165+
std::vector<ValueTypeCode> colTypes; // resized once we know numCols
143166
bool firstLine = true;
144167
size_t row = 0;
145168
while (std::getline(file, line) && row < sampleRows) {
146169
// Discard empty rows.
147-
if(isEmptyLine(line))
170+
if (isEmptyLine(line))
148171
continue;
149172

173+
// If a token may span multiple lines, join subsequent lines until quotes match.
174+
size_t quoteCount = std::count(line.begin(), line.end(), '\"');
175+
while ((quoteCount % 2) != 0) {
176+
std::string nextLine;
177+
if (!std::getline(file, nextLine))
178+
break;
179+
line += "\n" + nextLine;
180+
quoteCount = std::count(line.begin(), line.end(), '\"');
181+
}
182+
150183
size_t pos = 0;
151184
size_t col = 0;
152-
// On first row, determine number of columns.
185+
// On first non-empty line, determine number of columns.
153186
if (firstLine) {
154-
// Count the number of delimiters + 1
155187
size_t ncols = 1;
156188
for (char c : line)
157189
if (c == delim)
@@ -162,6 +194,7 @@ FileMetaData generateFileMetaData(const std::string &filename, char delim, size_
162194
// Process each token.
163195
while (pos < line.size() && col < colTypes.size()) {
164196
size_t tempPos = pos;
197+
// Extract token using the existing inferValueType helper.
165198
ValueTypeCode tokenType = inferValueType(line.c_str(), tempPos, delim);
166199
// Promote type if needed.
167200
if (generality(tokenType) > generality(colTypes[col]))
@@ -172,119 +205,25 @@ FileMetaData generateFileMetaData(const std::string &filename, char delim, size_
172205
row++;
173206
}
174207
file.close();
208+
175209
std::vector<std::string> labels;
176-
size_t numCols=colTypes.size();
210+
size_t numCols = colTypes.size();
177211
bool isSingleValueType = true;
178212
ValueTypeCode firstValueType = colTypes[0];
179213
ValueTypeCode maxValueType = colTypes[0];
180214
for (size_t i = 0; i < numCols; i++) {
181215
labels.push_back("col_" + std::to_string(i));
182-
if (generality(colTypes[i]) > generality(maxValueType)) {
216+
if (generality(colTypes[i]) > generality(maxValueType))
183217
maxValueType = colTypes[i];
184-
}
185-
if (colTypes[i] != firstValueType) {
218+
if (colTypes[i] != firstValueType)
186219
isSingleValueType = false;
187-
}
188220
}
189221
if (isSingleValueType) {
190222
colTypes.clear();
191223
labels.clear();
192224
colTypes.push_back(maxValueType);
193225
}
194-
if (isMatrix) {
226+
if (isMatrix)
195227
return FileMetaData(row, numCols, true, {maxValueType}, {});
196-
}
197-
FileMetaData fmd = FileMetaData(row, numCols, isSingleValueType, colTypes, labels);
198-
return fmd;
199-
}
200-
201-
void readCsvLine(File* file, size_t row, char delim, size_t numCols, uint8_t **rawCols, ValueTypeCode* colTypes, bool genTypes = true) {
202-
size_t pos, col = 0;
203-
ValueTypeCode colType = ValueTypeCode::INVALID;
204-
while (1) {
205-
if (colTypes != nullptr){
206-
colType = colTypes[col];
207-
}else if(!genTypes){
208-
throw std::runtime_error("ReadCsvFile::apply: colTypes must be provided if genTypes is false");
209-
}else{ // set colTypes to most specific value type possible
210-
colTypes= new ValueTypeCode[numCols];
211-
for (size_t i = 0; i < numCols; i++) {
212-
colTypes[i] = ValueTypeCode::SI8;
213-
}
214-
}
215-
if (genTypes){
216-
colType = inferValueType(file->line, pos, delim);
217-
if (generality(colType) > generality(colTypes[col])) {
218-
colTypes[col] = colType;
219-
}
220-
}
221-
switch (colTypes[col]) {
222-
case ValueTypeCode::SI8:
223-
int8_t val_si8;
224-
convertCstr(file->line + pos, &val_si8);
225-
reinterpret_cast<int8_t *>(rawCols[col])[row] = val_si8;
226-
break;
227-
case ValueTypeCode::SI32:
228-
int32_t val_si32;
229-
convertCstr(file->line + pos, &val_si32);
230-
reinterpret_cast<int32_t *>(rawCols[col])[row] = val_si32;
231-
break;
232-
case ValueTypeCode::SI64:
233-
int64_t val_si64;
234-
convertCstr(file->line + pos, &val_si64);
235-
reinterpret_cast<int64_t *>(rawCols[col])[row] = val_si64;
236-
break;
237-
case ValueTypeCode::UI8:
238-
uint8_t val_ui8;
239-
convertCstr(file->line + pos, &val_ui8);
240-
reinterpret_cast<uint8_t *>(rawCols[col])[row] = val_ui8;
241-
break;
242-
case ValueTypeCode::UI32:
243-
uint32_t val_ui32;
244-
convertCstr(file->line + pos, &val_ui32);
245-
reinterpret_cast<uint32_t *>(rawCols[col])[row] = val_ui32;
246-
break;
247-
case ValueTypeCode::UI64:
248-
uint64_t val_ui64;
249-
convertCstr(file->line + pos, &val_ui64);
250-
reinterpret_cast<uint64_t *>(rawCols[col])[row] = val_ui64;
251-
break;
252-
case ValueTypeCode::F32:
253-
float val_f32;
254-
convertCstr(file->line + pos, &val_f32);
255-
reinterpret_cast<float *>(rawCols[col])[row] = val_f32;
256-
break;
257-
case ValueTypeCode::F64:
258-
double val_f64;
259-
convertCstr(file->line + pos, &val_f64);
260-
reinterpret_cast<double *>(rawCols[col])[row] = val_f64;
261-
break;
262-
case ValueTypeCode::STR: {
263-
std::string val_str = "";
264-
pos = setCString(file, pos, &val_str, delim);
265-
reinterpret_cast<std::string *>(rawCols[col])[row] = val_str;
266-
break;
267-
}
268-
case ValueTypeCode::FIXEDSTR16: {
269-
std::string val_str = "";
270-
pos = setCString(file, pos, &val_str, delim);
271-
reinterpret_cast<FixedStr16 *>(rawCols[col])[row] = FixedStr16(val_str);
272-
break;
273-
}
274-
default:
275-
throw std::runtime_error("ReadCsvFile::apply: unknown value type code");
276-
}
277-
278-
if (++col >= numCols) {
279-
break;
280-
}
281-
282-
// TODO We could even exploit the fact that the strtoX functions
283-
// can return a pointer to the first character after the parsed
284-
// input, then we wouldn't have to search for that ourselves,
285-
// just would need to check if it is really the delimiter.
286-
while (file->line[pos] != delim)
287-
pos++;
288-
pos++; // skip delimiter
289-
}
228+
return FileMetaData(row, numCols, isSingleValueType, colTypes, labels);
290229
}
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
-5,"hello world!!!",true, 0, -0,"line1
2-
line2",
1+
-5,"hello world!!!!!",true, 0, -0,"line1
2+
line2"
33
1,-115,-1, -2.4, 256, "\"\"\\"

0 commit comments

Comments
 (0)