Skip to content

Commit 184b95a

Browse files
authored
[format] csv: optimize parsing of short, byte, int and long (#6856)
1 parent ad757d2 commit 184b95a

File tree

2 files changed

+176
-23
lines changed

2 files changed

+176
-23
lines changed

paimon-format/src/main/java/org/apache/paimon/format/csv/CsvParser.java

Lines changed: 140 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
package org.apache.paimon.format.csv;
2020

21+
import org.apache.paimon.annotation.VisibleForTesting;
2122
import org.apache.paimon.casting.CastExecutor;
2223
import org.apache.paimon.casting.CastExecutors;
2324
import org.apache.paimon.data.BinaryString;
@@ -27,6 +28,7 @@
2728
import org.apache.paimon.types.DataTypeRoot;
2829
import org.apache.paimon.types.DataTypes;
2930
import org.apache.paimon.types.RowType;
31+
import org.apache.paimon.utils.Pair;
3032

3133
import javax.annotation.Nullable;
3234

@@ -35,6 +37,8 @@
3537
import java.util.Map;
3638
import java.util.concurrent.ConcurrentHashMap;
3739

40+
import static org.apache.paimon.format.csv.CsvOptions.Mode.DROPMALFORMED;
41+
import static org.apache.paimon.format.csv.CsvOptions.Mode.PERMISSIVE;
3842
import static org.apache.paimon.utils.Preconditions.checkArgument;
3943
import static org.apache.paimon.utils.StringUtils.isNullOrWhitespaceOnly;
4044

@@ -44,6 +48,7 @@ public class CsvParser {
4448
private static final Base64.Decoder BASE64_DECODER = Base64.getDecoder();
4549
private static final Map<String, CastExecutor<?, ?>> CAST_EXECUTOR_CACHE =
4650
new ConcurrentHashMap<>();
51+
private static final int NUMBER_PARSE_RADIX = 10;
4752

4853
private final RowType dataSchemaRowType;
4954
private final int[] projectMapping;
@@ -157,20 +162,28 @@ public GenericRow parse(String line) {
157162
for (int i = 0; i < projectMapping.length; i++) {
158163
int ordinal = projectMapping[i];
159164
DataType type = dataSchemaRowType.getTypeAt(ordinal);
160-
Object field = null;
165+
Pair<Boolean, Object> parseResult = null;
166+
Exception exception = null;
167+
String parseValue = rowValues[ordinal];
161168
try {
162-
field = parseField(rowValues[ordinal], type);
169+
parseResult = parseField(parseValue, type);
163170
} catch (Exception e) {
164-
switch (mode) {
165-
case PERMISSIVE:
166-
break;
167-
case DROPMALFORMED:
168-
return null;
169-
case FAILFAST:
170-
throw e;
171-
}
171+
exception = e;
172+
}
173+
if (parseResult != null && parseResult.getLeft()) {
174+
row.setField(i, parseResult.getValue());
175+
} else if (mode == PERMISSIVE
176+
&& (parseResult == null || !parseResult.getLeft() || exception != null)) {
177+
break;
178+
} else if (mode == DROPMALFORMED
179+
&& (parseResult == null || !parseResult.getLeft() || exception != null)) {
180+
return null;
181+
} else if (exception != null) {
182+
throw new RuntimeException(exception);
183+
} else if (parseResult == null
184+
|| !parseResult.getLeft() && parseResult.getValue() == null) {
185+
throw new NumberFormatException("For input string: \"" + parseValue + "\"");
172186
}
173-
row.setField(i, field);
174187
}
175188
return row;
176189
}
@@ -188,35 +201,46 @@ private static boolean isAllWhitespace(CharSequence sequence) {
188201
return true;
189202
}
190203

191-
private Object parseField(String field, DataType dataType) {
204+
@VisibleForTesting
205+
public Pair<Boolean, Object> parseField(String field, DataType dataType) {
192206
if (field == null || field.equals(nullLiteral)) {
193-
return null;
207+
return Pair.of(true, null);
194208
}
195209

196210
DataTypeRoot typeRoot = dataType.getTypeRoot();
197211
switch (typeRoot) {
198212
case TINYINT:
199-
return Byte.parseByte(field);
213+
Integer intVal = parseInt(field);
214+
if (intVal == null || intVal > Byte.MAX_VALUE || intVal < Byte.MIN_VALUE) {
215+
return Pair.of(false, null);
216+
}
217+
return Pair.of(true, intVal.byteValue());
200218
case SMALLINT:
201-
return Short.parseShort(field);
219+
intVal = parseInt(field);
220+
if (intVal == null || intVal > Short.MAX_VALUE || intVal < Short.MIN_VALUE) {
221+
return Pair.of(false, null);
222+
}
223+
return Pair.of(true, intVal.shortValue());
202224
case INTEGER:
203-
return Integer.parseInt(field);
225+
intVal = parseInt(field);
226+
return Pair.of(intVal != null, intVal);
204227
case BIGINT:
205-
return Long.parseLong(field);
228+
Long longVal = parseLong(field);
229+
return Pair.of(longVal != null, longVal);
206230
case FLOAT:
207-
return Float.parseFloat(field);
231+
return Pair.of(true, Float.parseFloat(field));
208232
case DOUBLE:
209-
return Double.parseDouble(field);
233+
return Pair.of(true, Double.parseDouble(field));
210234
case BOOLEAN:
211-
return Boolean.parseBoolean(field);
235+
return Pair.of(true, Boolean.parseBoolean(field));
212236
case CHAR:
213237
case VARCHAR:
214-
return BinaryString.fromString(field);
238+
return Pair.of(true, BinaryString.fromString(field));
215239
case BINARY:
216240
case VARBINARY:
217-
return BASE64_DECODER.decode(field);
241+
return Pair.of(true, BASE64_DECODER.decode(field));
218242
default:
219-
return parseByCastExecutor(field, dataType);
243+
return Pair.of(true, parseByCastExecutor(field, dataType));
220244
}
221245
}
222246

@@ -233,4 +257,97 @@ private Object parseByCastExecutor(String field, DataType dataType) {
233257
}
234258
return BinaryString.fromString(field);
235259
}
260+
261+
private static Integer parseInt(String s) {
262+
if (s == null || s.isEmpty()) {
263+
return null;
264+
}
265+
int len = s.length();
266+
int i = 0;
267+
char firstChar = s.charAt(0);
268+
boolean negative = false;
269+
int limit = -Integer.MAX_VALUE;
270+
271+
if (firstChar < '0') {
272+
if (firstChar == '-') {
273+
negative = true;
274+
limit = Integer.MIN_VALUE;
275+
} else if (firstChar != '+') {
276+
return null;
277+
}
278+
279+
if (len == 1) {
280+
return null;
281+
}
282+
i++;
283+
}
284+
285+
int multmin = limit / NUMBER_PARSE_RADIX;
286+
int result = 0;
287+
int digit;
288+
289+
while (i < len) {
290+
digit = Character.digit(s.charAt(i++), NUMBER_PARSE_RADIX);
291+
if (digit < 0) {
292+
return null;
293+
}
294+
if (result < multmin) {
295+
return null;
296+
}
297+
result *= NUMBER_PARSE_RADIX;
298+
if (result < limit + digit) {
299+
return null;
300+
}
301+
result -= digit;
302+
}
303+
304+
return negative ? result : -result;
305+
}
306+
307+
private static Long parseLong(String s) {
308+
if (s == null || s.isEmpty()) {
309+
return null;
310+
}
311+
int len = s.length();
312+
int i = 0;
313+
char firstChar = s.charAt(0);
314+
boolean negative = false;
315+
long limit = -Long.MAX_VALUE;
316+
317+
if (firstChar < '0') {
318+
if (firstChar == '-') {
319+
negative = true;
320+
limit = Long.MIN_VALUE;
321+
} else if (firstChar != '+') {
322+
return null;
323+
}
324+
325+
if (len == 1) {
326+
return null;
327+
}
328+
i++;
329+
}
330+
331+
long multmin = limit / NUMBER_PARSE_RADIX;
332+
long result = 0;
333+
int digit;
334+
335+
while (i < len) {
336+
digit = Character.digit(s.charAt(i++), NUMBER_PARSE_RADIX);
337+
if (digit < 0) {
338+
return null;
339+
}
340+
341+
if (result < multmin) {
342+
return null;
343+
}
344+
result *= NUMBER_PARSE_RADIX;
345+
if (result < limit + digit) {
346+
return null;
347+
}
348+
result -= digit;
349+
}
350+
351+
return negative ? result : -result;
352+
}
236353
}

paimon-format/src/test/java/org/apache/paimon/format/csv/CsvFileFormatTest.java

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -534,6 +534,7 @@ public void testCsvModeWriteRead() throws IOException {
534534
rowType,
535535
testFile);
536536
})
537+
.cause()
537538
.isInstanceOf(IllegalArgumentException.class);
538539
}
539540

@@ -569,6 +570,41 @@ public void testSpecialCases() throws IOException {
569570
assertThat(permissiveResult.get(3).getDouble(2)).isEqualTo(400.81);
570571
}
571572

573+
@Test
574+
public void testCsvParserParseField() {
575+
RowType rowType =
576+
DataTypes.ROW(
577+
DataTypes.TINYINT(),
578+
DataTypes.SMALLINT(),
579+
DataTypes.INT(),
580+
DataTypes.BIGINT());
581+
int[] projection = {0, 1, 2, 3};
582+
CsvParser parser = new CsvParser(rowType, projection, new CsvOptions(new Options()));
583+
584+
// Test normal cases
585+
assertThat(parser.parseField("123", DataTypes.INT()).getValue()).isEqualTo(123);
586+
assertThat(parser.parseField("-0", DataTypes.INT()).getValue()).isEqualTo(0);
587+
assertThat(parser.parseField("0", DataTypes.INT()).getValue()).isEqualTo(0);
588+
assertThat(parser.parseField("123", DataTypes.BIGINT()).getValue()).isEqualTo(123L);
589+
assertThat(parser.parseField("-0", DataTypes.BIGINT()).getValue()).isEqualTo(0L);
590+
assertThat(parser.parseField("0", DataTypes.BIGINT()).getValue()).isEqualTo(0L);
591+
assertThat(parser.parseField("123", DataTypes.TINYINT()).getValue()).isEqualTo((byte) 123);
592+
assertThat(parser.parseField("12345", DataTypes.SMALLINT()).getValue())
593+
.isEqualTo((short) 12345);
594+
595+
// Test invalid format
596+
assertThat(parser.parseField("abc", DataTypes.INT()).getValue()).isNull();
597+
assertThat(parser.parseField("12.3", DataTypes.INT()).getValue()).isNull();
598+
599+
// Test overflow
600+
assertThat(parser.parseField("2147483648", DataTypes.INT()).getValue()).isNull();
601+
assertThat(parser.parseField("-2147483649", DataTypes.INT()).getValue()).isNull();
602+
assertThat(parser.parseField("9223372036854775808", DataTypes.BIGINT()).getValue())
603+
.isNull();
604+
assertThat(parser.parseField("128", DataTypes.TINYINT()).getValue()).isNull();
605+
assertThat(parser.parseField("32768", DataTypes.SMALLINT()).getValue()).isNull();
606+
}
607+
572608
private List<InternalRow> read(
573609
FileFormat format, RowType fullRowType, RowType readRowType, Path testFile)
574610
throws IOException {

0 commit comments

Comments
 (0)