1818
1919package org .apache .paimon .format .csv ;
2020
21+ import org .apache .paimon .annotation .VisibleForTesting ;
2122import org .apache .paimon .casting .CastExecutor ;
2223import org .apache .paimon .casting .CastExecutors ;
2324import org .apache .paimon .data .BinaryString ;
2728import org .apache .paimon .types .DataTypeRoot ;
2829import org .apache .paimon .types .DataTypes ;
2930import org .apache .paimon .types .RowType ;
31+ import org .apache .paimon .utils .Pair ;
3032
3133import javax .annotation .Nullable ;
3234
3537import java .util .Map ;
3638import java .util .concurrent .ConcurrentHashMap ;
3739
40+ import static org .apache .paimon .format .csv .CsvOptions .Mode .DROPMALFORMED ;
41+ import static org .apache .paimon .format .csv .CsvOptions .Mode .PERMISSIVE ;
3842import static org .apache .paimon .utils .Preconditions .checkArgument ;
3943import static org .apache .paimon .utils .StringUtils .isNullOrWhitespaceOnly ;
4044
@@ -44,6 +48,7 @@ public class CsvParser {
4448 private static final Base64 .Decoder BASE64_DECODER = Base64 .getDecoder ();
4549 private static final Map <String , CastExecutor <?, ?>> CAST_EXECUTOR_CACHE =
4650 new ConcurrentHashMap <>();
51+ private static final int NUMBER_PARSE_RADIX = 10 ;
4752
4853 private final RowType dataSchemaRowType ;
4954 private final int [] projectMapping ;
@@ -157,20 +162,28 @@ public GenericRow parse(String line) {
157162 for (int i = 0 ; i < projectMapping .length ; i ++) {
158163 int ordinal = projectMapping [i ];
159164 DataType type = dataSchemaRowType .getTypeAt (ordinal );
160- Object field = null ;
165+ Pair <Boolean , Object > parseResult = null ;
166+ Exception exception = null ;
167+ String parseValue = rowValues [ordinal ];
161168 try {
162- field = parseField (rowValues [ ordinal ] , type );
169+ parseResult = parseField (parseValue , type );
163170 } catch (Exception e ) {
164- switch (mode ) {
165- case PERMISSIVE :
166- break ;
167- case DROPMALFORMED :
168- return null ;
169- case FAILFAST :
170- throw e ;
171- }
171+ exception = e ;
172+ }
173+ if (parseResult != null && parseResult .getLeft ()) {
174+ row .setField (i , parseResult .getValue ());
175+ } else if (mode == PERMISSIVE
176+ && (parseResult == null || !parseResult .getLeft () || exception != null )) {
177+ break ;
178+ } else if (mode == DROPMALFORMED
179+ && (parseResult == null || !parseResult .getLeft () || exception != null )) {
180+ return null ;
181+ } else if (exception != null ) {
182+ throw new RuntimeException (exception );
183+ } else if (parseResult == null
184+ || !parseResult .getLeft () && parseResult .getValue () == null ) {
185+ throw new NumberFormatException ("For input string: \" " + parseValue + "\" " );
172186 }
173- row .setField (i , field );
174187 }
175188 return row ;
176189 }
@@ -188,35 +201,46 @@ private static boolean isAllWhitespace(CharSequence sequence) {
188201 return true ;
189202 }
190203
191- private Object parseField (String field , DataType dataType ) {
204+ @ VisibleForTesting
205+ public Pair <Boolean , Object > parseField (String field , DataType dataType ) {
192206 if (field == null || field .equals (nullLiteral )) {
193- return null ;
207+ return Pair . of ( true , null ) ;
194208 }
195209
196210 DataTypeRoot typeRoot = dataType .getTypeRoot ();
197211 switch (typeRoot ) {
198212 case TINYINT :
199- return Byte .parseByte (field );
213+ Integer intVal = parseInt (field );
214+ if (intVal == null || intVal > Byte .MAX_VALUE || intVal < Byte .MIN_VALUE ) {
215+ return Pair .of (false , null );
216+ }
217+ return Pair .of (true , intVal .byteValue ());
200218 case SMALLINT :
201- return Short .parseShort (field );
219+ intVal = parseInt (field );
220+ if (intVal == null || intVal > Short .MAX_VALUE || intVal < Short .MIN_VALUE ) {
221+ return Pair .of (false , null );
222+ }
223+ return Pair .of (true , intVal .shortValue ());
202224 case INTEGER :
203- return Integer .parseInt (field );
225+ intVal = parseInt (field );
226+ return Pair .of (intVal != null , intVal );
204227 case BIGINT :
205- return Long .parseLong (field );
228+ Long longVal = parseLong (field );
229+ return Pair .of (longVal != null , longVal );
206230 case FLOAT :
207- return Float .parseFloat (field );
231+ return Pair . of ( true , Float .parseFloat (field ) );
208232 case DOUBLE :
209- return Double .parseDouble (field );
233+ return Pair . of ( true , Double .parseDouble (field ) );
210234 case BOOLEAN :
211- return Boolean .parseBoolean (field );
235+ return Pair . of ( true , Boolean .parseBoolean (field ) );
212236 case CHAR :
213237 case VARCHAR :
214- return BinaryString .fromString (field );
238+ return Pair . of ( true , BinaryString .fromString (field ) );
215239 case BINARY :
216240 case VARBINARY :
217- return BASE64_DECODER .decode (field );
241+ return Pair . of ( true , BASE64_DECODER .decode (field ) );
218242 default :
219- return parseByCastExecutor (field , dataType );
243+ return Pair . of ( true , parseByCastExecutor (field , dataType ) );
220244 }
221245 }
222246
@@ -233,4 +257,97 @@ private Object parseByCastExecutor(String field, DataType dataType) {
233257 }
234258 return BinaryString .fromString (field );
235259 }
260+
261+ private static Integer parseInt (String s ) {
262+ if (s == null || s .isEmpty ()) {
263+ return null ;
264+ }
265+ int len = s .length ();
266+ int i = 0 ;
267+ char firstChar = s .charAt (0 );
268+ boolean negative = false ;
269+ int limit = -Integer .MAX_VALUE ;
270+
271+ if (firstChar < '0' ) {
272+ if (firstChar == '-' ) {
273+ negative = true ;
274+ limit = Integer .MIN_VALUE ;
275+ } else if (firstChar != '+' ) {
276+ return null ;
277+ }
278+
279+ if (len == 1 ) {
280+ return null ;
281+ }
282+ i ++;
283+ }
284+
285+ int multmin = limit / NUMBER_PARSE_RADIX ;
286+ int result = 0 ;
287+ int digit ;
288+
289+ while (i < len ) {
290+ digit = Character .digit (s .charAt (i ++), NUMBER_PARSE_RADIX );
291+ if (digit < 0 ) {
292+ return null ;
293+ }
294+ if (result < multmin ) {
295+ return null ;
296+ }
297+ result *= NUMBER_PARSE_RADIX ;
298+ if (result < limit + digit ) {
299+ return null ;
300+ }
301+ result -= digit ;
302+ }
303+
304+ return negative ? result : -result ;
305+ }
306+
307+ private static Long parseLong (String s ) {
308+ if (s == null || s .isEmpty ()) {
309+ return null ;
310+ }
311+ int len = s .length ();
312+ int i = 0 ;
313+ char firstChar = s .charAt (0 );
314+ boolean negative = false ;
315+ long limit = -Long .MAX_VALUE ;
316+
317+ if (firstChar < '0' ) {
318+ if (firstChar == '-' ) {
319+ negative = true ;
320+ limit = Long .MIN_VALUE ;
321+ } else if (firstChar != '+' ) {
322+ return null ;
323+ }
324+
325+ if (len == 1 ) {
326+ return null ;
327+ }
328+ i ++;
329+ }
330+
331+ long multmin = limit / NUMBER_PARSE_RADIX ;
332+ long result = 0 ;
333+ int digit ;
334+
335+ while (i < len ) {
336+ digit = Character .digit (s .charAt (i ++), NUMBER_PARSE_RADIX );
337+ if (digit < 0 ) {
338+ return null ;
339+ }
340+
341+ if (result < multmin ) {
342+ return null ;
343+ }
344+ result *= NUMBER_PARSE_RADIX ;
345+ if (result < limit + digit ) {
346+ return null ;
347+ }
348+ result -= digit ;
349+ }
350+
351+ return negative ? result : -result ;
352+ }
236353}
0 commit comments