|
72 | 72 | import org.apache.sysds.runtime.data.TensorBlock; |
73 | 73 | import org.apache.sysds.runtime.data.TensorIndexes; |
74 | 74 | import org.apache.sysds.runtime.frame.data.FrameBlock; |
| 75 | +import org.apache.sysds.runtime.frame.data.columns.ArrayWrapper; |
75 | 76 | import org.apache.sysds.runtime.matrix.data.MatrixBlock; |
76 | 77 | import org.apache.sysds.runtime.matrix.data.MatrixCell; |
77 | 78 | import org.apache.sysds.runtime.matrix.data.MatrixIndexes; |
78 | | -import org.apache.sysds.runtime.transform.TfUtils; |
79 | 79 | import org.apache.sysds.runtime.util.LocalFileUtils; |
80 | 80 |
|
81 | 81 | import io.airlift.compress.lzo.LzoCodec; |
@@ -242,6 +242,29 @@ public static String[] splitCSV(String str, String delim){ |
242 | 242 | return tokens.toArray(new String[0]); |
243 | 243 | } |
244 | 244 |
|
| 245 | + public static String[] splitCSV(String str, String delim, int clen){ |
| 246 | + if(str == null || str.isEmpty()) |
| 247 | + return new String[] {""}; |
| 248 | + |
| 249 | + int from = 0, to = 0; |
| 250 | + final int len = str.length(); |
| 251 | + final int delimLen = delim.length(); |
| 252 | + |
| 253 | + final String[] tokens = new String[clen]; |
| 254 | + int c = 0; |
| 255 | + while(from < len) { // for all tokens |
| 256 | + to = getTo(str, from, delim, len, delimLen); |
| 257 | + tokens[c++] = str.substring(from, to); |
| 258 | + from = to + delimLen; |
| 259 | + } |
| 260 | + |
| 261 | + // handle empty string at end |
| 262 | + if(from == len) |
| 263 | + tokens[c++] = ""; |
| 264 | + |
| 265 | + return tokens; |
| 266 | + } |
| 267 | + |
245 | 268 | /** |
246 | 269 | * Splits a string by a specified delimiter into all tokens, including empty |
247 | 270 | * while respecting the rules for quotes and escapes defined in RFC4180, |
@@ -346,7 +369,7 @@ private static boolean isEmptyMatch(final String str, final int from, final Stri |
346 | 369 | * @param dLen The length of the delimiter string |
347 | 370 | * @return The next index. |
348 | 371 | */ |
349 | | - private static int getTo(final String str, final int from, final String delim, |
| 372 | + public static int getTo(final String str, final int from, final String delim, |
350 | 373 | final int len, final int dLen) { |
351 | 374 | final char cq = CSV_QUOTE_CHAR; |
352 | 375 | final int fromP1 = from + 1; |
@@ -404,17 +427,32 @@ private static int getToNoQuoteCharDelim(final String str, final int from, final |
404 | 427 | } |
405 | 428 |
|
406 | 429 | public static String trim(String str) { |
| 430 | + final int len = str.length(); |
| 431 | + if(len == 0) |
| 432 | + return str; |
| 433 | + return trim(str, len); |
| 434 | + } |
| 435 | + |
| 436 | + /** |
| 437 | + * Caller must have a string of at least 1 character length. |
| 438 | + * |
| 439 | + * @param str string to trim |
| 440 | + * @param len length of string |
| 441 | + * @return the trimmed string. |
| 442 | + */ |
| 443 | + public static String trim(final String str, final int len) { |
407 | 444 | try{ |
408 | | - final int len = str.length(); |
409 | | - if(len == 0) |
410 | | - return str; |
411 | 445 | // short the call to return input if not whitespace in ends. |
412 | | - else if(str.charAt(0) <= ' ' || str.charAt(len -1) <= ' ') |
| 446 | + if(str.charAt(0) <= ' ' || str.charAt(len -1) <= ' ') |
413 | 447 | return str.trim(); |
414 | 448 | else |
415 | 449 | return str; |
416 | | - }catch(Exception e){ |
417 | | - throw new RuntimeException("failed trimming: " + str + " " + str.length(),e); |
| 450 | + } |
| 451 | + catch(NullPointerException e){ |
| 452 | + return null; |
| 453 | + } |
| 454 | + catch(Exception e){ |
| 455 | + throw new RuntimeException("failed trimming: " + str + " " + str.length(), e); |
418 | 456 | } |
419 | 457 | } |
420 | 458 |
|
@@ -657,10 +695,10 @@ public static int countNumColumnsCSV(InputSplit[] splits, InputFormat informat, |
657 | 695 | try { |
658 | 696 | if( reader.next(key, value) ) { |
659 | 697 | boolean hasValue = true; |
660 | | - if( value.toString().startsWith(TfUtils.TXMTD_MVPREFIX) ) |
661 | | - hasValue = reader.next(key, value); |
662 | | - if( value.toString().startsWith(TfUtils.TXMTD_NDPREFIX) ) |
663 | | - hasValue = reader.next(key, value); |
| 698 | + // if( value.toString().startsWith(TfUtils.TXMTD_MVPREFIX) ) |
| 699 | + // hasValue = reader.next(key, value); |
| 700 | + // if( value.toString().startsWith(TfUtils.TXMTD_NDPREFIX) ) |
| 701 | + // hasValue = reader.next(key, value); |
664 | 702 | String row = value.toString().trim(); |
665 | 703 | if( hasValue && !row.isEmpty() ) { |
666 | 704 | ncol = IOUtilFunctions.countTokensCSV(row, delim); |
@@ -901,6 +939,13 @@ public static Writer getSeqWriterFrame(Path path, Configuration job, int replica |
901 | 939 | Writer.replication((short) (replication > 0 ? replication : 1))); |
902 | 940 | } |
903 | 941 |
|
| 942 | + public static Writer getSeqWriterArray(Path path, Configuration job, int replication) throws IOException { |
| 943 | + return SequenceFile.createWriter(job, Writer.file(path), Writer.bufferSize(4096), |
| 944 | + Writer.keyClass(LongWritable.class), Writer.valueClass(ArrayWrapper.class), |
| 945 | + Writer.compression(getCompressionEncodingType(), getCompressionCodec()), |
| 946 | + Writer.replication((short) (replication > 0 ? replication : 1))); |
| 947 | + } |
| 948 | + |
904 | 949 | public static Writer getSeqWriterTensor(Path path, Configuration job, int replication) throws IOException { |
905 | 950 | return SequenceFile.createWriter(job, Writer.file(path), Writer.bufferSize(4096), |
906 | 951 | Writer.replication((short) (replication > 0 ? replication : 1)), |
|
0 commit comments