Binary readers update

Baunsgaard · Baunsgaard · commit 6744bf8b4590 · 2025-01-13T21:35:46.000+01:00
diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderBinaryBlock.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderBinaryBlock.java
@@ -31,6 +31,8 @@
 import org.apache.sysds.conf.ConfigurationManager;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.frame.data.FrameBlock;
+import org.apache.sysds.runtime.frame.data.columns.ArrayWrapper;
+import org.apache.sysds.runtime.frame.data.columns.DDCArray;
 
 /**
  * Single-threaded frame binary block reader.
@@ -58,6 +60,9 @@ public final FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, Stri
 
 		// core read (sequential/parallel)
 		readBinaryBlockFrameFromHDFS(path, job, fs, ret, rlen, clen);
+		
+		readBinaryDictionariesFromHDFS(new Path(fname + ".dict"), job, fs, ret);
+
 		return ret;
 	}
 
@@ -114,6 +119,29 @@ protected static void readBinaryBlockFrameFromSequenceFile(Path path, JobConf jo
 		}
 	}
 
+	protected static void readBinaryDictionariesFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock ret) {
+		try{
+			if(fs.exists(path)){
+				LongWritable key = new LongWritable();
+				ArrayWrapper value = new ArrayWrapper(null);
+				SequenceFile.Reader reader = new SequenceFile.Reader(job, SequenceFile.Reader.file(path));
+				try{
+					while(reader.next(key,value)){
+						int colId = (int)key.get();
+						DDCArray<?> a = (DDCArray<?>) ret.getColumn(colId);
+						ret.setColumn(colId, a.setDict(value._a));
+					}
+				}
+				finally{
+					IOUtilFunctions.closeSilently(reader);
+				}
+			}
+		}
+		catch(IOException e){
+			throw new DMLRuntimeException("Failed to read Frame Dictionaries", e);
+		}
+	}
+
 	/**
 	 * Specific functionality of FrameReaderBinaryBlock, mostly used for testing.
 	 * 
@@ -143,4 +171,7 @@ public FrameBlock readFirstBlock(String fname) throws IOException {
 
 		return value;
 	}
+
+
+
 }
diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameWriterBinaryBlock.java b/src/main/java/org/apache/sysds/runtime/io/FrameWriterBinaryBlock.java
@@ -20,6 +20,8 @@
 package org.apache.sysds.runtime.io;
 
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
 
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -29,6 +31,10 @@
 import org.apache.sysds.conf.ConfigurationManager;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.frame.data.FrameBlock;
+import org.apache.sysds.runtime.frame.data.columns.Array;
+import org.apache.sysds.runtime.frame.data.columns.ArrayWrapper;
+import org.apache.sysds.runtime.frame.data.columns.DDCArray;
+import org.apache.sysds.runtime.matrix.data.Pair;
 import org.apache.sysds.runtime.util.HDFSTool;
 
 /**
@@ -43,30 +49,67 @@ public final void writeFrameToHDFS(FrameBlock src, String fname, long rlen, long
 		// prepare file access
 		JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
 		Path path = new Path(fname);
-
+		
 		// if the file already exists on HDFS, remove it.
 		HDFSTool.deleteFileIfExistOnHDFS(fname);
-
+		HDFSTool.deleteFileIfExistOnHDFS(fname + ".dict");
+		
 		// bound check for src block
 		if(src.getNumRows() > rlen || src.getNumColumns() > clen) {
 			throw new IOException("Frame block [1:" + src.getNumRows() + ",1:" + src.getNumColumns() + "] "
 				+ "out of overall frame range [1:" + rlen + ",1:" + clen + "].");
 		}
 
+		Pair<List<Pair<Integer,Array<?>>>, FrameBlock> prep = extractDictionaries(src);
+		src = prep.getValue();
+
 		// write binary block to hdfs (sequential/parallel)
-		writeBinaryBlockFrameToHDFS(path, job, src, rlen, clen);
+		writeBinaryBlockFrameToHDFS(path, job, prep.getValue(), rlen, clen);
+
+		if(prep.getKey().size() > 0)
+			writeBinaryBlockDictsToSequenceFile(new Path(fname + ".dict"), job, prep.getKey());
+		
+	}
+
+	protected Pair<List<Pair<Integer,Array<?>>>, FrameBlock> extractDictionaries(FrameBlock src){
+		List<Pair<Integer,Array<?>>> dicts = new ArrayList<>();
+		int blen = ConfigurationManager.getBlocksize();
+		if(src.getNumRows() < blen )
+			return new Pair<>(dicts, src);
+		boolean modified = false;
+		for(int i = 0; i < src.getNumColumns(); i++){
+			Array<?> a = src.getColumn(i);
+			if(a instanceof DDCArray){
+				DDCArray<?> d = (DDCArray<?>)a;
+				dicts.add(new Pair<>(i, d.getDict()));
+				if(modified == false){
+					modified = true;
+					// make sure other users of this frame are not affected
+				   src = src.copyShallow(); 
+				}
+				src.setColumn(i, d.nullDict());
+			}
+		} 
+		return new Pair<>(dicts, src);
 	}
 
 	protected void writeBinaryBlockFrameToHDFS(Path path, JobConf job, FrameBlock src, long rlen, long clen)
 		throws IOException, DMLRuntimeException {
 		FileSystem fs = IOUtilFunctions.getFileSystem(path);
 		int blen = ConfigurationManager.getBlocksize();
-
+		
 		// sequential write to single file
 		writeBinaryBlockFrameToSequenceFile(path, job, fs, src, blen, 0, (int) rlen);
 		IOUtilFunctions.deleteCrcFilesFromLocalFileSystem(fs, path);
 	}
 
+	protected void writeBinaryBlockDictsToSequenceFile(Path path, JobConf job, List<Pair<Integer, Array<?>>> dicts)
+		throws IOException, DMLRuntimeException {
+		FileSystem fs = IOUtilFunctions.getFileSystem(path);
+		writeBinaryBlockDictsToSequenceFile(path, job, fs, dicts);
+		IOUtilFunctions.deleteCrcFilesFromLocalFileSystem(fs, path);
+	}
+
 	/**
 	 * Internal primitive to write a block-aligned row range of a frame to a single sequence file, which is used for both
 	 * single- and multi-threaded writers (for consistency).
@@ -111,4 +154,20 @@ protected static void writeBinaryBlockFrameToSequenceFile(Path path, JobConf job
 			IOUtilFunctions.closeSilently(writer);
 		}
 	}
+
+	protected static void writeBinaryBlockDictsToSequenceFile(Path path, JobConf job, FileSystem fs, List<Pair<Integer,Array<?>>> dicts) throws IOException{
+		final Writer writer = IOUtilFunctions.getSeqWriterArray(path, job, 1);
+		try{
+			LongWritable index = new LongWritable();
+
+			for(int i = 0; i < dicts.size(); i++){
+				Pair<Integer, Array<?>> p = dicts.get(i);
+				index.set(p.getKey());
+				writer.append(index, new ArrayWrapper(p.getValue()));
+			}
+		}
+		finally {
+			IOUtilFunctions.closeSilently(writer);
+		}
+	}
 }
diff --git a/src/main/java/org/apache/sysds/runtime/io/IOUtilFunctions.java b/src/main/java/org/apache/sysds/runtime/io/IOUtilFunctions.java
@@ -72,10 +72,10 @@
 import org.apache.sysds.runtime.data.TensorBlock;
 import org.apache.sysds.runtime.data.TensorIndexes;
 import org.apache.sysds.runtime.frame.data.FrameBlock;
+import org.apache.sysds.runtime.frame.data.columns.ArrayWrapper;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.matrix.data.MatrixCell;
 import org.apache.sysds.runtime.matrix.data.MatrixIndexes;
-import org.apache.sysds.runtime.transform.TfUtils;
 import org.apache.sysds.runtime.util.LocalFileUtils;
 
 import io.airlift.compress.lzo.LzoCodec;
@@ -242,6 +242,29 @@ public static String[] splitCSV(String str, String delim){
 		return tokens.toArray(new String[0]);
 	}
 
+	public static String[] splitCSV(String str, String delim, int clen){
+		if(str == null || str.isEmpty())
+			return new String[] {""};
+
+		int from = 0, to = 0;
+		final int len = str.length();
+		final int delimLen = delim.length();
+
+		final String[] tokens = new String[clen];
+		int c = 0;
+		while(from < len) { // for all tokens
+			to = getTo(str, from, delim, len, delimLen);
+			tokens[c++] = str.substring(from, to);
+			from = to + delimLen;
+		}
+
+		// handle empty string at end
+		if(from == len)
+			tokens[c++] = "";
+
+		return tokens;
+	}
+
 	/**
 	 * Splits a string by a specified delimiter into all tokens, including empty
 	 * while respecting the rules for quotes and escapes defined in RFC4180,
@@ -346,7 +369,7 @@ private static boolean isEmptyMatch(final String str, final int from, final Stri
 	 * @param dLen  The length of the delimiter string
 	 * @return The next index.
 	 */
-	private static int getTo(final String str, final int from, final String delim,
+	public static int getTo(final String str, final int from, final String delim,
 		final int len, final int dLen) {
 		final char cq = CSV_QUOTE_CHAR;
 		final int fromP1 = from + 1;
@@ -404,17 +427,32 @@ private static int getToNoQuoteCharDelim(final String str, final int from, final
 	}
 
 	public static String trim(String str) {
+		final int len = str.length();
+		if(len == 0)
+			return str;
+		return trim(str, len);
+	}
+
+	/**
+	 * The caller must ensure that the string contains at least one character.
+	 * 
+	 * @param str string to trim
+	 * @param len length of string
+	 * @return the trimmed string, or null if str is null.
+	 */
+	public static String trim(final String str, final int len) {
 		try{
-			final int len = str.length();
-			if(len == 0)
-				return str;
 			// short the call to return input if not whitespace in ends.
-			else if(str.charAt(0) <= ' ' || str.charAt(len -1) <= ' ')
+			if(str.charAt(0) <= ' ' || str.charAt(len -1) <= ' ')
 				return str.trim();
 			else 
 				return str;
-		}catch(Exception e){
-			throw new RuntimeException("failed trimming: " + str + " " + str.length(),e);
+		}
+		catch(NullPointerException e){
+			return null;
+		}
+		catch(Exception e){
+			throw new RuntimeException("failed trimming: " + str + " " + str.length(), e);
 		}
 	}
 
@@ -657,10 +695,10 @@ public static int countNumColumnsCSV(InputSplit[] splits, InputFormat informat,
 			try {
 				if( reader.next(key, value) ) {
 					boolean hasValue = true;
-					if( value.toString().startsWith(TfUtils.TXMTD_MVPREFIX) )
-						hasValue = reader.next(key, value);
-					if( value.toString().startsWith(TfUtils.TXMTD_NDPREFIX) )
-						hasValue = reader.next(key, value);
+					// if( value.toString().startsWith(TfUtils.TXMTD_MVPREFIX) )
+					// 	hasValue = reader.next(key, value);
+					// if( value.toString().startsWith(TfUtils.TXMTD_NDPREFIX) )
+					// 	hasValue = reader.next(key, value);
 					String row = value.toString().trim();
 					if( hasValue && !row.isEmpty() ) {
 						ncol = IOUtilFunctions.countTokensCSV(row, delim);
@@ -901,6 +939,13 @@ public static Writer getSeqWriterFrame(Path path, Configuration job, int replica
 			Writer.replication((short) (replication > 0 ? replication : 1)));
 	}
 
+	public static Writer getSeqWriterArray(Path path, Configuration job, int replication) throws IOException {
+		return SequenceFile.createWriter(job, Writer.file(path), Writer.bufferSize(4096),
+			Writer.keyClass(LongWritable.class), Writer.valueClass(ArrayWrapper.class),
+			Writer.compression(getCompressionEncodingType(), getCompressionCodec()),
+			Writer.replication((short) (replication > 0 ? replication : 1)));
+	}
+
 	public static Writer getSeqWriterTensor(Path path, Configuration job, int replication) throws IOException {
 		return SequenceFile.createWriter(job, Writer.file(path), Writer.bufferSize(4096),
 		Writer.replication((short) (replication > 0 ? replication : 1)),