Add parse_text_table()

LadyCailin · LadyCailin · commit 80f18e6c15aa · 2021-02-11T19:29:03.000+01:00
diff --git a/src/main/java/com/laytonsmith/core/functions/CompositeFunction.java b/src/main/java/com/laytonsmith/core/functions/CompositeFunction.java
@@ -1,8 +1,10 @@
 package com.laytonsmith.core.functions;
 
 import com.laytonsmith.PureUtilities.Common.StreamUtils;
+import com.laytonsmith.core.MSLog;
 import com.laytonsmith.core.MethodScriptCompiler;
 import com.laytonsmith.core.ParseTree;
+import com.laytonsmith.core.Prefs;
 import com.laytonsmith.core.Script;
 import com.laytonsmith.core.compiler.analysis.ParamDeclaration;
 import com.laytonsmith.core.compiler.analysis.Scope;
@@ -19,6 +21,7 @@
 import com.laytonsmith.core.exceptions.ConfigRuntimeException;
 import com.laytonsmith.core.exceptions.FunctionReturnException;
 import com.laytonsmith.core.natives.interfaces.Mixed;
+import java.io.File;
 
 import java.util.HashMap;
 import java.util.Map;
@@ -40,13 +43,17 @@ public final Mixed exec(Target t, Environment env, Mixed... args) throws ConfigR
 		ParseTree tree;
 		// TODO: Ultimately, this is not scalable. We need to compile and cache these scripts at Java compile time,
 		// not at runtime the first time a function is used. This is an easier first step though.
+		File debugFile = null;
+		if(Prefs.DebugMode()) {
+			debugFile = new File("/NATIVE-MSCRIPT/" + getName());
+		}
 		if(!CACHED_SCRIPTS.containsKey(this.getClass())) {
 			try {
 
 				String script = script();
 				Scope rootScope = new Scope();
 				rootScope.addDeclaration(new ParamDeclaration("@arguments", CArray.TYPE, Target.UNKNOWN));
-				tree = MethodScriptCompiler.compile(MethodScriptCompiler.lex(script, env, null, true),
+				tree = MethodScriptCompiler.compile(MethodScriptCompiler.lex(script, env, debugFile, true),
 						env, env.getEnvClasses(), new StaticAnalysis(rootScope, true))
 						// the root of the tree is null, so go ahead and pull it up
 						.getChildAt(0);
@@ -77,6 +84,10 @@ public final Mixed exec(Target t, Environment env, Mixed... args) throws ConfigR
 		} catch (FunctionReturnException ex) {
 			ret = ex.getReturn();
 		} catch (ConfigRuntimeException ex) {
+			if(Prefs.DebugMode()) {
+				MSLog.GetLogger().e(MSLog.Tags.GENERAL, "Possibly false stacktrace, could be internal error",
+						ex.getTarget());
+			}
 			if(gEnv.GetStackTraceManager().getCurrentStackTrace().isEmpty()) {
 				ex.setTarget(t);
 				ConfigRuntimeException.StackTraceElement ste = new ConfigRuntimeException
diff --git a/src/main/java/com/laytonsmith/core/functions/DataTransformations.java b/src/main/java/com/laytonsmith/core/functions/DataTransformations.java
@@ -525,4 +525,50 @@ public Version since() {
 		}
 
 	}
+
+	/**
+	 * parse_text_table: takes 1 or 2 arguments; docs and script come from getBundledDocs()/getBundledCode().
+	 */
+	@api
+	public static class parse_text_table extends CompositeFunction {
+		@Override
+		public String getName() {
+			return "parse_text_table";
+		}
+
+		@Override
+		public Integer[] numArgs() {
+			return new Integer[]{1, 2};
+		}
+
+		@Override
+		public String docs() {
+			return getBundledDocs();
+		}
+
+		@Override
+		protected String script() {
+			return getBundledCode();
+		}
+
+		@Override
+		public Class<? extends CREThrowable>[] thrown() {
+			return new Class[]{CREFormatException.class};
+		}
+
+		@Override
+		public Version since() {
+			return MSVersion.V3_3_4;
+		}
+
+		@Override
+		public boolean isRestricted() {
+			return false;
+		}
+
+		@Override
+		public Boolean runAsync() {
+			return null;
+		}
+	}
 }
diff --git a/src/main/resources/functionDocs/parse_text_table b/src/main/resources/functionDocs/parse_text_table
@@ -0,0 +1,70 @@
+array {string table, [array options]} Parses tabular data into an array ----
+
+Given a string such as:
+
+<%PRE|
+column1         column2             column3
+-------         -------             -------
+data a1         data a2             data a3
+data b1         data b2             data b3
+%>
+
+this function will parse the data into an associative array:
+
+<%CODE|
+array('column1': array('data a1', 'data b1'), 'column2': array('data a2', 'data b2'), 'column3': array('data a3', 'data b3'))
+%>
+
+The second parameter is an optional array of options, which allows for more flexible input. Every option has a
+default value.
+
+{|
+|-
+! scope="col" width="6%" | Setting
+! scope="col" width="10%" | Type
+! scope="col" width="6%" | Default
+! scope="col" width="78%" | Description
+|-
+| columns
+| array
+| null
+| If the string doesn't have column headings in the first line, these can be provided as an array here. Note that if you
+provide this parameter, columnWidth is a required parameter as well.
+|-
+| columnWidth
+| array
+| null
+| For data that isn't consistently formatted, you may need to provide your own values for the column widths. Normally,
+this is calculated automatically based on the first and second lines, but if those don't match the data, or aren't
+provided, you need to provide this manually. This should be an array the same size as the columns option, or one
+element shorter, and should contain the width of each column, optionally skipping the last.
+For instance, in the example table shown above, the width should be array(16, 20) or array(16, 20, 7). If the last value
+is skipped, this means "the rest of the line".
+|-
+| tabWidth
+| int
+| 4
+| Before converting the data, each tab is expanded to spaces up to the next multiple of the tab width. For instance, if the
+line of data is <pre>"a\tb\tc"</pre> then this will be converted to <pre>"a   b   c"</pre>, and then the column width
+data is used. In cases where data is separated using exclusively spaces this setting won't matter, as the column width
+and data should line up in any case. However, if tabs are used, it may misformat depending on the tab width assumptions
+that the data originated from. If you can control the data, it is more reliable to output data using spaces rather than
+tabs, or use a tab width of 4.
+|-
+| skipEmptyLines
+| boolean
+| true
+| If true, empty lines are totally skipped. If false, blank lines will add empty strings to all the columns in their place.
+|}
+
+When using the automatic column width detection, it isn't required to have any particular character used as the header separator
+in the second line. Nor is it required to fill the line. It's merely required to have one or more spaces between each
+column, and then the column width is measured between the start of each character sequence. For instance, the following table
+would be properly parsed as well:
+
+<%PRE|
+column1    column2    column3
+---------- -------    -
+a          b          c
+d          e          f
+%>
diff --git a/src/main/resources/function_impl/parse_text_table.ms b/src/main/resources/function_impl/parse_text_table.ms
@@ -0,0 +1,133 @@
+// Implementation of parse_text_table(table, [options]).
+//
+// Parses fixed-width tabular text into an associative array that maps each column
+// name to the array of trimmed cell values found in that column. @arguments[0] is
+// the table text; @arguments[1] is the optional options array (columns, columnWidth,
+// tabWidth, skipEmptyLines). Throws FormatException when columnWidth does not fit
+// columns, or when the header lines are needed but missing.
+string @data = @arguments[0];
+array @options = array_get(@arguments, 1, associative_array());
+
+array @ret = associative_array();
+array @columns = array_get(@options, 'columns', array())[];
+array @columnWidth = array_get(@options, 'columnWidth', array())[];
+int @tabWidth = array_get(@options, 'tabWidth', 4);
+boolean @skipEmptyLines = array_get(@options, 'skipEmptyLines', true);
+array @columnNames = array();
+
+// The two-character line endings must come first in the alternation, otherwise "\n\r"
+// is split on its "\n" alone and the "\r" is left at the start of the next line.
+array @lines = reg_split('\r\n|\n\r|\n', @data);
+
+// Returns @line with every tab replaced by spaces up to the next multiple of @tabWidth.
+closure @normalizeLine = iclosure(@line, @tabWidth) {
+	@output = "";
+	for(@i = 0, @i < length(@line), @i++) {
+		if(@line[@i] != '\t') {
+			@output .= @line[@i];
+		} else {
+			@output .= string_multiply(" ", @tabWidth - (length(@output) % @tabWidth));
+		}
+	}
+	return(@output);
+};
+
+// Cuts @line into one string per entry of @columnWidths. An infinite width, or a width
+// that runs past the end of the line, takes the rest of the line.
+closure @splitLine = iclosure(@line, @columnWidths) {
+	array @sections = array();
+	@lastX = 0;
+	foreach(@width in @columnWidths) {
+		try {
+			if(@width == math_const('INFINITY') || @lastX + @width > length(@line)) {
+				@sections[] = substr(@line, @lastX);
+			} else {
+				@sections[] = substr(@line, @lastX, @lastX + @width);
+			}
+		} catch (Exception @ex) {
+			// The line ends before the last column, fill with empty cells.
+			@sections[] = "";
+		}
+		@lastX += @width;
+	}
+	return(@sections);
+};
+
+if(array_size(@columns) != 0) {
+	// Column names were supplied, so every line of the input is data.
+	if(array_size(@columnWidth) < array_size(@columns)) {
+		// The last width may be omitted, meaning "the rest of the line".
+		@columnWidth[] = math_const('INFINITY');
+	}
+	if(array_size(@columnWidth) != array_size(@columns)) {
+		throw(FormatException, "columnWidth must be the same size as, or one less than the size of the columns array.");
+	}
+	// The result is keyed by column name below, so the supplied names have to be used here.
+	@columnNames = @columns;
+} else {
+	if(array_size(@lines) < 2) {
+		throw(FormatException, "The input data must have at least 2 lines, the column names, and the header separator.");
+	}
+	@headerLine = execute(@lines[0], @tabWidth, @normalizeLine);
+	if(array_size(@columnWidth) == 0) {
+		// We need to calculate the column widths ourselves.
+		// The general approach here is to simply count the characters between
+		// the beginning of a column and the start of the first character after
+		// a run of spaces in the separator line (tabs are already normalized to spaces).
+		@columnSeparator = execute(@lines[1], @tabWidth, @normalizeLine);
+		@inSpaces = false;
+		@width = 0;
+		for(@i = 0, @i < length(@columnSeparator), @i++) {
+			@char = @columnSeparator[@i];
+			if(@inSpaces && @char != ' ') {
+				// New column here. Finalize this column and reset.
+				@columnWidth[] = @width;
+				@width = 0;
+				@inSpaces = false;
+			} else if(@char == ' ') {
+				@inSpaces = true;
+			}
+			@width++;
+		}
+		// Push infinity on the end, which is the last column width
+		@columnWidth[] = math_const('INFINITY');
+	} else {
+		// Widths were supplied without column names: use them as given rather than
+		// appending detected widths to them. The last width may have been omitted, so
+		// if the header has text beyond the supplied widths, add a final column for it.
+		@covered = 0;
+		foreach(@suppliedWidth in @columnWidth) {
+			@covered += @suppliedWidth;
+		}
+		if(@covered < length(@headerLine)) {
+			if(trim(substr(@headerLine, @covered)) != "") {
+				@columnWidth[] = math_const('INFINITY');
+			}
+		}
+	}
+	// Now we know the column widths, parse the column names
+	@columnNames = array_map(execute(@headerLine, @columnWidth, @splitLine), closure(@item) {return(trim(@item))});
+
+	// remove the first two lines
+	array_remove(@lines, 0);
+	array_remove(@lines, 0);
+}
+
+foreach(@column in @columnNames) {
+	@ret[@column] = array();
+}
+
+foreach(@line in @lines) {
+	if(@skipEmptyLines) {
+		if(trim(@line) == "") {
+			continue();
+		}
+	}
+	// One cell per column, in the same order as @columnNames.
+	@cells = execute(execute(@line, @tabWidth, @normalizeLine), @columnWidth, @splitLine);
+	for(@i = 0, @i < array_size(@cells), @i++) {
+		@ret[@columnNames[@i]][] = trim(@cells[@i]);
+	}
+}
+
+return(@ret);