apache
diff --git a/‎scripts/builtin/ampute.dml‎
Lines changed: 0 additions & 1 deletion b/‎scripts/builtin/ampute.dml‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎scripts/builtin/confusionMatrix.dml‎
Lines changed: 1 addition & 1 deletion b/‎scripts/builtin/confusionMatrix.dml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎scripts/builtin/cooccurrenceMatrix.dml‎
Lines changed: 6 additions & 7 deletions b/‎scripts/builtin/cooccurrenceMatrix.dml‎
Lines changed: 6 additions & 7 deletions
diff --git a/‎scripts/builtin/decisionTree.dml‎
Lines changed: 2 additions & 2 deletions b/‎scripts/builtin/decisionTree.dml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎scripts/builtin/differenceStatistics.dml‎
Lines changed: 5 additions & 0 deletions b/‎scripts/builtin/differenceStatistics.dml‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎scripts/builtin/glove.dml‎
Lines changed: 27 additions & 2 deletions b/‎scripts/builtin/glove.dml‎
Lines changed: 27 additions & 2 deletions
diff --git a/‎scripts/builtin/imputeByKNN.dml‎
Lines changed: 9 additions & 16 deletions b/‎scripts/builtin/imputeByKNN.dml‎
Lines changed: 9 additions & 16 deletions
diff --git a/‎scripts/builtin/quantizeByCluster.dml‎
Lines changed: 2 additions & 3 deletions b/‎scripts/builtin/quantizeByCluster.dml‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎scripts/builtin/randomForest.dml‎
Lines changed: 4 additions & 3 deletions b/‎scripts/builtin/randomForest.dml‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎scripts/builtin/shapExplainer.dml‎
Lines changed: 1 addition & 0 deletions b/‎scripts/builtin/shapExplainer.dml‎
Lines changed: 1 addition & 0 deletions
@@ -30,7 +30,6 @@
 # mech         a string [either "MAR", "MNAR", or "MCAR"] specifying the missingness mechanism. Chosen "MAR" and "MNAR" settings will be overridden if a non-default weight matrix is specified
 # weights      a weight matrix [shape: k-by-m], containing weights that will be used to calculate the weighted sum scores. Will be overridden if mech == "MCAR"
 # seed         a manually defined seed for reproducible RNG
-
 # -------------------------------------------------------------------------------------
 #
 # OUTPUT:
 
@@ -23,7 +23,7 @@
 # and actual labels. We return both the counts and relative frequency
 # (normalized by sum of true labels)
 #
-# .. code-block::
+# .. code-block:: text
 #
 #                   True Labels
 #                     1    2
 
@@ -18,22 +18,21 @@
 # under the License.
 #
 #-------------------------------------------------------------
-#
-# The implementation is based on
+
+# Cleans and processes text data by removing punctuation, converting it to lowercase, and reformatting.
+# Adds an index column to the result. The implementation is based on
 # https://github.com/stanfordnlp/GloVe/blob/master/src/cooccur.c
 #
-#-------------------------------------------------------------
-
-## Cleans and processes text data by removing punctuation, converting it to lowercase, and reformatting.
-## Adds an index column to the result.
 # INPUT:
 # ------------------------------------------------------------------------------
 # S     (Frame[Unknown]): 1D input data frame containing text data.
 # ------------------------------------------------------------------------------
+#
 # OUTPUT:
 # ------------------------------------------------------------------------------
 # result    (Frame[Unknown]): Processed text data with an index column.
 # ------------------------------------------------------------------------------
+
 processText = function(Frame[Unknown] S) return (Frame[Unknown] result){
     print("processText");
     tmpStr = map(S[,1], "x -> x.replaceAll(\"[.]\", \"\")");
@@ -172,4 +171,4 @@ f_cooccurrenceMatrix = function(
     [wordPosition, docID] = getWordPosition(processedResult, maxTokens);
     [recodedWordPosition, tableSize, column] = getRecodedMatrix(wordPosition);
     coocMatrix = createCoocMatrix(cbind(docID, recodedWordPosition), tableSize, distanceWeighting, symmetric, windowSize);
-}
+}
@@ -30,9 +30,9 @@
 #   and the following trees, M would look as follows:
 #
 #   (L1)               |d<5|
-#                     /     \
+#                     /     \\
 #   (L2)           P1:2    |a<7|
-#                          /   \
+#                          /   \\
 #   (L3)                 P2:2 P3:1
 #
 #   --> M :=
 
@@ -28,6 +28,11 @@
 # X        First Matrix to compare
 # Y        Second Matrix to compare
 # --------------------------------------------------------------------------------
+#
+# OUTPUT:
+# -------------------------------------------------------------------------------------
+# stats    Difference statistics
+# -------------------------------------------------------------------------------------
 
 m_differenceStatistics = function(Matrix[Double] X, Matrix[Double] Y)  {
 
 
@@ -18,6 +18,31 @@
 # under the License.
 #-------------------------------------------------------------
 
+# Computes the vector embeddings for words in a large text corpus. 
+#
+# INPUT:
+# -------------------------------------------------------------------------------- 
+# input                 1D input corpus in CSV format.
+# seed                  Random seed for reproducibility.
+# vector_size           Dimensionality of word vectors, V.
+# eta                   Learning rate for optimization, recommended value: 0.05.
+# alpha                 Weighting function parameter, recommended value: 0.75.
+# x_max                 Maximum co-occurrence value as per the GloVe paper: 100.
+# tol                   Tolerance value to avoid overfitting, recommended value: 1e-4.
+# iterations            Total number of training iterations.
+# print_loss_it         Interval (in iterations) for printing the loss.
+# maxTokens             Maximum number of tokens per text entry.
+# windowSize            Context window size.
+# distanceWeighting     Whether to apply distance-based weighting.
+# symmetric             Determines if the matrix is symmetric (TRUE) or asymmetric (FALSE).
+# ------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ------------------------------------------------------------------------------
+# G                     The word indices and their word vectors, of shape (N, V). Each word is represented as a vector of shape (1, V).
+# ------------------------------------------------------------------------------
+
+
 init = function(matrix[double] cooc_matrix, double x_max, double alpha)
   return(matrix[double] weights, matrix[double] log_cooc_matrix){
   E = 2.718281828;
@@ -119,7 +144,7 @@ gloveWithCoocMatrix = function(matrix[double] cooc_matrix, frame[Unknown] cooc_i
     G = cbind(cooc_index[,2], as.frame(G));
 }
 
-glove = function(
+f_glove = function(
     Frame[Unknown] input,
     int seed, int vector_size,
     double alpha, double eta,
@@ -159,4 +184,4 @@ glove = function(
 
         [cooc_matrix, cooc_index] = cooccurrenceMatrix(input, maxTokens, windowSize, distanceWeighting, symmetric);
         G = gloveWithCoocMatrix(cooc_matrix, cooc_index, seed, vector_size, alpha, eta, x_max, tol, iterations, print_loss_it);
-}
+}
@@ -25,23 +25,16 @@
 # the missing values by column means. Currently, only the column with the most
 # missing values is actually imputed.
 #
-# ------------------------------------------------------------------------------
 # INPUT:
 # ------------------------------------------------------------------------------
-# X           Matrix with missing values, which are represented as NaNs
-# method      Method used for imputing missing values with different performance
-#             and accuracy tradeoffs:
-#             'dist' (default): Compute all-pairs distances and impute the
-#                               missing values by closest. O(N^2 * #features)
-#             'dist_missing':   Compute distances between data and records with
-#                               missing values. O(N*M * #features), assuming
-#                               that the number of records with MV is M<<N.
-#             'dist_sample':    Compute distances between sample of data and
-#                               records with missing values. O(S*M * #features)
-#                               with M<<N and S<<N, but suboptimal imputation.
-# seed        Root seed value for random/sample calls for deterministic behavior
-#             -1 for true randomization
-# sample_frac Sample fraction for 'dist_sample' (value between 0 and 1)
+# X             Matrix with missing values, which are represented as NaNs
+# method        Method used for imputing missing values with different performance and accuracy tradeoffs:\n
+#               - 'dist' (default): Compute all-pairs distances and impute the missing values by closest. O(N^2 * #features)
+#               - 'dist_missing': Compute distances between data and records with missing values. O(N*M * #features), assuming that the number of records with MV is M<<N.
+#               - 'dist_sample': Compute distances between sample of data and records with missing values. O(S*M * #features) with M<<N and S<<N, but suboptimal imputation.
+#
+# seed          Root seed value for random/sample calls for deterministic behavior. -1 for true randomization
+# sample_frac   Sample fraction for 'dist_sample' (value between 0 and 1)
 # ------------------------------------------------------------------------------
 #
 # OUTPUT:
@@ -136,4 +129,4 @@ compute_missing_values = function (Matrix[Double] X, Matrix[Double] filled_matri
     #Get the subset records that need to be imputed
     imputedValue = t(reshaped) %*% aligned
     imputedValue = t(imputedValue)
-}
+}
@@ -58,7 +58,7 @@
 #           the product quantization. Only relevant when space_decomp = TRUE.
 # ------------------------------------------------------------------------------------------
 
-m_quantizeByCluster = function(Matrix[Double]X, Integer M = 4, Integer k = 10, Integer runs = 10,
+m_quantizeByCluster = function(Matrix[Double] X, Integer M = 4, Integer k = 10, Integer runs = 10,
     Integer max_iter = 1000, Double eps = 1e-6, Integer avg_sample_size_per_centroid = 50, Boolean separate=TRUE, Boolean space_decomp=FALSE, Integer seed = -1)
   return(Matrix[Double] codebook, Matrix[Double] codes, Matrix[Double] R)
 {
@@ -118,5 +118,4 @@ m_quantizeByCluster = function(Matrix[Double]X, Integer M = 4, Integer k = 10, I
       codes[,i] = tmp_c + offset
     }
   }
-}
-
+}
@@ -26,16 +26,17 @@
 # and optionally subset of features (columns). During tree construction, split
 # candidates are additionally chosen on a sample of remaining features.
 #
-# .. code-block::
+# .. code-block:: text
 #
 #   For example, given a feature matrix with features [a,b,c,d]
 #   and the following two trees, M (the output) would look as follows:
 #
 #   (L1)          |a<7|                   |d<5|
-#                /     \                 /     \
+#                /     \\                 /     \\
 #   (L2)     |c<3|     |b<4|         |a<7|     P3:2
-#            /   \     /   \         /   \
+#            /   \\     /   \\         /   \\
 #   (L3)   P1:2 P2:1 P3:1 P4:2     P1:2 P2:1
+#
 #   --> M :=
 #   [[1, 7, 3, 3, 2, 4, 0, 2, 0, 1, 0, 1, 0, 2],  (1st tree)
 #    [4, 5, 1, 7, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0]]  (2nd tree)
 
@@ -51,6 +51,7 @@
 # S              Matrix holding the shapley values along the cols, one row per instance.
 # expected       Double holding the average prediction of all instances.
 # -----------------------------------------------------------------------------
+
 s_shapExplainer = function(String model_function, list[unknown] model_args, Matrix[Double] x_instances,
     Matrix[Double] X_bg, Integer n_permutations = 10, Integer n_samples = 100, Integer remove_non_var=0,
     Matrix[Double] partitions=as.matrix(-1), Integer seed = -1, Integer verbose = 0)
Original file line number	Diff line number	Diff line change
`@@ -30,7 +30,6 @@`
`30`	`30`	`# mech a string [either "MAR", "MNAR", or "MCAR"] specifying the missingness mechanism. Chosen "MAR" and "MNAR" settings will be overridden if a non-default weight matrix is specified`
`31`	`31`	`# weights a weight matrix [shape: k-by-m], containing weights that will be used to calculate the weighted sum scores. Will be overridden if mech == "MCAR"`
`32`	`32`	`# seed a manually defined seed for reproducible RNG`
`33`		`-`
`34`	`33`	`# -------------------------------------------------------------------------------------`
`35`	`34`	`#`
`36`	`35`	`# OUTPUT:`
Original file line number	Diff line number	Diff line change
`@@ -23,7 +23,7 @@`
`23`	`23`	`# and actual labels. We return both the counts and relative frequency`
`24`	`24`	`# (normalized by sum of true labels)`
`25`	`25`	`#`
`26`		`-# .. code-block::`
	`26`	`+# .. code-block:: text`
`27`	`27`	`#`
`28`	`28`	`# True Labels`
`29`	`29`	`# 1 2`
Original file line number	Diff line number	Diff line change
`@@ -30,9 +30,9 @@`
`30`	`30`	`# and the following trees, M would look as follows:`
`31`	`31`	`#`
`32`	`32`	`# (L1) \|d<5\|`
`33`		`-# / \`
	`33`	`+# / \\`
`34`	`34`	`# (L2) P1:2 \|a<7\|`
`35`		`-# / \`
	`35`	`+# / \\`
`36`	`36`	`# (L3) P2:2 P3:1`
`37`	`37`	`#`
`38`	`38`	`# --> M :=`
Original file line number	Diff line number	Diff line change
`@@ -58,7 +58,7 @@`
`58`	`58`	`# the product quantization. Only relevant when space_decomp = TRUE.`
`59`	`59`	`# ------------------------------------------------------------------------------------------`
`60`	`60`
`61`		`-m_quantizeByCluster = function(Matrix[Double]X, Integer M = 4, Integer k = 10, Integer runs = 10,`
	`61`	`+m_quantizeByCluster = function(Matrix[Double] X, Integer M = 4, Integer k = 10, Integer runs = 10,`
`62`	`62`	`Integer max_iter = 1000, Double eps = 1e-6, Integer avg_sample_size_per_centroid = 50, Boolean separate=TRUE, Boolean space_decomp=FALSE, Integer seed = -1)`
`63`	`63`	`return(Matrix[Double] codebook, Matrix[Double] codes, Matrix[Double] R)`
`64`	`64`	`{`
`@@ -118,5 +118,4 @@ m_quantizeByCluster = function(Matrix[Double]X, Integer M = 4, Integer k = 10, I`
`118`	`118`	`codes[,i] = tmp_c + offset`
`119`	`119`	`}`
`120`	`120`	`}`
`121`		`-}`
`122`		`-`
	`121`	`+}`