Skip to content

Commit 1b4fe90

Browse files
committed
Fixes in generator.py, builtin function fixes
1 parent a6faf44 commit 1b4fe90

39 files changed

+578
-75
lines changed

scripts/builtin/ampute.dml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
# mech a string [either "MAR", "MNAR", or "MCAR"] specifying the missingness mechanism. Chosen "MAR" and "MNAR" settings will be overridden if a non-default weight matrix is specified
3131
# weights a weight matrix [shape: k-by-m], containing weights that will be used to calculate the weighted sum scores. Will be overridden if mech == "MCAR"
3232
# seed a manually defined seed for reproducible RNG
33-
3433
# -------------------------------------------------------------------------------------
3534
#
3635
# OUTPUT:

scripts/builtin/confusionMatrix.dml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
# and actual labels. We return both the counts and relative frequency
2424
# (normalized by sum of true labels)
2525
#
26-
# .. code-block::
26+
# .. code-block:: text
2727
#
2828
# True Labels
2929
# 1 2

scripts/builtin/cooccurrenceMatrix.dml

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,22 +18,21 @@
1818
# under the License.
1919
#
2020
#-------------------------------------------------------------
21-
#
22-
# The implementation is based on
21+
22+
# Cleans and processes text data by removing punctuation, converting it to lowercase, and reformatting.
23+
# Adds an index column to the result. The implementation is based on
2324
# https://github.com/stanfordnlp/GloVe/blob/master/src/cooccur.c
2425
#
25-
#-------------------------------------------------------------
26-
27-
## Cleans and processes text data by removing punctuation, converting it to lowercase, and reformatting.
28-
## Adds an index column to the result.
2926
# INPUT:
3027
# ------------------------------------------------------------------------------
3128
# S (Frame[Unknown]): 1D input data frame containing text data.
3229
# ------------------------------------------------------------------------------
30+
#
3331
# OUTPUT:
3432
# ------------------------------------------------------------------------------
3533
# result (Frame[Unknown]): Processed text data with an index column.
3634
# ------------------------------------------------------------------------------
35+
3736
processText = function(Frame[Unknown] S) return (Frame[Unknown] result){
3837
print("processText");
3938
tmpStr = map(S[,1], "x -> x.replaceAll(\"[.]\", \"\")");
@@ -172,4 +171,4 @@ f_cooccurrenceMatrix = function(
172171
[wordPosition, docID] = getWordPosition(processedResult, maxTokens);
173172
[recodedWordPosition, tableSize, column] = getRecodedMatrix(wordPosition);
174173
coocMatrix = createCoocMatrix(cbind(docID, recodedWordPosition), tableSize, distanceWeighting, symmetric, windowSize);
175-
}
174+
}

scripts/builtin/decisionTree.dml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,9 @@
3030
# and the following trees, M would look as follows:
3131
#
3232
# (L1) |d<5|
33-
# / \
33+
# / \\
3434
# (L2) P1:2 |a<7|
35-
# / \
35+
# / \\
3636
# (L3) P2:2 P3:1
3737
#
3838
# --> M :=

scripts/builtin/differenceStatistics.dml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,11 @@
2828
# X First Matrix to compare
2929
# Y Second Matrix to compare
3030
# --------------------------------------------------------------------------------
31+
#
32+
# OUTPUT:
33+
# -------------------------------------------------------------------------------------
34+
# stats. Difference statistics
35+
# -------------------------------------------------------------------------------------
3136

3237
m_differenceStatistics = function(Matrix[Double] X, Matrix[Double] Y) {
3338

scripts/builtin/glove.dml

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,31 @@
1818
# under the License.
1919
#-------------------------------------------------------------
2020

21+
# Computes the vector embeddings for words in a large text corpus.
22+
#
23+
# INPUT:
24+
# --------------------------------------------------------------------------------
25+
# input 1D input corpus in CSV format.
26+
# seed Random seed for reproducibility.
27+
# vector_size Dimensionality of word vectors, V.
28+
# eta Learning rate for optimization, recommended value: 0.05.
29+
# alpha Weighting function parameter, recommended value: 0.75.
30+
# x_max Maximum co-occurrence value as per the GloVe paper: 100.
31+
# tol Tolerance value to avoid overfitting, recommended value: 1e-4.
32+
# iterations Total number of training iterations.
33+
# print_loss_it Interval (in iterations) for printing the loss.
34+
# maxTokens Maximum number of tokens per text entry.
35+
# windowSize Context window size.
36+
# distanceWeighting Whether to apply distance-based weighting.
37+
# symmetric Determines if the matrix is symmetric (TRUE) or asymmetric (FALSE).
38+
# ------------------------------------------------------------------------------
39+
#
40+
# OUTPUT:
41+
# ------------------------------------------------------------------------------
42+
# G The word indices and their word vectors, of shape (N, V). Each word is represented as a vector of shape (1, V).
43+
# ------------------------------------------------------------------------------
44+
45+
2146
init = function(matrix[double] cooc_matrix, double x_max, double alpha)
2247
return(matrix[double] weights, matrix[double] log_cooc_matrix){
2348
E = 2.718281828;
@@ -119,7 +144,7 @@ gloveWithCoocMatrix = function(matrix[double] cooc_matrix, frame[Unknown] cooc_i
119144
G = cbind(cooc_index[,2], as.frame(G));
120145
}
121146

122-
glove = function(
147+
f_glove = function(
123148
Frame[Unknown] input,
124149
int seed, int vector_size,
125150
double alpha, double eta,
@@ -159,4 +184,4 @@ glove = function(
159184

160185
[cooc_matrix, cooc_index] = cooccurrenceMatrix(input, maxTokens, windowSize, distanceWeighting, symmetric);
161186
G = gloveWithCoocMatrix(cooc_matrix, cooc_index, seed, vector_size, alpha, eta, x_max, tol, iterations, print_loss_it);
162-
}
187+
}

scripts/builtin/imputeByKNN.dml

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -25,23 +25,16 @@
2525
# the missing values by column means. Currently, only the column with the most
2626
# missing values is actually imputed.
2727
#
28-
# ------------------------------------------------------------------------------
2928
# INPUT:
3029
# ------------------------------------------------------------------------------
31-
# X Matrix with missing values, which are represented as NaNs
32-
# method Method used for imputing missing values with different performance
33-
# and accuracy tradeoffs:
34-
# 'dist' (default): Compute all-pairs distances and impute the
35-
# missing values by closest. O(N^2 * #features)
36-
# 'dist_missing': Compute distances between data and records with
37-
# missing values. O(N*M * #features), assuming
38-
# that the number of records with MV is M<<N.
39-
# 'dist_sample': Compute distances between sample of data and
40-
# records with missing values. O(S*M * #features)
41-
# with M<<N and S<<N, but suboptimal imputation.
42-
# seed Root seed value for random/sample calls for deterministic behavior
43-
# -1 for true randomization
44-
# sample_frac Sample fraction for 'dist_sample' (value between 0 and 1)
30+
# X Matrix with missing values, which are represented as NaNs
31+
# method Method used for imputing missing values with different performance and accuracy tradeoffs:\n
32+
# - 'dist' (default): Compute all-pairs distances and impute the missing values by closest. O(N^2 * #features)
33+
# - 'dist_missing': Compute distances between data and records with missing values. O(N*M * #features), assuming that the number of records with MV is M<<N.
34+
# - 'dist_sample': Compute distances between sample of data and records with missing values. O(S*M * #features) with M<<N and S<<N, but suboptimal imputation.
35+
#
36+
# seed Root seed value for random/sample calls for deterministic behavior. -1 for true randomization
37+
# sample_frac Sample fraction for 'dist_sample' (value between 0 and 1)
4538
# ------------------------------------------------------------------------------
4639
#
4740
# OUTPUT:
@@ -136,4 +129,4 @@ compute_missing_values = function (Matrix[Double] X, Matrix[Double] filled_matri
136129
#Get the subset records that need to be imputed
137130
imputedValue = t(reshaped) %*% aligned
138131
imputedValue = t(imputedValue)
139-
}
132+
}

scripts/builtin/quantizeByCluster.dml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@
5858
# the product quantization. Only relevant when space_decomp = TRUE.
5959
# ------------------------------------------------------------------------------------------
6060

61-
m_quantizeByCluster = function(Matrix[Double]X, Integer M = 4, Integer k = 10, Integer runs = 10,
61+
m_quantizeByCluster = function(Matrix[Double] X, Integer M = 4, Integer k = 10, Integer runs = 10,
6262
Integer max_iter = 1000, Double eps = 1e-6, Integer avg_sample_size_per_centroid = 50, Boolean separate=TRUE, Boolean space_decomp=FALSE, Integer seed = -1)
6363
return(Matrix[Double] codebook, Matrix[Double] codes, Matrix[Double] R)
6464
{
@@ -118,5 +118,4 @@ m_quantizeByCluster = function(Matrix[Double]X, Integer M = 4, Integer k = 10, I
118118
codes[,i] = tmp_c + offset
119119
}
120120
}
121-
}
122-
121+
}

scripts/builtin/randomForest.dml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,16 +26,17 @@
2626
# and optionally subset of features (columns). During tree construction, split
2727
# candidates are additionally chosen on a sample of remaining features.
2828
#
29-
# .. code-block::
29+
# .. code-block:: text
3030
#
3131
# For example, given a feature matrix with features [a,b,c,d]
3232
# and the following two trees, M (the output) would look as follows:
3333
#
3434
# (L1) |a<7| |d<5|
35-
# / \ / \
35+
# / \\ / \\
3636
# (L2) |c<3| |b<4| |a<7| P3:2
37-
# / \ / \ / \
37+
# / \\ / \\ / \\
3838
# (L3) P1:2 P2:1 P3:1 P4:2 P1:2 P2:1
39+
#
3940
# --> M :=
4041
# [[1, 7, 3, 3, 2, 4, 0, 2, 0, 1, 0, 1, 0, 2], (1st tree)
4142
# [4, 5, 1, 7, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0]] (2nd tree)

scripts/builtin/shapExplainer.dml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
# S Matrix holding the shapley values along the cols, one row per instance.
5252
# expected Double holding the average prediction of all instances.
5353
# -----------------------------------------------------------------------------
54+
5455
s_shapExplainer = function(String model_function, list[unknown] model_args, Matrix[Double] x_instances,
5556
Matrix[Double] X_bg, Integer n_permutations = 10, Integer n_samples = 100, Integer remove_non_var=0,
5657
Matrix[Double] partitions=as.matrix(-1), Integer seed = -1, Integer verbose = 0)

0 commit comments

Comments
 (0)