[SYSTEMDS-3179] Builtin for GloVe cooccurrence matrix computation

saminbassiri · mboehm7 · commit 6fd08c0de828 · 2025-05-11T15:21:34.000+02:00
Closes #2200.
diff --git a/scripts/builtin/cooccurrenceMatrix.dml b/scripts/builtin/cooccurrenceMatrix.dml
@@ -0,0 +1,175 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+#
+# The implementation is based on
+# https://github.com/stanfordnlp/GloVe/blob/master/src/cooccur.c
+#
+#-------------------------------------------------------------
+
+## Normalizes raw text (drops periods, blanks out non-letter characters, lowercases) and prepends a row index column.
+# INPUT:
+# ------------------------------------------------------------------------------
+# S     (Frame[Unknown]): Input frame; only its first column is read as text.
+# ------------------------------------------------------------------------------
+# OUTPUT:
+# ------------------------------------------------------------------------------
+# result    (Frame[Unknown]): Two columns: row index (1..nrow(S)) and the cleaned text.
+# ------------------------------------------------------------------------------
+processText = function(Frame[Unknown] S) return (Frame[Unknown] result){
+    print("processText");
+    noDots = map(S[,1], "x -> x.replaceAll(\"[.]\", \"\")");
+    lettersOnly = map(noDots, "x -> x.replaceAll(\"[^a-zA-Z\\s]\", \" \")");
+    lowered = map(lettersOnly, "x -> x.toLowerCase()");
+    rowIds = as.frame(seq(1, nrow(S), 1));
+    result = cbind(rowIds, lowered);
+}
+
+## Splits each text entry into tokens and reports which entry every token came from.
+# INPUT:
+# ------------------------------------------------------------------------------
+# S           (Frame[Unknown]): Two-column frame: index column (1) and text column (2).
+# maxTokens   (Int): Maximum number of tokens per text entry.
+# ------------------------------------------------------------------------------
+# OUTPUT:
+# ------------------------------------------------------------------------------
+# result  (Frame[Unknown]): Tokenized words (third column of the tokenizer output).
+# docID   (Matrix[double]): Entry ID of each token (first column of the tokenizer output).
+# ------------------------------------------------------------------------------
+getWordPosition = function(Frame[Unknown] S, Int maxTokens) return (Frame[Unknown] result, Matrix[double] docID){
+    print("getWordPosition");
+    tokenSpec = "{\"algo\": \"split\", \"out\": \"position\",\"out_params\": {\"sort_alpha\": false},\"id_cols\": [1],\"tokenize_col\": 2}";
+    tokens = tokenize(target=S, spec=tokenSpec, max_tokens=maxTokens);
+    docID = as.matrix(tokens[,1]);
+    result = tokens[,3];
+}
+
+## Recodes tokens to integer IDs, reports the vocabulary size, and builds the ID-to-word mapping.
+## Uses transformencode() to recode strings and find each unique string position in the co-occurrence matrix.
+# INPUT:
+# ------------------------------------------------------------------------------
+# S     (Frame[Unknown]): 1D frame of tokenized words.
+# ------------------------------------------------------------------------------
+# OUTPUT:
+# ------------------------------------------------------------------------------
+# recodedWordPosition   (Matrix[double]): Recoded word ID of every token.
+# tableSize            (Int): Number of distinct words in the input text (co-occurrence matrix size).
+# column               (Frame[Unknown]): Two columns (word ID, word), rearranged by ascending word ID.
+# ------------------------------------------------------------------------------
+getRecodedMatrix = function(Frame[Unknown] S) return (Matrix[double] recodedWordPosition, Int tableSize, Frame[Unknown] column){
+    print("getRecodedMatrix");
+    [recodedWordPosition, meta] = transformencode(target=S, spec="{ids:true,recode:[1]}");
+    # Split every recode metadata entry into its word part and its ID part.
+    vocabWords = map(meta[,1], "s -> UtilFunctions.splitRecodeEntry(s)[0]");
+    wordIds = map(meta[,1], "s -> Integer.valueOf(UtilFunctions.splitRecodeEntry(s)[1])");
+    tableSize = nrow(vocabWords);
+    column = cbind(wordIds, vocabWords);
+    perm = order(target=as.matrix(wordIds), by=1, decreasing=FALSE, index.return=TRUE);
+    #TODO vectorize via order of frames
+    for(i in 1:nrow(perm)){
+        src = as.integer(as.scalar(perm[i,1]));
+        wordId = as.scalar(wordIds[src]);
+        column[as.integer(wordId), 2] = vocabWords[src];
+        column[i, 1] = wordId;
+    }
+}
+
+## Builds the co-occurrence matrix by scanning a context window around every token.
+# INPUT:
+# ------------------------------------------------------------------------------
+# recodedWordPosition (Matrix[double]): Two columns per token: document ID and recoded word index.
+# tableSize          (Int): Size of the vocabulary (number of unique words).
+# distanceWeighting  (Boolean): If TRUE, a pair at distance j contributes 1/j instead of 1.
+# symmetric          (Boolean): TRUE counts left and right context, FALSE counts only the left context.
+# windowSize        (Int): Context window size.
+# ------------------------------------------------------------------------------
+# OUTPUT:
+# ------------------------------------------------------------------------------
+# coocMatrix (Matrix[double]): tableSize x tableSize word-word co-occurrence matrix.
+# ------------------------------------------------------------------------------
+createCoocMatrix = function(
+    Matrix[double] recodedWordPosition,
+    Int tableSize,
+    boolean distanceWeighting,
+    boolean symmetric,
+    Int windowSize)
+return (Matrix[double] coocMatrix)
+{
+    print("Processing word cooccurrence...");
+    coocMatrix = matrix(0, tableSize, tableSize);
+    #TODO vectorize loop
+    for (i in 1:nrow(recodedWordPosition)) {
+        docId = as.integer(as.scalar(recodedWordPosition[i,1]));
+        wordIndex = as.integer(as.scalar(recodedWordPosition[i,2]));
+        if(wordIndex != 0){# This check is due to wrong result of the transformencode when running jvm test.
+            for (j in 1:windowSize) {
+                increase = ifelse(distanceWeighting, 1.0 / j, 1.0);
+                # Left context: token j positions before i, counted only within the same document.
+                if (i-j > 0) {
+                    if(docId == as.integer(as.scalar(recodedWordPosition[i-j, 1]))) {
+                        neighbourWordIndex = as.integer(as.scalar(recodedWordPosition[i-j,2]));
+                        if(neighbourWordIndex != 0) { # a 0 neighbour would address column 0, which is out of bounds
+                            coocMatrix[wordIndex, neighbourWordIndex] = coocMatrix[wordIndex, neighbourWordIndex] + increase;
+                        }
+                    }
+                }
+                # Right context: token j positions after i, counted only for symmetric windows.
+                if(symmetric == TRUE){
+                    if (i+j <= nrow(recodedWordPosition)) {
+                        if(docId == as.integer(as.scalar(recodedWordPosition[i+j, 1]))) {
+                            neighbourWordIndex = as.integer(as.scalar(recodedWordPosition[i+j,2]));
+                            if(neighbourWordIndex != 0) { # same guard as for the left context
+                                coocMatrix[wordIndex, neighbourWordIndex] = coocMatrix[wordIndex, neighbourWordIndex] + increase;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    print("Word-word cooccurrence matrix computation completed.");
+}
+
+## Entry point: turns a text corpus into a word-word co-occurrence matrix plus its word-index mapping.
+# INPUT:
+# ------------------------------------------------------------------------------
+# input            (Frame[Unknown]): 1D input corpus in CSV format.
+# maxTokens        (Int): Maximum number of tokens per text entry.
+# windowSize       (Int): Context window size.
+# distanceWeighting (Boolean): Whether to apply distance-based weighting.
+# symmetric        (Boolean): Determines if the matrix is symmetric (TRUE) or asymmetric (FALSE).
+# ------------------------------------------------------------------------------
+# OUTPUT:
+# ------------------------------------------------------------------------------
+# coocMatrix (Matrix[double]): The computed co-occurrence matrix.
+# column     (Frame[Unknown]): Word-index mapping for the co-occurrence matrix.
+# ------------------------------------------------------------------------------
+f_cooccurrenceMatrix = function(
+    Frame[Unknown] input,
+    Int maxTokens,
+    Int windowSize,
+    Boolean distanceWeighting,
+    Boolean symmetric) return (Matrix[Double] coocMatrix, Frame[Unknown] column){
+    cleanedText = processText(input);
+    [tokens, docIds] = getWordPosition(cleanedText, maxTokens);
+    [wordIds, vocabSize, column] = getRecodedMatrix(tokens);
+    docWordPairs = cbind(docIds, wordIds);
+    coocMatrix = createCoocMatrix(docWordPairs, vocabSize, distanceWeighting, symmetric, windowSize);
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -93,6 +93,7 @@ public enum Builtins {
 	CONV2D("conv2d", false),
 	CONV2D_BACKWARD_FILTER("conv2d_backward_filter", false),
 	CONV2D_BACKWARD_DATA("conv2d_backward_data", false),
+	COOCCURRENCEMATRIX("cooccurrenceMatrix", true),
 	COR("cor", true),
 	CORRECTTYPOS("correctTypos", true),
 	CORRECTTYPOSAPPLY("correctTyposApply", true),
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinCooccurrenceMatrixTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinCooccurrenceMatrixTest.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin.part1;
+
+import org.apache.sysds.common.Types;
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class BuiltinCooccurrenceMatrixTest extends AutomatedTestBase {
+	// Runs the cooccurrenceMatrix DML test script on a small corpus and compares its output with expected counts.
+	private static final String TEST_NAME = "cooccurrenceMatrix";
+	private static final String TEST_DIR = "functions/builtin/";
+	private static final String RESOURCE_DIRECTORY = "src/test/resources/datasets/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinCooccurrenceMatrixTest.class.getSimpleName() + "/";
+	private static final double EPSILON = 1e-10; // Tolerance for comparison
+
+	@Override
+	public void setUp() {
+		addTestConfiguration(TEST_NAME,new TestConfiguration(TEST_CLASS_DIR, TEST_NAME,new String[]{"TestResult",}));
+	}
+
+	@Test
+	public void cooccurrenceMatrixTest() {
+		runCooccurrenceMatrix(20, 2, "FALSE", "TRUE"); // maxTokens=20, windowSize=2, no distance weighting, symmetric
+		HashMap<MatrixValue.CellIndex, Double> cooccurrenceMatrix = readDMLMatrixFromOutputDir("TestResult");
+		double[][] computedC = TestUtils.convertHashMapToDoubleArray(cooccurrenceMatrix);
+
+		// Unique words, in row/column order: {apple, banana, orange, grape}
+		// Entry (a, b) counts how often b falls inside the window of a within the same input line
+		double[][] expectedC = new double[][] {
+				{0, 1, 2, 0},  // apple with {banana, orange}
+				{1, 0, 3, 1},  // banana with {apple, orange, grape}
+				{2, 3, 0, 2},  // orange with {apple, banana, grape}
+				{0, 1, 2, 0}   // grape with {banana, orange}
+		};
+
+		TestUtils.compareMatrices(expectedC, computedC, expectedC.length, expectedC[0].length, EPSILON);
+
+	}
+	// Runs the DML test script with the given settings; the result is written to the "TestResult" output.
+	public void runCooccurrenceMatrix(Integer maxTokens, Integer windowSize, String distanceWeighting, String symmetric) {
+		// Remember the previous execution mode (restored in the finally block), then load the test configuration
+		Types.ExecMode platformOld = setExecMode(Types.ExecType.CP);
+		try{
+			loadTestConfiguration(getTestConfiguration(TEST_NAME));
+			// Path of the DML script under test
+			String HOME = SCRIPT_DIR + TEST_DIR;
+
+			fullDMLScriptName = HOME + TEST_NAME + ".dml";
+			// Named arguments, read by the script as $input, $maxTokens, $windowSize, ...
+			programArgs = new String[]{"-nvargs",
+					"input=" + RESOURCE_DIRECTORY + "GloVe/coocMatrixTest.csv",
+					"maxTokens=" + maxTokens,
+					"windowSize=" + windowSize,
+					"distanceWeighting=" + distanceWeighting,
+					"symmetric=" + symmetric,
+					"out_file=" + output("TestResult")};
+			System.out.println("Run dml script..");
+			runTest(true, false, null, -1);
+			System.out.println("DONE");
+		}
+		finally {
+			rtplatform = platformOld;
+		}
+	}
+}
diff --git a/src/test/resources/datasets/GloVe/coocMatrixTest.csv b/src/test/resources/datasets/GloVe/coocMatrixTest.csv
@@ -0,0 +1,6 @@
+apple banana orange.
+banana orange grape.
+apple. orange
+grape 1111 ------ orange.
+------ <<<<<<< 1111 22222.
+banana orange
diff --git a/src/test/scripts/functions/builtin/cooccurrenceMatrix.dml b/src/test/scripts/functions/builtin/cooccurrenceMatrix.dml
@@ -0,0 +1,25 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+corpus = read($input, data_type="frame", format="csv", sep=",", header=FALSE);
+# Only the co-occurrence matrix is written out; the returned word-index mapping is not used here.
+[C, vocab] = cooccurrenceMatrix(corpus, $maxTokens, $windowSize, $distanceWeighting, $symmetric);
+write(C, $out_file, data_type="matrix");