jplag · wsimonw · Dec 29, 2023 · Jan 1, 2024 · Jan 5, 2024 · Jan 5, 2024
diff --git a/README.md b/README.md
@@ -67,7 +67,7 @@ JPlag is released on [Maven Central](https://search.maven.org/search?q=de.jplag)
 </dependency>
 ```
 
-### Building from sources 
+### Building from sources
 1. Download or clone the code from this repository.
 2. Run `mvn clean package` from the repository root to compile and build all submodules.
    Run `mvn clean package assembly:single` instead if you need the full jar, which includes all dependencies.
@@ -107,7 +107,7 @@ Parameter descriptions:
                         Root-directories with submissions to check for
                           plagiarism (same as root).
       --normalize       Activate the normalization of tokens. Supported for
-                          languages: Java, C++.
+                          languages: Java, Java-CPG, C++.
       -old, --old=<oldDirectories>[,<oldDirectories>...]
                         Root-directories with prior submissions to compare
                           against.
@@ -238,4 +238,4 @@ Please consider our [guidelines for contributions](https://github.com/jplag/JPla
 ## Contact
 If you encounter bugs or other issues, please report them [here](https://github.com/jplag/jplag/issues).
 For other purposes, you can contact us at jplag@ipd.kit.edu.
-We would love to hear about your research related to JPlag. Feel free to contact us!
+We would love to hear about your research related to JPlag. Feel free to contact us!
diff --git a/cli/pom.xml b/cli/pom.xml
@@ -30,6 +30,10 @@
             <groupId>de.jplag</groupId>
             <artifactId>java</artifactId>
         </dependency>
+        <dependency>
+            <groupId>de.jplag</groupId>
+            <artifactId>java-cpg</artifactId>
+        </dependency>
         <dependency>
             <groupId>de.jplag</groupId>
             <artifactId>python-3</artifactId>

diff --git a/cli/src/main/java/de/jplag/cli/options/CliOptions.java b/cli/src/main/java/de/jplag/cli/options/CliOptions.java
@@ -76,7 +76,7 @@ public class CliOptions implements Runnable {
     public JPlagMode mode = JPlagMode.AUTO;
 
     /** Enable token normalization (Java, C++). */
-    @Option(names = {"--normalize"}, description = "Activate the normalization of tokens. Supported for languages: Java, C++.")
+    @Option(names = {"--normalize"}, description = "Activate the normalization of tokens. Supported for languages: Java, Java-CPG, C++.")
     public boolean normalize = false;
 
     /** Advanced options group. */

diff --git a/core/src/main/java/de/jplag/comparison/GreedyStringTiling.java b/core/src/main/java/de/jplag/comparison/GreedyStringTiling.java
@@ -13,6 +13,7 @@
 import de.jplag.Match;
 import de.jplag.Submission;
 import de.jplag.Token;
+import de.jplag.TokenEquivalenceModel;
 import de.jplag.options.JPlagOptions;
 
 /**
@@ -31,9 +32,10 @@ public class GreedyStringTiling {
     private final Map<Submission, RollingTokenHashTable> cachedHashLookupTables = Collections.synchronizedMap(new IdentityHashMap<>());
 
     private final TokenSequenceMapper tokenSequenceMapper;
+    private final TokenEquivalenceModel tokenEquivalenceModel;
 
     /**
-     * Creates a instance of the Greedy String Tiling algorithm.
+     * Creates an instance of the Greedy String Tiling algorithm.
      * @param options are the options, controlling algorithm parameters like minimum token match.
      * @param tokenValueMapper provides integer mappings for token sequences.
      */
@@ -43,6 +45,7 @@ public GreedyStringTiling(JPlagOptions options, TokenSequenceMapper tokenValueMa
         int minimumNeighborLength = Math.clamp(options.mergingOptions().minimumNeighborLength(), 1, options.minimumTokenMatch());
 
         this.minimumMatchLength = options.mergingOptions().enabled() ? minimumNeighborLength : options.minimumTokenMatch();
+        this.tokenEquivalenceModel = options.language().getTokenEquivalenceModel();
 
         this.tokenSequenceMapper = tokenValueMapper;
     }
@@ -109,6 +112,10 @@ private JPlagComparison compareOrdered(Submission leftSubmission, Submission rig
         int[] leftTokens = this.tokenSequenceMapper.getTokenSequenceFor(leftSubmission);
         int[] rightTokens = this.tokenSequenceMapper.getTokenSequenceFor(rightSubmission);
 
+        if (!tokenEquivalenceModel.ensureTokenType(leftSubmission.getTokenList())
+                || !tokenEquivalenceModel.ensureTokenType(rightSubmission.getTokenList())) {
+            throw new IllegalStateException("Token equivalence model requires specific token types, but they are not given.");
+        }
         boolean[] leftExcludedTokens = calculateExcludedTokens(leftSubmission);
         boolean[] rightExcludedTokens = calculateExcludedTokens(rightSubmission);
 
@@ -134,7 +141,7 @@ private JPlagComparison compareOrdered(Submission leftSubmission, Submission rig
                     }
 
                     int subsequenceMatchLength = findLongestUnmarkedMatch(leftTokens, leftStartIndex, leftExcludedTokens, rightTokens,
-                            rightStartIndex, rightExcludedTokens, maximumMatchLength);
+                            rightStartIndex, rightExcludedTokens, maximumMatchLength, leftSubmission.getTokenList(), rightSubmission.getTokenList());
                     if (subsequenceMatchLength >= maximumMatchLength) {
                         if (subsequenceMatchLength > maximumMatchLength) {
                             iterationMatches.clear();
@@ -177,17 +184,21 @@ private JPlagComparison compareOrdered(Submission leftSubmission, Submission rig
      * length.
      */
     private int findLongestUnmarkedMatch(int[] leftValues, int leftStartIndex, boolean[] leftMarked, int[] rightValues, int rightStartIndex,
-            boolean[] rightMarked, int minimumSequenceLength) {
+            boolean[] rightMarked, int minimumSequenceLength, List<Token> leftTokens, List<Token> rightTokens) {
         for (int offset = minimumSequenceLength - 1; offset >= 0; offset--) {
             int leftIndex = leftStartIndex + offset;
             int rightIndex = rightStartIndex + offset;
-            if (leftValues[leftIndex] != rightValues[rightIndex] || leftMarked[leftIndex] || rightMarked[rightIndex]) {
+            if (!tokenEquivalenceModel.arePrimaryEquivalent(leftValues[leftIndex], rightValues[rightIndex])
+                    || !tokenEquivalenceModel.areSecondaryEquivalent(leftTokens.get(leftIndex).getType(), rightTokens.get(rightIndex).getType())
+                    || leftMarked[leftIndex] || rightMarked[rightIndex]) {
                 return 0;
             }
         }
         int offset = minimumSequenceLength;
-        while (leftValues[leftStartIndex + offset] == rightValues[rightStartIndex + offset] && !leftMarked[leftStartIndex + offset]
-                && !rightMarked[rightStartIndex + offset]) {
+        while (tokenEquivalenceModel.arePrimaryEquivalent(leftValues[leftStartIndex + offset], rightValues[rightStartIndex + offset])
+                && tokenEquivalenceModel.areSecondaryEquivalent(leftTokens.get(leftStartIndex + offset).getType(),
+                        rightTokens.get(rightStartIndex + offset).getType())
+                && !leftMarked[leftStartIndex + offset] && !rightMarked[rightStartIndex + offset]) {
             offset++;
         }
         return offset;

diff --git a/core/src/main/java/de/jplag/comparison/LongestCommonSubsequenceSearch.java b/core/src/main/java/de/jplag/comparison/LongestCommonSubsequenceSearch.java
@@ -101,7 +101,7 @@ public JPlagResult compareSubmissions(SubmissionSet submissionSet) throws Compar
         long startTimeMillis = System.currentTimeMillis();
 
         // Set up data structures:
-        TokenSequenceMapper tokenSequenceMapper = new TokenSequenceMapper(submissionSet);
+        TokenSequenceMapper tokenSequenceMapper = new TokenSequenceMapper(options.language().getTokenEquivalenceModel(), submissionSet);
         GreedyStringTiling coreAlgorithm = new GreedyStringTiling(options, tokenSequenceMapper);
 
         // Prepare base code comparisons:

diff --git a/core/src/main/java/de/jplag/comparison/TokenSequenceMapper.java b/core/src/main/java/de/jplag/comparison/TokenSequenceMapper.java
@@ -9,6 +9,7 @@
 import de.jplag.Submission;
 import de.jplag.SubmissionSet;
 import de.jplag.Token;
+import de.jplag.TokenEquivalenceModel;
 import de.jplag.TokenType;
 import de.jplag.logging.ProgressBarLogger;
 import de.jplag.logging.ProgressBarType;
@@ -21,13 +22,16 @@
 public class TokenSequenceMapper {
     private final Map<TokenType, Integer> tokenTypeToId;
     private final Map<Submission, int[]> submissionToTokenSequence;
+    private final TokenEquivalenceModel tokenEquivalenceModel;
 
     /**
      * Creates the submission to token ID mapping for a set of submissions. This will also show the progress to the user
      * using the {@link ProgressBarLogger}.
+     * @param tokenEquivalenceModel the model to use for token type equivalence.
      * @param submissionSet is the set of submissions to process.
      */
-    public TokenSequenceMapper(SubmissionSet submissionSet) {
+    public TokenSequenceMapper(TokenEquivalenceModel tokenEquivalenceModel, SubmissionSet submissionSet) {
+        this.tokenEquivalenceModel = tokenEquivalenceModel;
         tokenTypeToId = new HashMap<>();
         submissionToTokenSequence = new IdentityHashMap<>();
 
@@ -47,7 +51,7 @@ private void addSingleSubmission(Submission submission) {
         List<Token> tokens = submission.getTokenList();
         int[] tokenSequence = new int[tokens.size()];
         for (int i = 0; i < tokens.size(); i++) {
-            TokenType type = tokens.get(i).getType();
+            TokenType type = tokenEquivalenceModel.getPrimaryType(tokens.get(i));
             tokenTypeToId.putIfAbsent(type, tokenTypeToId.size());
             tokenSequence[i] = tokenTypeToId.get(type);
         }

diff --git a/coverage-report/pom.xml b/coverage-report/pom.xml
@@ -119,6 +119,10 @@
             <groupId>de.jplag</groupId>
             <artifactId>multi-language</artifactId>
         </dependency>
+        <dependency>
+            <groupId>de.jplag</groupId>
+            <artifactId>java-cpg</artifactId>
+        </dependency>
     </dependencies>
     <build>
         <plugins>

diff --git a/docs/1.-How-to-Use-JPlag.md b/docs/1.-How-to-Use-JPlag.md
@@ -42,7 +42,7 @@ Parameter descriptions:
                         Root-directories with submissions to check for
                           plagiarism (same as root).
       --normalize       Activate the normalization of tokens. Supported for
-                          languages: Java, C++.
+                          languages: Java, Java-CPG, C++.
       -old, --old=<oldDirectories>[,<oldDirectories>...]
                         Root-directories with prior submissions to compare
                           against.
@@ -251,7 +251,7 @@ The base code is a special kind of submission. It is the template on which all o
     └── Solution.java
 ```
 
-In this example, students must solve a problem by implementing the `run` method in the template below. Because they are not supposed to modify the `main` function, it will be identical for each student. 
+In this example, students must solve a problem by implementing the `run` method in the template below. Because they are not supposed to modify the `main` function, it will be identical for each student.
 
 ```java
 // BaseCode/Solution.java
@@ -269,14 +269,14 @@ public class Solution {
 }
 ```
 
-To prevent JPlag from detecting similarities in the `main` function (and other parts of the template), we can instruct JPlag to ignore matches with the given base code by providing the `-bc=<base-code-name>` option. 
+To prevent JPlag from detecting similarities in the `main` function (and other parts of the template), we can instruct JPlag to ignore matches with the given base code by providing the `-bc=<base-code-name>` option.
 The `<base-code-name>` in the example above is `BaseCode`.
 
 ### Multiple Root Directories
 * You can run JPlag with multiple root directories; JPlag compares submissions from all of them
 * JPlag distinguishes between old and new root directories
-** Submissions in new root directories are checked amongst themselves and against submissions from other root directories
-** Submissions in old root directories are only checked against submissions from other new root directories
+  * Submissions in new root directories are checked amongst themselves and against submissions from other root directories
+  * Submissions in old root directories are only checked against submissions from other new root directories
 * You need at least one (new) root directory to run JPlag
 
 This allows you to check submissions against those of previous years:
@@ -307,4 +307,4 @@ classDiagram
     Directory --> "1..*" File : contains
     Submission <|-- File : is a
     Submission <|-- Directory : is a
-```
+```
diff --git a/language-api/src/main/java/de/jplag/DefaultTokenEquivalenceModel.java b/language-api/src/main/java/de/jplag/DefaultTokenEquivalenceModel.java
@@ -0,0 +1,18 @@
+package de.jplag;
+
+/**
+ * The default token equivalence model that can be used by most languages. It assumes tokens are only equivalent if they
+ * have the same type and contain no additional data.
+ */
+public class DefaultTokenEquivalenceModel implements TokenEquivalenceModel {
+
+    @Override
+    public TokenType getPrimaryType(Token token) {
+        return token.getType();
+    }
+
+    @Override
+    public boolean arePrimaryEquivalent(int leftValue, int rightValue) {
+        return leftValue == rightValue;
+    }
+}
diff --git a/language-api/src/main/java/de/jplag/Language.java b/language-api/src/main/java/de/jplag/Language.java
@@ -147,6 +147,14 @@ default boolean requiresCoreNormalization() {
         return true;
     }
 
+    /**
+     * @return The token equivalence model to use for this language. Override this method if you need a custom token
+     * equivalence model.
+     */
+    default TokenEquivalenceModel getTokenEquivalenceModel() {
+        return new DefaultTokenEquivalenceModel();
+    }
+
     /**
      * @return True, if the language module can be used by the multi-language module
      */

diff --git a/language-api/src/main/java/de/jplag/Token.java b/language-api/src/main/java/de/jplag/Token.java
@@ -28,7 +28,8 @@ public class Token {
     private final int endColumn;
     private final File file;
     private final TokenType type;
-    private CodeSemantics semantics; // value null if no semantics
+    private CodeSemantics semantics; // value null if no semantics; maybe move into TokenType, since it describes the token itself, not
+                                     // its position?
 
     /**
      * Creates a token with column and length information.

diff --git a/language-api/src/main/java/de/jplag/TokenEquivalenceModel.java b/language-api/src/main/java/de/jplag/TokenEquivalenceModel.java
@@ -0,0 +1,49 @@
+package de.jplag;
+
+import java.util.List;
+
+/**
+ * Defines an interface for when tokens are considered equivalent. This is used to determine matches between tokens by
+ * using a two-step approach: First, the primary types of the tokens are compared using
+ * {@link #arePrimaryEquivalent(int, int)}. If they are considered equivalent, the secondary types are compared using
+ * {@link #areSecondaryEquivalent(TokenType, TokenType)}.
+ */
+public interface TokenEquivalenceModel {
+
+    /**
+     * Gets the primary {@link TokenType} of a token.
+     * @param token The token
+     * @return The primary type
+     */
+    TokenType getPrimaryType(Token token);
+
+    /**
+     * Ensures that the tokens have the correct type assigned. By default, this method does nothing and returns true.
+     * @param tokens The tokens
+     * @return True, if the types are ensured
+     */
+    default boolean ensureTokenType(List<Token> tokens) {
+        return true;
+    }
+
+    /**
+     * Determines whether two tokens are primary equivalent based on their int representation. Uses an int representation of
+     * the token types for performance reasons.
+     * @param leftValue the left token value
+     * @param rightValue the right token value
+     * @return True, if the primary token values are equivalent
+     */
+    boolean arePrimaryEquivalent(int leftValue, int rightValue);
+
+    /**
+     * Determines whether two tokens are secondary equivalent based on their TokenType representation. By default, this
+     * method returns true.
+     * @param leftType the left token type
+     * @param rightType the right token type
+     * @return True, if the secondary token types are equivalent
+     */
+    default boolean areSecondaryEquivalent(TokenType leftType, TokenType rightType) {
+        return true;
+    }
+
+}