Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 25 additions & 20 deletions cli/src/main/java/de/jplag/cli/JPlagOptionsBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import de.jplag.cli.picocli.CliInputHandler;
import de.jplag.clustering.ClusteringOptions;
import de.jplag.clustering.Preprocessing;
import de.jplag.frequency.FrequencyAnalysisOptions;
import de.jplag.merging.MergingOptions;
import de.jplag.options.JPlagOptions;

Expand Down Expand Up @@ -59,49 +60,53 @@ private JPlagOptions initializeJPlagOptions(Set<File> submissionDirectories, Set
throws CliException {
ClusteringOptions clusteringOptions = getClusteringOptions();
MergingOptions mergingOptions = getMergingOptions();
FrequencyAnalysisOptions frequencyAnalysisOptions = getFrequencyAnalysisOptions();

return new JPlagOptions(this.cliInputHandler.getSelectedLanguage(), this.cliOptions.minTokenMatch, submissionDirectories,
oldSubmissionDirectories, null, this.cliOptions.advanced.subdirectory, suffixes, this.cliOptions.advanced.exclusionFileName,
JPlagOptions.DEFAULT_SIMILARITY_METRIC, this.cliOptions.advanced.similarityThreshold, this.cliOptions.shownComparisons,
clusteringOptions, this.cliOptions.advanced.debug, mergingOptions, this.cliOptions.normalize,
this.cliOptions.advanced.analyzeComments, this.cliOptions.frequencyOptions.frequencyStrategy,
this.cliOptions.frequencyOptions.frequencyStrategyMinValue, this.cliOptions.frequencyOptions.weightingStrategy,
this.cliOptions.frequencyOptions.weightingFactor);
this.cliOptions.advanced.analyzeComments, frequencyAnalysisOptions);
}

private ClusteringOptions getClusteringOptions() {
ClusteringOptions clusteringOptions = new ClusteringOptions().withEnabled(!this.cliOptions.clustering.disable)
.withAlgorithm(this.cliOptions.clustering.enabled.algorithm).withSimilarityMetric(this.cliOptions.clustering.enabled.metric)
.withSpectralKernelBandwidth(this.cliOptions.clusterSpectralBandwidth)
.withSpectralGaussianProcessVariance(this.cliOptions.clusterSpectralNoise).withSpectralMinRuns(this.cliOptions.clusterSpectralMinRuns)
.withSpectralMaxRuns(this.cliOptions.clusterSpectralMaxRuns)
.withSpectralMaxKMeansIterationPerRun(this.cliOptions.clusterSpectralKMeansIterations)
.withAgglomerativeThreshold(this.cliOptions.clusterAgglomerativeThreshold)
.withAgglomerativeInterClusterSimilarity(this.cliOptions.clusterAgglomerativeInterClusterSimilarity);

if (this.cliOptions.clusterPreprocessingNone) {
ClusteringOptions clusteringOptions = new ClusteringOptions().withEnabled(!cliOptions.clustering.disable)
.withAlgorithm(cliOptions.clustering.enabled.algorithm).withSimilarityMetric(cliOptions.clustering.enabled.metric)
.withSpectralKernelBandwidth(cliOptions.clusterSpectralBandwidth).withSpectralGaussianProcessVariance(cliOptions.clusterSpectralNoise)
.withSpectralMinRuns(cliOptions.clusterSpectralMinRuns).withSpectralMaxRuns(cliOptions.clusterSpectralMaxRuns)
.withSpectralMaxKMeansIterationPerRun(cliOptions.clusterSpectralKMeansIterations)
.withAgglomerativeThreshold(cliOptions.clusterAgglomerativeThreshold)
.withAgglomerativeInterClusterSimilarity(cliOptions.clusterAgglomerativeInterClusterSimilarity);

if (cliOptions.clusterPreprocessingNone) {
clusteringOptions = clusteringOptions.withPreprocessor(Preprocessing.NONE);
}

if (this.cliOptions.clusterPreprocessingCdf) {
if (cliOptions.clusterPreprocessingCdf) {
clusteringOptions = clusteringOptions.withPreprocessor(Preprocessing.CUMULATIVE_DISTRIBUTION_FUNCTION);
}

if (this.cliOptions.clusterPreprocessingPercentile != 0) {
if (cliOptions.clusterPreprocessingPercentile != 0) {
clusteringOptions = clusteringOptions.withPreprocessor(Preprocessing.PERCENTILE)
.withPreprocessorPercentile(this.cliOptions.clusterPreprocessingPercentile);
.withPreprocessorPercentile(cliOptions.clusterPreprocessingPercentile);
}

if (this.cliOptions.clusterPreprocessingThreshold != 0) {
if (cliOptions.clusterPreprocessingThreshold != 0) {
clusteringOptions = clusteringOptions.withPreprocessor(Preprocessing.THRESHOLD)
.withPreprocessorThreshold(this.cliOptions.clusterPreprocessingThreshold);
.withPreprocessorThreshold(cliOptions.clusterPreprocessingThreshold);
}

return clusteringOptions;
}

/**
 * Builds the match merging options from the parsed CLI arguments.
 * @return the merging options reflecting the CLI configuration.
 */
private MergingOptions getMergingOptions() {
    return new MergingOptions(cliOptions.merging.enabled, cliOptions.merging.minimumNeighborLength, cliOptions.merging.maximumGapSize,
            cliOptions.merging.minimumRequiredMerges);
}

/**
 * Translates the frequency-related CLI arguments into the core frequency analysis options.
 * @return the frequency analysis options reflecting the CLI configuration.
 */
private FrequencyAnalysisOptions getFrequencyAnalysisOptions() {
    var frequency = cliOptions.frequencyOptions;
    return new FrequencyAnalysisOptions(frequency.enabled, frequency.frequencyStrategy, frequency.frequencyStrategyMinValue,
            frequency.weightingStrategy, frequency.weightingFactor);
}
}
17 changes: 12 additions & 5 deletions cli/src/main/java/de/jplag/cli/options/CliOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
import de.jplag.clustering.ClusteringAlgorithm;
import de.jplag.clustering.ClusteringOptions;
import de.jplag.clustering.algorithm.InterClusterSimilarity;
import de.jplag.highlightextraction.FrequencyAnalysisOptions;
import de.jplag.highlightextraction.FrequencyAnalysisStrategy;
import de.jplag.highlightextraction.MatchFrequencyWeightingFunction;
import de.jplag.frequency.FrequencyAnalysisOptions;
import de.jplag.frequency.FrequencyStrategy;
import de.jplag.frequency.MatchFrequencyWeightingFunction;
import de.jplag.java.JavaLanguage;
import de.jplag.merging.MergingOptions;
import de.jplag.options.JPlagOptions;
Expand Down Expand Up @@ -184,10 +184,17 @@ public static class ClusteringEnabled {
/** Frequency analysis options. */
public static class FrequencyAnalysis {

/**
* Enables frequency analysis to weigh matched code fragments according to their overall rarity.
*/
@Option(names = {
"--include-frequency"}, description = "Enables frequency analysis to weigh matched code fragments according to their overall rarity.")
public boolean enabled = FrequencyAnalysisOptions.DEFAULT_ENABLED;

/** Frequency Determination strategy. */
@Option(names = {
"--frequency-strategy"}, description = "Strategy for frequency Analysis, one of: ${COMPLETION-CANDIDATES} (default: ${DEFAULT_VALUE}).")
public FrequencyAnalysisStrategy frequencyStrategy = new FrequencyAnalysisOptions().frequencyStrategy();
"--frequency-strategy"}, description = "Strategy for frequency Analysis, one of: ${COMPLETION-CANDIDATES} (default: ${DEFAULT_VALUE}).", converter = FrequencyStrategyPicocliBindings.class, completionCandidates = FrequencyStrategyPicocliBindings.class, defaultValue = "complete")
public FrequencyStrategy frequencyStrategy = new FrequencyAnalysisOptions().frequencyStrategy();

/** Min value for considered subsequence length in Frequency Determination strategy. */
@Option(names = {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package de.jplag.cli.options;

import java.util.ArrayList;
import java.util.Map;

import de.jplag.frequency.CompleteMatchesStrategy;
import de.jplag.frequency.ContainedMatchesStrategy;
import de.jplag.frequency.FrequencyStrategy;
import de.jplag.frequency.SubMatchesStrategy;
import de.jplag.frequency.WindowOfMatchesStrategy;

import picocli.CommandLine;

/**
 * Picocli bindings for {@link FrequencyStrategy}. As a type converter, it maps a strategy name given on the command
 * line to the corresponding strategy instance. As a list of strings, it supplies the valid strategy names as
 * completion candidates.
 */
public class FrequencyStrategyPicocliBindings extends ArrayList<String> implements CommandLine.ITypeConverter<FrequencyStrategy> {
    private static final Map<String, FrequencyStrategy> STRATEGIES = Map.of("complete", new CompleteMatchesStrategy(), "contained",
            new ContainedMatchesStrategy(), "subMatches", new SubMatchesStrategy(), "windowOfMatches", new WindowOfMatchesStrategy());

    /**
     * Creates the bindings, populated with all valid strategy names in alphabetical order.
     */
    public FrequencyStrategyPicocliBindings() {
        // Map.of has no defined iteration order; sorting keeps the candidate list stable across runs.
        super(STRATEGIES.keySet().stream().sorted().toList());
    }

    /**
     * Resolves a strategy name to its strategy instance. Names are matched exactly (case-sensitive).
     * @param value the strategy name as given on the command line.
     * @return the strategy registered under the given name, never {@code null}.
     * @throws CommandLine.TypeConversionException if the name does not denote a known strategy.
     */
    @Override
    public FrequencyStrategy convert(String value) throws Exception {
        // Map.of rejects null keys with a NullPointerException, so guard the lookup.
        FrequencyStrategy strategy = value == null ? null : STRATEGIES.get(value);
        if (strategy == null) {
            // Report the problem as a conversion error instead of silently handing null to the option field.
            throw new CommandLine.TypeConversionException(
                    "unknown frequency strategy '" + value + "', expected one of: " + String.join(", ", this));
        }
        return strategy;
    }
}
9 changes: 5 additions & 4 deletions core/src/main/java/de/jplag/JPlag.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import de.jplag.exceptions.ExitException;
import de.jplag.exceptions.RootDirectoryException;
import de.jplag.exceptions.SubmissionException;
import de.jplag.highlightextraction.FrequencyMatchWeighter;
import de.jplag.frequency.FrequencyAnalysis;
import de.jplag.merging.MatchMerging;
import de.jplag.options.JPlagOptions;
import de.jplag.reporting.reportobject.model.Version;
Expand Down Expand Up @@ -99,13 +99,14 @@ public static JPlagResult run(JPlagOptions options) throws ExitException {
result = new MatchMerging(options).mergeMatchesOf(result);
}

FrequencyMatchWeighter matchWeighter = new FrequencyMatchWeighter();
List<JPlagComparison> frequencyWeightedComparisons = matchWeighter.useMatchFrequencyToInfluenceSimilarity(options, result);
if (options.frequencyAnalysisOptions().enabled()) {
result = FrequencyAnalysis.applyFrequencyWeighting(result, options.frequencyAnalysisOptions(), options.minimumTokenMatch());
}

if (logger.isInfoEnabled()) {
logger.info("Total time for comparing submissions: {}", TimeUtil.formatDuration(result.getDuration()));
}
result.setClusteringResult(ClusteringFactory.getClusterings(frequencyWeightedComparisons, options.clusteringOptions()));
result.setClusteringResult(ClusteringFactory.getClusterings(result.getAllComparisons(), options.clusteringOptions()));

logSkippedSubmissions(submissionSet, options);

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package de.jplag.highlightextraction;
package de.jplag.frequency;

import java.util.List;
import java.util.Map;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package de.jplag.highlightextraction;
package de.jplag.frequency;

import static de.jplag.highlightextraction.SubSequenceUtil.getSubSequences;
import static de.jplag.frequency.SubSequenceUtil.getSubSequences;

import java.util.ArrayList;
import java.util.List;
Expand Down
45 changes: 45 additions & 0 deletions core/src/main/java/de/jplag/frequency/FrequencyAnalysis.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package de.jplag.frequency;

import java.util.List;
import java.util.Map;

import de.jplag.JPlagComparison;
import de.jplag.JPlagResult;
import de.jplag.TokenType;

/**
 * Entry point of the frequency analysis. Determines how often matched token sequences occur across all comparisons and
 * re-weighs every comparison accordingly, based on the configured frequency strategy and weighting function.
 */
public final class FrequencyAnalysis {

    private FrequencyAnalysis() {
        throw new IllegalStateException(); // utility class, must not be instantiated
    }

    /**
     * Computes the frequency of all matched token sequences and re-weighs the comparisons based on it.
     * @param result is the JPlag result whose comparisons are re-weighed.
     * @param options are the frequency analysis options.
     * @param minimumTokenMatch is the minimum token match value; it acts as a lower bound for the minimum value passed to
     * the frequency determination.
     * @return a new result containing the re-weighed comparisons; submissions, duration and options are taken over from
     * the given result.
     */
    public static JPlagResult applyFrequencyWeighting(JPlagResult result, FrequencyAnalysisOptions options, int minimumTokenMatch) {
        List<JPlagComparison> comparisons = result.getAllComparisons();
        FrequencyStrategy strategy = options.frequencyStrategy();

        // Step 1: count how often each token sequence occurs.
        int strategyMinimum = Math.max(options.frequencyStrategyMinValue(), minimumTokenMatch);
        Map<List<TokenType>, Integer> sequenceFrequencies = new FrequencyDetermination(strategy, strategyMinimum).buildFrequencyMap(comparisons);

        // Step 2: derive a frequency value for each match from the sequence counts.
        Map<List<TokenType>, Double> matchFrequencies = new MatchFrequencyEvaluator(strategy, sequenceFrequencies)
                .computeMatchFrequencies(comparisons);

        // Step 3: re-weigh every comparison based on the match frequencies.
        MatchFrequencyWeighting weighting = new MatchFrequencyWeighting(comparisons, options.weightingStrategy(), matchFrequencies);
        List<JPlagComparison> weightedComparisons = comparisons.stream()
                .map(comparison -> weighting.weightedComparisonSimilarity(comparison, options.weightingFactor())).toList();

        return new JPlagResult(weightedComparisons, result.getSubmissions(), result.getDuration(), result.getOptions());
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package de.jplag.frequency;

import io.soabase.recordbuilder.core.RecordBuilder;

/**
 * Options for the frequency analysis.
 * @param enabled specifies if the analysis is enabled.
 * @param frequencyStrategy the strategy used to determine the frequency of a match.
 * @param frequencyStrategyMinValue the minimum considered size of subsequences from matches in the frequency strategy.
 * @param weightingStrategy the strategy used to influence the similarity based on match frequency.
 * @param weightingFactor how strong the impact of the weighting strategy is.
 */
@RecordBuilder
public record FrequencyAnalysisOptions(boolean enabled, FrequencyStrategy frequencyStrategy, int frequencyStrategyMinValue,
        MatchFrequencyWeightingFunction weightingStrategy, double weightingFactor) implements FrequencyAnalysisOptionsBuilder.With {

    /** Default value for the analysis being enabled. */
    public static final boolean DEFAULT_ENABLED = false;

    /**
     * Creates the default options for the frequency analysis: enabled as specified by {@link #DEFAULT_ENABLED}, complete
     * matches strategy, a minimum value of 1, sigmoid weighting and a weighting factor of 0.25.
     */
    public FrequencyAnalysisOptions() {
        // Use the public constant so that this constructor and the CLI default cannot drift apart.
        this(DEFAULT_ENABLED, new CompleteMatchesStrategy(), 1, MatchFrequencyWeightingFunction.SIGMOID, 0.25);
    }
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package de.jplag.highlightextraction;
package de.jplag.frequency;

import java.util.HashMap;
import java.util.List;
Expand Down Expand Up @@ -32,9 +32,10 @@ public FrequencyDetermination(FrequencyStrategy frequencyStrategy, int strategyN
/**
* Builds the frequency map by applying the strategy method on all matches found in the given list of comparisons.
* @param comparisons contains information of matches between two submissions.
* @return the mapping between token sequences and number of occurrences.
* @throws IllegalArgumentException if match indices are out of range.
*/
public void buildFrequencyMap(List<JPlagComparison> comparisons) {
public Map<List<TokenType>, Integer> buildFrequencyMap(List<JPlagComparison> comparisons) {
for (JPlagComparison comparison : comparisons) {
Submission leftSubmission = comparison.firstSubmission();
List<Token> submissionTokens = leftSubmission.getTokenList();
Expand All @@ -45,6 +46,7 @@ public void buildFrequencyMap(List<JPlagComparison> comparisons) {
frequencyStrategy.processMatchTokenTypes(matchTokenTypes, this::addSequenceKey, this::addSequence, strategyNumber);
}
}
return matchFrequencyMap;
}

/**
Expand All @@ -55,13 +57,6 @@ private void addSequenceKey(List<TokenType> sequence) {
matchFrequencyMap.putIfAbsent(sequence, 0);
}

/**
 * Provides access to the frequency map. The map is returned as is, not as a copy.
 * @return Map containing (sub-)matches and their frequency according to the strategy.
 */
public Map<List<TokenType>, Integer> getMatchFrequencyMap() {
return matchFrequencyMap;
}

/**
* Updates the frequency of the given sequence in the frequency map.
* @param sequence The token sequence whose frequency will be updated.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package de.jplag.highlightextraction;
package de.jplag.frequency;

import java.util.List;
import java.util.Map;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package de.jplag.highlightextraction;
package de.jplag.frequency;

import java.util.List;

Expand Down
61 changes: 61 additions & 0 deletions core/src/main/java/de/jplag/frequency/MatchFrequencyEvaluator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
package de.jplag.frequency;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import de.jplag.JPlagComparison;
import de.jplag.Match;
import de.jplag.Token;
import de.jplag.TokenType;

/**
 * Derives a frequency value for each match from a previously built token sequence frequency map. Results are
 * accumulated in a single map over all calls on the same instance.
 */
public class MatchFrequencyEvaluator {
    private final FrequencyStrategy strategy;
    private final Map<List<TokenType>, Integer> frequencyMap;
    private final Map<List<TokenType>, Double> matchFrequency;

    /**
     * Creates an evaluator for the given strategy and frequency map, starting with an empty result map.
     * @param strategy is the strategy used to determine the frequency of a match.
     * @param frequencyMap is the frequency map that is handed to the strategy for every match.
     */
    public MatchFrequencyEvaluator(FrequencyStrategy strategy, Map<List<TokenType>, Integer> frequencyMap) {
        this.strategy = strategy;
        this.frequencyMap = frequencyMap;
        this.matchFrequency = new HashMap<>();
    }

    /**
     * Determines the frequency value of each given match via the frequency strategy and stores it under the token type
     * sequence of the match. An existing entry for the same token type sequence is overwritten.
     * @param matches are the matches to determine the frequency for.
     * @param tokenSequence is the token type sequence the matches refer to.
     * @return the accumulated mapping from match token type sequences to their frequency values.
     */
    public Map<List<TokenType>, Double> computeMatchFrequencies(List<Match> matches, List<TokenType> tokenSequence) {
        matches.forEach(match -> {
            List<TokenType> matchedTypes = FrequencyUtil.matchesToMatchTokenTypes(match, tokenSequence);
            double frequency = strategy.calculateMatchFrequency(match, frequencyMap, matchedTypes);
            matchFrequency.put(matchedTypes, frequency);
        });
        return matchFrequency;
    }

    /**
     * Determines the frequency values for the matches of all given comparisons. For each comparison, the token list of
     * its first submission serves as the token sequence.
     * @param comparisons is the list of comparisons to consider.
     * @return the accumulated mapping from match token type sequences to their frequency values.
     */
    public Map<List<TokenType>, Double> computeMatchFrequencies(List<JPlagComparison> comparisons) {
        for (JPlagComparison comparison : comparisons) {
            // TODO this might break with match merging
            List<TokenType> leftTokenTypes = comparison.firstSubmission().getTokenList().stream().map(Token::getType).toList();
            computeMatchFrequencies(comparison.matches(), leftTokenTypes);
        }
        return matchFrequency;
    }

}
Loading
Loading