diff --git a/cli/src/main/java/de/jplag/cli/JPlagOptionsBuilder.java b/cli/src/main/java/de/jplag/cli/JPlagOptionsBuilder.java index c2dbde4cc5..be68a8e574 100644 --- a/cli/src/main/java/de/jplag/cli/JPlagOptionsBuilder.java +++ b/cli/src/main/java/de/jplag/cli/JPlagOptionsBuilder.java @@ -12,6 +12,7 @@ import de.jplag.cli.picocli.CliInputHandler; import de.jplag.clustering.ClusteringOptions; import de.jplag.clustering.Preprocessing; +import de.jplag.frequency.FrequencyAnalysisOptions; import de.jplag.merging.MergingOptions; import de.jplag.options.JPlagOptions; @@ -59,49 +60,53 @@ private JPlagOptions initializeJPlagOptions(Set submissionDirectories, Set throws CliException { ClusteringOptions clusteringOptions = getClusteringOptions(); MergingOptions mergingOptions = getMergingOptions(); + FrequencyAnalysisOptions frequencyAnalysisOptions = getFrequencyAnalysisOptions(); return new JPlagOptions(this.cliInputHandler.getSelectedLanguage(), this.cliOptions.minTokenMatch, submissionDirectories, oldSubmissionDirectories, null, this.cliOptions.advanced.subdirectory, suffixes, this.cliOptions.advanced.exclusionFileName, JPlagOptions.DEFAULT_SIMILARITY_METRIC, this.cliOptions.advanced.similarityThreshold, this.cliOptions.shownComparisons, clusteringOptions, this.cliOptions.advanced.debug, mergingOptions, this.cliOptions.normalize, - this.cliOptions.advanced.analyzeComments, this.cliOptions.frequencyOptions.frequencyStrategy, - this.cliOptions.frequencyOptions.frequencyStrategyMinValue, this.cliOptions.frequencyOptions.weightingStrategy, - this.cliOptions.frequencyOptions.weightingFactor); + this.cliOptions.advanced.analyzeComments, frequencyAnalysisOptions); } private ClusteringOptions getClusteringOptions() { - ClusteringOptions clusteringOptions = new ClusteringOptions().withEnabled(!this.cliOptions.clustering.disable) - .withAlgorithm(this.cliOptions.clustering.enabled.algorithm).withSimilarityMetric(this.cliOptions.clustering.enabled.metric) - .withSpectralKernelBandwidth(this.cliOptions.clusterSpectralBandwidth) - .withSpectralGaussianProcessVariance(this.cliOptions.clusterSpectralNoise).withSpectralMinRuns(this.cliOptions.clusterSpectralMinRuns) - .withSpectralMaxRuns(this.cliOptions.clusterSpectralMaxRuns) - .withSpectralMaxKMeansIterationPerRun(this.cliOptions.clusterSpectralKMeansIterations) - .withAgglomerativeThreshold(this.cliOptions.clusterAgglomerativeThreshold) - .withAgglomerativeInterClusterSimilarity(this.cliOptions.clusterAgglomerativeInterClusterSimilarity); - - if (this.cliOptions.clusterPreprocessingNone) { + ClusteringOptions clusteringOptions = new ClusteringOptions().withEnabled(!cliOptions.clustering.disable) + .withAlgorithm(cliOptions.clustering.enabled.algorithm).withSimilarityMetric(cliOptions.clustering.enabled.metric) + .withSpectralKernelBandwidth(cliOptions.clusterSpectralBandwidth).withSpectralGaussianProcessVariance(cliOptions.clusterSpectralNoise) + .withSpectralMinRuns(cliOptions.clusterSpectralMinRuns).withSpectralMaxRuns(cliOptions.clusterSpectralMaxRuns) + .withSpectralMaxKMeansIterationPerRun(cliOptions.clusterSpectralKMeansIterations) + .withAgglomerativeThreshold(cliOptions.clusterAgglomerativeThreshold) + .withAgglomerativeInterClusterSimilarity(cliOptions.clusterAgglomerativeInterClusterSimilarity); + + if (cliOptions.clusterPreprocessingNone) { clusteringOptions = clusteringOptions.withPreprocessor(Preprocessing.NONE); } - if (this.cliOptions.clusterPreprocessingCdf) { + if (cliOptions.clusterPreprocessingCdf) { clusteringOptions = clusteringOptions.withPreprocessor(Preprocessing.CUMULATIVE_DISTRIBUTION_FUNCTION); } - if (this.cliOptions.clusterPreprocessingPercentile != 0) { + if (cliOptions.clusterPreprocessingPercentile != 0) { clusteringOptions = clusteringOptions.withPreprocessor(Preprocessing.PERCENTILE) - .withPreprocessorPercentile(this.cliOptions.clusterPreprocessingPercentile); + .withPreprocessorPercentile(cliOptions.clusterPreprocessingPercentile); } - if (this.cliOptions.clusterPreprocessingThreshold != 0) { + if (cliOptions.clusterPreprocessingThreshold != 0) { clusteringOptions = clusteringOptions.withPreprocessor(Preprocessing.THRESHOLD) - .withPreprocessorThreshold(this.cliOptions.clusterPreprocessingThreshold); + .withPreprocessorThreshold(cliOptions.clusterPreprocessingThreshold); } return clusteringOptions; } private MergingOptions getMergingOptions() { - return new MergingOptions(this.cliOptions.merging.enabled, this.cliOptions.merging.minimumNeighborLength, - this.cliOptions.merging.maximumGapSize, this.cliOptions.merging.minimumRequiredMerges); + return new MergingOptions(cliOptions.merging.enabled, cliOptions.merging.minimumNeighborLength, cliOptions.merging.maximumGapSize, + cliOptions.merging.minimumRequiredMerges); + } + + private FrequencyAnalysisOptions getFrequencyAnalysisOptions() { + return new FrequencyAnalysisOptions(cliOptions.frequencyOptions.enabled, cliOptions.frequencyOptions.frequencyStrategy, + cliOptions.frequencyOptions.frequencyStrategyMinValue, cliOptions.frequencyOptions.weightingStrategy, + cliOptions.frequencyOptions.weightingFactor); } } diff --git a/cli/src/main/java/de/jplag/cli/options/CliOptions.java b/cli/src/main/java/de/jplag/cli/options/CliOptions.java index 7819ffb730..4fe90de60f 100644 --- a/cli/src/main/java/de/jplag/cli/options/CliOptions.java +++ b/cli/src/main/java/de/jplag/cli/options/CliOptions.java @@ -9,9 +9,9 @@ import de.jplag.clustering.ClusteringAlgorithm; import de.jplag.clustering.ClusteringOptions; import de.jplag.clustering.algorithm.InterClusterSimilarity; -import de.jplag.highlightextraction.FrequencyAnalysisOptions; -import de.jplag.highlightextraction.FrequencyAnalysisStrategy; -import de.jplag.highlightextraction.MatchFrequencyWeightingFunction; +import de.jplag.frequency.FrequencyAnalysisOptions; +import de.jplag.frequency.FrequencyStrategy; +import de.jplag.frequency.MatchFrequencyWeightingFunction; import de.jplag.java.JavaLanguage; import de.jplag.merging.MergingOptions; import de.jplag.options.JPlagOptions; @@ -184,10 +184,17 @@ public static class ClusteringEnabled { /** Highlight extraction options. */ public static class FrequencyAnalysis { + /** + * Enables frequency analysis to weigh matched code fragments according to their overall rarity. + */ + @Option(names = { + "--include-frequency"}, description = "Enables frequency analysis to weigh matched code fragments according to their overall rarity.") + public boolean enabled = FrequencyAnalysisOptions.DEFAULT_ENABLED; + /** Frequency Determination strategy. */ @Option(names = { - "--frequency-strategy"}, description = "Strategy for frequency Analysis, one of: ${COMPLETION-CANDIDATES} (default: ${DEFAULT_VALUE}).") - public FrequencyAnalysisStrategy frequencyStrategy = new FrequencyAnalysisOptions().frequencyStrategy(); + "--frequency-strategy"}, description = "Strategy for frequency Analysis, one of: ${COMPLETION-CANDIDATES} (default: ${DEFAULT_VALUE}).", converter = FrequencyStrategyPicocliBindings.class, completionCandidates = FrequencyStrategyPicocliBindings.class, defaultValue = "complete") + public FrequencyStrategy frequencyStrategy = new FrequencyAnalysisOptions().frequencyStrategy(); /** Min value for considered subsequence length in Frequency Determination strategy. */ @Option(names = { diff --git a/cli/src/main/java/de/jplag/cli/options/FrequencyStrategyPicocliBindings.java b/cli/src/main/java/de/jplag/cli/options/FrequencyStrategyPicocliBindings.java new file mode 100644 index 0000000000..f40675195d --- /dev/null +++ b/cli/src/main/java/de/jplag/cli/options/FrequencyStrategyPicocliBindings.java @@ -0,0 +1,26 @@ +package de.jplag.cli.options; + +import java.util.ArrayList; +import java.util.Map; + +import de.jplag.frequency.CompleteMatchesStrategy; +import de.jplag.frequency.ContainedMatchesStrategy; +import de.jplag.frequency.FrequencyStrategy; +import de.jplag.frequency.SubMatchesStrategy; +import de.jplag.frequency.WindowOfMatchesStrategy; + +import picocli.CommandLine; + +public class FrequencyStrategyPicocliBindings extends ArrayList implements CommandLine.ITypeConverter { + private final static Map STRATEGIES = Map.of("complete", new CompleteMatchesStrategy(), "contained", + new ContainedMatchesStrategy(), "subMatches", new SubMatchesStrategy(), "windowOfMatches", new WindowOfMatchesStrategy()); + + public FrequencyStrategyPicocliBindings() { + super(STRATEGIES.keySet()); + } + + @Override + public FrequencyStrategy convert(String value) throws Exception { + return STRATEGIES.get(value); + } +} diff --git a/core/src/main/java/de/jplag/JPlag.java b/core/src/main/java/de/jplag/JPlag.java index 0105554df3..0eed67b169 100644 --- a/core/src/main/java/de/jplag/JPlag.java +++ b/core/src/main/java/de/jplag/JPlag.java @@ -15,7 +15,7 @@ import de.jplag.exceptions.ExitException; import de.jplag.exceptions.RootDirectoryException; import de.jplag.exceptions.SubmissionException; -import de.jplag.highlightextraction.FrequencyMatchWeighter; +import de.jplag.frequency.FrequencyAnalysis; import de.jplag.merging.MatchMerging; import de.jplag.options.JPlagOptions; import de.jplag.reporting.reportobject.model.Version; @@ -99,13 +99,14 @@ public static JPlagResult run(JPlagOptions options) throws ExitException { result = new MatchMerging(options).mergeMatchesOf(result); } - FrequencyMatchWeighter matchWeighter = new FrequencyMatchWeighter(); - List frequencyWeightedComparisons = matchWeighter.useMatchFrequencyToInfluenceSimilarity(options, result); + if (options.frequencyAnalysisOptions().enabled()) { + result = FrequencyAnalysis.applyFrequencyWeighting(result, options.frequencyAnalysisOptions(), options.minimumTokenMatch()); + } if (logger.isInfoEnabled()) { logger.info("Total time for comparing submissions: {}", TimeUtil.formatDuration(result.getDuration())); } - result.setClusteringResult(ClusteringFactory.getClusterings(frequencyWeightedComparisons, options.clusteringOptions())); + result.setClusteringResult(ClusteringFactory.getClusterings(result.getAllComparisons(), options.clusteringOptions())); logSkippedSubmissions(submissionSet, options); diff --git a/core/src/main/java/de/jplag/highlightextraction/CompleteMatchesStrategy.java b/core/src/main/java/de/jplag/frequency/CompleteMatchesStrategy.java similarity index 97% rename from core/src/main/java/de/jplag/highlightextraction/CompleteMatchesStrategy.java rename to core/src/main/java/de/jplag/frequency/CompleteMatchesStrategy.java index f74175d3a4..2ddbce56ce 100644 --- a/core/src/main/java/de/jplag/highlightextraction/CompleteMatchesStrategy.java +++ b/core/src/main/java/de/jplag/frequency/CompleteMatchesStrategy.java @@ -1,4 +1,4 @@ -package de.jplag.highlightextraction; +package de.jplag.frequency; import java.util.List; import java.util.Map; diff --git a/core/src/main/java/de/jplag/highlightextraction/ContainedMatchesStrategy.java b/core/src/main/java/de/jplag/frequency/ContainedMatchesStrategy.java similarity index 95% rename from core/src/main/java/de/jplag/highlightextraction/ContainedMatchesStrategy.java rename to core/src/main/java/de/jplag/frequency/ContainedMatchesStrategy.java index 9df8e475ae..5e3d6f9673 100644 --- a/core/src/main/java/de/jplag/highlightextraction/ContainedMatchesStrategy.java +++ b/core/src/main/java/de/jplag/frequency/ContainedMatchesStrategy.java @@ -1,6 +1,6 @@ -package de.jplag.highlightextraction; +package de.jplag.frequency; -import static de.jplag.highlightextraction.SubSequenceUtil.getSubSequences; +import static de.jplag.frequency.SubSequenceUtil.getSubSequences; import java.util.ArrayList; import java.util.List; diff --git a/core/src/main/java/de/jplag/frequency/FrequencyAnalysis.java b/core/src/main/java/de/jplag/frequency/FrequencyAnalysis.java new file mode 100644 index 0000000000..bccee37fc5 --- /dev/null +++ b/core/src/main/java/de/jplag/frequency/FrequencyAnalysis.java @@ -0,0 +1,45 @@ +package de.jplag.frequency; + +import java.util.List; +import java.util.Map; + +import de.jplag.JPlagComparison; +import de.jplag.JPlagResult; +import de.jplag.TokenType; + +/** + * Contains the logic of the frequency based weighting of the Matches in all Comparisons, influencing the similarity + * between two comparisons according to the FrequencyStrategy and Similarity strategy. + */ +public final class FrequencyAnalysis { + + private FrequencyAnalysis() { + throw new IllegalStateException(); // private constructor for non-instantiability + } + + /** + * Calculates the rarity of all matched token sequences and weighs matches accordingly. + * @param result are the JPlag results to re-weigh according to frequency of matched section. + * @param options are the frequency analysis options. + * @param minimumTokenMatch is the minimum token match value. + * @return the modified result with re-weighed matches. + */ + public static JPlagResult applyFrequencyWeighting(JPlagResult result, FrequencyAnalysisOptions options, int minimumTokenMatch) { + + // Compute absolute token sequence frequency: + FrequencyDetermination frequencyDetermination = new FrequencyDetermination(options.frequencyStrategy(), + Math.max(options.frequencyStrategyMinValue(), minimumTokenMatch)); + Map, Integer> tokenSequenceFrequencies = frequencyDetermination.buildFrequencyMap(result.getAllComparisons()); + + // Compute absolute match sequence frequency: + MatchFrequencyEvaluator frequencyEvaluator = new MatchFrequencyEvaluator(options.frequencyStrategy(), tokenSequenceFrequencies); + Map, Double> matchFrequencies = frequencyEvaluator.computeMatchFrequencies(result.getAllComparisons()); + + // Weigh matches based on frequency: + MatchFrequencyWeighting weighting = new MatchFrequencyWeighting(result.getAllComparisons(), options.weightingStrategy(), matchFrequencies); + List convertedComparisons = result.getAllComparisons().stream() + .map(comparison -> weighting.weightedComparisonSimilarity(comparison, options.weightingFactor())).toList(); + return new JPlagResult(convertedComparisons, result.getSubmissions(), result.getDuration(), result.getOptions()); + } + +} diff --git a/core/src/main/java/de/jplag/frequency/FrequencyAnalysisOptions.java b/core/src/main/java/de/jplag/frequency/FrequencyAnalysisOptions.java new file mode 100644 index 0000000000..2be4c82c4d --- /dev/null +++ b/core/src/main/java/de/jplag/frequency/FrequencyAnalysisOptions.java @@ -0,0 +1,26 @@ +package de.jplag.frequency; + +import io.soabase.recordbuilder.core.RecordBuilder; + +/** + * Options for Frequency Analysis. + * @param enabled specifies if the analysis is enabled. + * @param frequencyStrategy the strategy used to determine the frequency of a Match + * @param frequencyStrategyMinValue the minimum considered size of Subsequences from matches in the frequencyStrategy + * @param weightingStrategy strategy used to influence the similarity based on Match frequency + * @param weightingFactor how strong the impact of the weightingStrategy is + */ +@RecordBuilder +public record FrequencyAnalysisOptions(boolean enabled, FrequencyStrategy frequencyStrategy, int frequencyStrategyMinValue, + MatchFrequencyWeightingFunction weightingStrategy, double weightingFactor) implements FrequencyAnalysisOptionsBuilder.With { + + /** default value for the analysis being enabled. **/ + public static final boolean DEFAULT_ENABLED = false; + + /** + * Default options for frequency Analysis. + */ + public FrequencyAnalysisOptions() { + this(false, new CompleteMatchesStrategy(), 1, MatchFrequencyWeightingFunction.SIGMOID, 0.25); + } +} diff --git a/core/src/main/java/de/jplag/highlightextraction/FrequencyDetermination.java b/core/src/main/java/de/jplag/frequency/FrequencyDetermination.java similarity index 89% rename from core/src/main/java/de/jplag/highlightextraction/FrequencyDetermination.java rename to core/src/main/java/de/jplag/frequency/FrequencyDetermination.java index 746d58b6b6..48cbe8ae35 100644 --- a/core/src/main/java/de/jplag/highlightextraction/FrequencyDetermination.java +++ b/core/src/main/java/de/jplag/frequency/FrequencyDetermination.java @@ -1,4 +1,4 @@ -package de.jplag.highlightextraction; +package de.jplag.frequency; import java.util.HashMap; import java.util.List; @@ -32,9 +32,10 @@ public FrequencyDetermination(FrequencyStrategy frequencyStrategy, int strategyN /** * Builds the frequency map by applying the strategy method on all matches found in the given list of comparisons. * @param comparisons contains information of matches between two submissions. + * @return the mapping between token sequences and number of occurrences. * @throws IllegalArgumentException if match indices are out of range. */ - public void buildFrequencyMap(List comparisons) { + public Map, Integer> buildFrequencyMap(List comparisons) { for (JPlagComparison comparison : comparisons) { Submission leftSubmission = comparison.firstSubmission(); List submissionTokens = leftSubmission.getTokenList(); @@ -45,6 +46,7 @@ public void buildFrequencyMap(List comparisons) { frequencyStrategy.processMatchTokenTypes(matchTokenTypes, this::addSequenceKey, this::addSequence, strategyNumber); } } + return matchFrequencyMap; } /** @@ -55,13 +57,6 @@ private void addSequenceKey(List sequence) { matchFrequencyMap.putIfAbsent(sequence, 0); } - /** - * @return Map containing (sub-)matches and their frequency according to the strategy. - */ - public Map, Integer> getMatchFrequencyMap() { - return matchFrequencyMap; - } - /** * Updates the frequency of the given sequence in the frequency map. * @param sequence The token sequence whose frequency will be updated. diff --git a/core/src/main/java/de/jplag/highlightextraction/FrequencyStrategy.java b/core/src/main/java/de/jplag/frequency/FrequencyStrategy.java similarity index 97% rename from core/src/main/java/de/jplag/highlightextraction/FrequencyStrategy.java rename to core/src/main/java/de/jplag/frequency/FrequencyStrategy.java index 006390d810..a1cee38d15 100644 --- a/core/src/main/java/de/jplag/highlightextraction/FrequencyStrategy.java +++ b/core/src/main/java/de/jplag/frequency/FrequencyStrategy.java @@ -1,4 +1,4 @@ -package de.jplag.highlightextraction; +package de.jplag.frequency; import java.util.List; import java.util.Map; diff --git a/core/src/main/java/de/jplag/highlightextraction/FrequencyUtil.java b/core/src/main/java/de/jplag/frequency/FrequencyUtil.java similarity index 95% rename from core/src/main/java/de/jplag/highlightextraction/FrequencyUtil.java rename to core/src/main/java/de/jplag/frequency/FrequencyUtil.java index 1ee80781db..40a50fc63b 100644 --- a/core/src/main/java/de/jplag/highlightextraction/FrequencyUtil.java +++ b/core/src/main/java/de/jplag/frequency/FrequencyUtil.java @@ -1,4 +1,4 @@ -package de.jplag.highlightextraction; +package de.jplag.frequency; import java.util.List; diff --git a/core/src/main/java/de/jplag/frequency/MatchFrequencyEvaluator.java b/core/src/main/java/de/jplag/frequency/MatchFrequencyEvaluator.java new file mode 100644 index 0000000000..2349477b62 --- /dev/null +++ b/core/src/main/java/de/jplag/frequency/MatchFrequencyEvaluator.java @@ -0,0 +1,61 @@ +package de.jplag.frequency; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import de.jplag.JPlagComparison; +import de.jplag.Match; +import de.jplag.Token; +import de.jplag.TokenType; + +/** + * Calculates absolute frequencies for token sequences of matches. + */ +public class MatchFrequencyEvaluator { + private final FrequencyStrategy strategy; + private final Map, Integer> frequencyMap; + private final Map, Double> matchFrequency; + + /** + * Constructor defining the used frequency strategy and frequency map. + * @param strategy chosen to determine the frequency of a match + * @param frequencyMap build frequencyMap based on the strategy + */ + public MatchFrequencyEvaluator(FrequencyStrategy strategy, Map, Integer> frequencyMap) { + this.strategy = strategy; + this.frequencyMap = frequencyMap; + this.matchFrequency = new HashMap<>(); + } + + /** + * Calculates absolute frequency value for all matches. Depending on the frequency strategy, this might either be just + * the occurrences of the match token sequence or also average occurrences of sub sequences. + * @param matches the matches to determine the frequency for + * @param tokenSequence token sequence of the comparison + * @return the frequency of the match + */ + public Map, Double> computeMatchFrequencies(List matches, List tokenSequence) { + for (Match match : matches) { + List matchTokens = FrequencyUtil.matchesToMatchTokenTypes(match, tokenSequence); + double absoluteFrequency = strategy.calculateMatchFrequency(match, frequencyMap, matchTokens); + matchFrequency.put(matchTokens, absoluteFrequency); + } + return matchFrequency; + } + + /** + * Calculates absolute frequency values for all matches of all comparisons. + * @param comparisons list of comparisons to consider. + * @return the frequency values. + */ + public Map, Double> computeMatchFrequencies(List comparisons) { + for (JPlagComparison comparison : comparisons) { + List tokenSequence = comparison.firstSubmission().getTokenList(); // TODO this might break with match merging + List firstSubmissionTokenTypes = tokenSequence.stream().map(Token::getType).toList(); + computeMatchFrequencies(comparison.matches(), firstSubmissionTokenTypes); + } + return matchFrequency; + } + +} diff --git a/core/src/main/java/de/jplag/highlightextraction/MatchFrequencyWeighting.java b/core/src/main/java/de/jplag/frequency/MatchFrequencyWeighting.java similarity index 91% rename from core/src/main/java/de/jplag/highlightextraction/MatchFrequencyWeighting.java rename to core/src/main/java/de/jplag/frequency/MatchFrequencyWeighting.java index 9a0919181f..4fae0d30eb 100644 --- a/core/src/main/java/de/jplag/highlightextraction/MatchFrequencyWeighting.java +++ b/core/src/main/java/de/jplag/frequency/MatchFrequencyWeighting.java @@ -1,6 +1,7 @@ -package de.jplag.highlightextraction; +package de.jplag.frequency; import java.util.List; +import java.util.Map; import de.jplag.JPlagComparison; import de.jplag.Match; @@ -20,7 +21,7 @@ public class MatchFrequencyWeighting { * Chosen weighting function. */ private final MatchWeightingFunction strategy; - private final MatchFrequency matchFrequency; + private final Map, Double> tokenSequenceFrequencies; private static final double MINIMUM_PROPORTIONAL_WEIGHT = 0.01; private static final double MINIMUM_WEIGHT = 1.0; private static final double MAXIMUM_WEIGHT = 2.0; @@ -31,30 +32,30 @@ public class MatchFrequencyWeighting { * Constructor defines comparisons and strategy for the similarity calculation. * @param comparisons considered comparisons to calculate the similarity score for * @param strategy chosen weighting function - * @param matchFrequency the matchFrequency containing the map that maps a match to its frequency + * @param tokenSequenceFrequencies the matchFrequency containing the map that maps a match to its frequency */ - public MatchFrequencyWeighting(List comparisons, MatchWeightingFunction strategy, MatchFrequency matchFrequency) { + public MatchFrequencyWeighting(List comparisons, MatchWeightingFunction strategy, + Map, Double> tokenSequenceFrequencies) { this.comparisons = comparisons; this.strategy = strategy; - this.matchFrequency = matchFrequency; + this.tokenSequenceFrequencies = tokenSequenceFrequencies; } /** * Calculates the similarity score for a comparison. * @param comparison considered comparison to calculate the similarity score for * @param weightingFactor weighting factor, is factor for the (max) influence of the frequency - * @param isFrequencyAnalysisEnabled if the frequency shall be considered * @return similarity of the comparison */ - public JPlagComparison weightedComparisonSimilarity(JPlagComparison comparison, double weightingFactor, boolean isFrequencyAnalysisEnabled) { + public JPlagComparison weightedComparisonSimilarity(JPlagComparison comparison, double weightingFactor) { double frequencyWeightedSimilarity = frequencySimilarity(comparison, weightingFactor); - return new JPlagComparison(comparison, frequencyWeightedSimilarity, isFrequencyAnalysisEnabled); + return new JPlagComparison(comparison, frequencyWeightedSimilarity, true); } private double getFrequencyFromMap(JPlagComparison comparison, Match match) { List submissionTokenTypes = comparison.firstSubmission().getTokenList().stream().map(Token::getType).toList(); List matchTokens = FrequencyUtil.matchesToMatchTokenTypes(match, submissionTokenTypes); - return matchFrequency.matchFrequencyMap().getOrDefault(matchTokens, DEFAULT_MINIMUM_FREQUENCY); + return tokenSequenceFrequencies.getOrDefault(matchTokens, DEFAULT_MINIMUM_FREQUENCY); } /** @@ -137,10 +138,10 @@ public int getWeightedMatchLength(JPlagComparison comparison, double frequencyWe * @return this frequency */ private double getMaximumFoundFrequency(double maximumFoundFrequency) { - if (matchFrequency.matchFrequencyMap().isEmpty()) { + if (tokenSequenceFrequencies.isEmpty()) { maximumFoundFrequency = DEFAULT_MAXIMUM_FREQUENCY; } else { - for (double frequency : matchFrequency.matchFrequencyMap().values()) { + for (double frequency : tokenSequenceFrequencies.values()) { if (frequency > maximumFoundFrequency) { maximumFoundFrequency = frequency; } diff --git a/core/src/main/java/de/jplag/highlightextraction/MatchFrequencyWeightingFunction.java b/core/src/main/java/de/jplag/frequency/MatchFrequencyWeightingFunction.java similarity index 96% rename from core/src/main/java/de/jplag/highlightextraction/MatchFrequencyWeightingFunction.java rename to core/src/main/java/de/jplag/frequency/MatchFrequencyWeightingFunction.java index 87e26d39b9..bb1319db7b 100644 --- a/core/src/main/java/de/jplag/highlightextraction/MatchFrequencyWeightingFunction.java +++ b/core/src/main/java/de/jplag/frequency/MatchFrequencyWeightingFunction.java @@ -1,4 +1,4 @@ -package de.jplag.highlightextraction; +package de.jplag.frequency; /** * This class contains the possible weighting functions for a match, in the frequency analysis. diff --git a/core/src/main/java/de/jplag/highlightextraction/MatchWeightingFunction.java b/core/src/main/java/de/jplag/frequency/MatchWeightingFunction.java similarity index 92% rename from core/src/main/java/de/jplag/highlightextraction/MatchWeightingFunction.java rename to core/src/main/java/de/jplag/frequency/MatchWeightingFunction.java index b58ba3e701..669f32180a 100644 --- a/core/src/main/java/de/jplag/highlightextraction/MatchWeightingFunction.java +++ b/core/src/main/java/de/jplag/frequency/MatchWeightingFunction.java @@ -1,4 +1,4 @@ -package de.jplag.highlightextraction; +package de.jplag.frequency; /** * Strategy for calculating the weight of a match. diff --git a/core/src/main/java/de/jplag/highlightextraction/SubMatchesStrategy.java b/core/src/main/java/de/jplag/frequency/SubMatchesStrategy.java similarity index 98% rename from core/src/main/java/de/jplag/highlightextraction/SubMatchesStrategy.java rename to core/src/main/java/de/jplag/frequency/SubMatchesStrategy.java index 5f7e00a14e..ff97424a1e 100644 --- a/core/src/main/java/de/jplag/highlightextraction/SubMatchesStrategy.java +++ b/core/src/main/java/de/jplag/frequency/SubMatchesStrategy.java @@ -1,4 +1,4 @@ -package de.jplag.highlightextraction; +package de.jplag.frequency; import java.util.ArrayList; import java.util.List; diff --git a/core/src/main/java/de/jplag/highlightextraction/SubSequenceUtil.java b/core/src/main/java/de/jplag/frequency/SubSequenceUtil.java similarity index 98% rename from core/src/main/java/de/jplag/highlightextraction/SubSequenceUtil.java rename to core/src/main/java/de/jplag/frequency/SubSequenceUtil.java index 3b5a8a4f26..a496306e96 100644 --- a/core/src/main/java/de/jplag/highlightextraction/SubSequenceUtil.java +++ b/core/src/main/java/de/jplag/frequency/SubSequenceUtil.java @@ -1,4 +1,4 @@ -package de.jplag.highlightextraction; +package de.jplag.frequency; import java.util.ArrayList; import java.util.List; diff --git a/core/src/main/java/de/jplag/highlightextraction/WindowOfMatchesStrategy.java b/core/src/main/java/de/jplag/frequency/WindowOfMatchesStrategy.java similarity index 98% rename from core/src/main/java/de/jplag/highlightextraction/WindowOfMatchesStrategy.java rename to core/src/main/java/de/jplag/frequency/WindowOfMatchesStrategy.java index f32c5f1408..1bb167ffd4 100644 --- a/core/src/main/java/de/jplag/highlightextraction/WindowOfMatchesStrategy.java +++ b/core/src/main/java/de/jplag/frequency/WindowOfMatchesStrategy.java @@ -1,4 +1,4 @@ -package de.jplag.highlightextraction; +package de.jplag.frequency; import java.util.ArrayList; import java.util.LinkedList; diff --git a/core/src/main/java/de/jplag/highlightextraction/FrequencyAnalysisOptions.java b/core/src/main/java/de/jplag/highlightextraction/FrequencyAnalysisOptions.java deleted file mode 100644 index 6c1cc921a7..0000000000 --- a/core/src/main/java/de/jplag/highlightextraction/FrequencyAnalysisOptions.java +++ /dev/null @@ -1,48 +0,0 @@ -package de.jplag.highlightextraction; - -/** - * Options for Frequency Analysis. - * @param frequencyStrategy the strategy used to determine the frequency of a Match - * @param frequencyStrategyMinValue the minimum considered size of Subsequences from matches in the frequencyStrategy - * @param weightingStrategy strategy used to influence the similarity based on Match frequency - * @param weightingFactor how strong the impact of the weightingStrategy is - */ -public record FrequencyAnalysisOptions(FrequencyAnalysisStrategy frequencyStrategy, int frequencyStrategyMinValue, - MatchFrequencyWeightingFunction weightingStrategy, double weightingFactor) { - - /** - * Default options for frequency Analysis. - */ - public FrequencyAnalysisOptions() { - this(FrequencyAnalysisStrategy.COMPLETE_MATCHES, 1, MatchFrequencyWeightingFunction.SIGMOID, 0.25); - } - - /** - * Chosen FrequencyStrategy. - */ - public FrequencyAnalysisOptions withFrequencyStrategy(FrequencyAnalysisStrategy strategy) { - return new FrequencyAnalysisOptions(strategy, frequencyStrategyMinValue, weightingStrategy, weightingFactor); - } - - /** - * Minimum considered subsequence length. - */ - public FrequencyAnalysisOptions withFrequencyStrategyMinimumConsideredMatchSubsequenceSize(int minimumConsideredMatchSubsequenceSize) { - return new FrequencyAnalysisOptions(frequencyStrategy, minimumConsideredMatchSubsequenceSize, weightingStrategy, weightingFactor); - } - - /** - * Chosen weightingStrategy. - */ - public FrequencyAnalysisOptions withWeightingStrategy(MatchFrequencyWeightingFunction strategy) { - return new FrequencyAnalysisOptions(frequencyStrategy, frequencyStrategyMinValue, strategy, weightingFactor); - } - - /** - * Weighting maximumInfluenceOfMatchFrequencyConsidered for weightingStrategy. - */ - public FrequencyAnalysisOptions withWeightingFactor(double maximumInfluenceOfMatchFrequencyConsidered) { - return new FrequencyAnalysisOptions(frequencyStrategy, frequencyStrategyMinValue, weightingStrategy, - maximumInfluenceOfMatchFrequencyConsidered); - } -} diff --git a/core/src/main/java/de/jplag/highlightextraction/FrequencyAnalysisStrategy.java b/core/src/main/java/de/jplag/highlightextraction/FrequencyAnalysisStrategy.java deleted file mode 100644 index 74c7c669f2..0000000000 --- a/core/src/main/java/de/jplag/highlightextraction/FrequencyAnalysisStrategy.java +++ /dev/null @@ -1,27 +0,0 @@ -package de.jplag.highlightextraction; - -/** - * Enum representing the different strategies for frequency similarity calculation. - */ -public enum FrequencyAnalysisStrategy { - COMPLETE_MATCHES(new CompleteMatchesStrategy()), - CONTAINED_MATCHES(new ContainedMatchesStrategy()), - SUB_MATCHES(new SubMatchesStrategy()), - WINDOW_OF_MATCHES(new WindowOfMatchesStrategy()); - - private final FrequencyStrategy strategy; - - /** - * @param strategy FrequencyStrategy chosen for Frequency Determination in the frequency Analysis. - */ - FrequencyAnalysisStrategy(FrequencyStrategy strategy) { - this.strategy = strategy; - } - - /** - * @return the frequency similarity strategy of the enum constant. - */ - public FrequencyStrategy getStrategy() { - return strategy; - } -} diff --git a/core/src/main/java/de/jplag/highlightextraction/FrequencyMatchWeighter.java b/core/src/main/java/de/jplag/highlightextraction/FrequencyMatchWeighter.java deleted file mode 100644 index 535944f9fe..0000000000 --- a/core/src/main/java/de/jplag/highlightextraction/FrequencyMatchWeighter.java +++ /dev/null @@ -1,37 +0,0 @@ -package de.jplag.highlightextraction; - -import java.util.List; - -import de.jplag.JPlagComparison; -import de.jplag.JPlagResult; -import de.jplag.options.JPlagOptions; - -/** - * Contains the logic of the frequency based weighting of the Matches in all Comparisons, influencing the similarity - * between two comparisons according to the FrequencyStrategy and Similarity strategy. isFrequencyAnalysisEnabled = - * false would use the old similarity. - */ -public class FrequencyMatchWeighter { - /** - * @param options JPlagOptions - * @param result JPlagResult - * @return the new Comparisons with a weighted similarity. - */ - public List useMatchFrequencyToInfluenceSimilarity(JPlagOptions options, JPlagResult result) { - final boolean weightingFactor = isFrequencyAnalysisEnabled(options.weightingFactor()); - FrequencyDetermination frequencyDetermination = new FrequencyDetermination(options.frequencyAnalysisStrategy().getStrategy(), - Math.max(options.frequencyStrategyMinValue(), options.minimumTokenMatch())); - frequencyDetermination.buildFrequencyMap(result.getAllComparisons()); - MatchWeightCalculator matchWeighting = new MatchWeightCalculator(options.frequencyAnalysisStrategy().getStrategy(), - frequencyDetermination.getMatchFrequencyMap()); - MatchFrequency matchFrequency = matchWeighting.weightAllComparisons(result.getAllComparisons()); - MatchFrequencyWeighting similarity = new MatchFrequencyWeighting(result.getAllComparisons(), options.weightingStrategy(), matchFrequency); - return result.getAllComparisons().stream() - .map(comparison -> similarity.weightedComparisonSimilarity(comparison, options.weightingFactor(), weightingFactor)).toList(); - } - - private static boolean isFrequencyAnalysisEnabled(double weightingFactor) { - return weightingFactor != -1; - } - -} diff --git a/core/src/main/java/de/jplag/highlightextraction/MatchFrequency.java b/core/src/main/java/de/jplag/highlightextraction/MatchFrequency.java deleted file mode 100644 index d5c9b70f9e..0000000000 --- a/core/src/main/java/de/jplag/highlightextraction/MatchFrequency.java +++ /dev/null @@ -1,20 +0,0 @@ -package de.jplag.highlightextraction; - -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import de.jplag.TokenType; - -/** - * Contains the map that maps each Match to its frequency Calculated according to the chosen FrequencyStrategy. - * @param matchFrequencyMap maps each Match to its frequency. - */ -public record MatchFrequency(Map, Double> matchFrequencyMap) { - /** - * Constructor. - */ - public MatchFrequency() { - this(new HashMap<>()); - } -} diff --git a/core/src/main/java/de/jplag/highlightextraction/MatchWeightCalculator.java b/core/src/main/java/de/jplag/highlightextraction/MatchWeightCalculator.java deleted file mode 100644 index a42e9d1b8f..0000000000 --- a/core/src/main/java/de/jplag/highlightextraction/MatchWeightCalculator.java +++ /dev/null @@ -1,68 +0,0 @@ -package de.jplag.highlightextraction; - -import java.util.List; -import java.util.Map; - -import de.jplag.JPlagComparison; -import de.jplag.Match; -import de.jplag.Token; -import de.jplag.TokenType; - -/** - * Calculates frequency values of the matches and writes them into a map. - */ -public class MatchWeightCalculator { - private final FrequencyStrategy strategy; - private final Map, Integer> frequencyMap; - private final MatchFrequency matchFrequency; - - /** - * Constructor defining the used frequency strategy and frequency map. - * @param strategy chosen to determine the frequency of a match - * @param frequencyMap build frequencyMap based on the strategy - */ - public MatchWeightCalculator(FrequencyStrategy strategy, Map, Integer> frequencyMap) { - this.strategy = strategy; - this.frequencyMap = frequencyMap; - this.matchFrequency = new MatchFrequency(); - } - - /** - * Calculates frequency value for a match. - * @param match the match to determine the frequency for - * @param matchToken token sequence of the match - */ - public void weightMatch(Match match, List matchToken) { - double matchWeight = strategy.calculateMatchFrequency(match, frequencyMap, matchToken); - matchFrequency.matchFrequencyMap().put(matchToken, matchWeight); - } - - /** - * Calculates frequency value for all matches. - * @param matches the matches to determine the frequency for - * @param firstSubmissionToken token sequence of the comparison - * @return the frequency of the match - */ - public MatchFrequency weightAllMatches(List matches, List firstSubmissionToken) { - for (Match match : matches) { - List matchTokens = FrequencyUtil.matchesToMatchTokenTypes(match, firstSubmissionToken); - weightMatch(match, matchTokens); - } - return matchFrequency; - } - - /** - * Calculates frequency value for all matches. - * @param comparisons list of comparisons to weight - * @return the frequency of the match - */ - public MatchFrequency weightAllComparisons(List comparisons) { - for (JPlagComparison comparison : comparisons) { - List firstSubmissionToken = comparison.firstSubmission().getTokenList(); - List firstSubmissionTokenTypes = firstSubmissionToken.stream().map(Token::getType).toList(); - weightAllMatches(comparison.matches(), firstSubmissionTokenTypes); - } - return matchFrequency; - } - -} diff --git a/core/src/main/java/de/jplag/options/JPlagOptions.java b/core/src/main/java/de/jplag/options/JPlagOptions.java index 07a3099d95..b6787665fe 100644 --- a/core/src/main/java/de/jplag/options/JPlagOptions.java +++ b/core/src/main/java/de/jplag/options/JPlagOptions.java @@ -16,8 +16,7 @@ import de.jplag.Language; import de.jplag.clustering.ClusteringOptions; import de.jplag.exceptions.BasecodeException; -import de.jplag.highlightextraction.FrequencyAnalysisStrategy; -import de.jplag.highlightextraction.MatchFrequencyWeightingFunction; +import de.jplag.frequency.FrequencyAnalysisOptions; import de.jplag.merging.MergingOptions; import de.jplag.reporting.jsonfactory.serializer.FileSerializer; import de.jplag.reporting.jsonfactory.serializer.LanguageSerializer; @@ -51,10 +50,7 @@ * @param mergingOptions are the options related to the subsequence match merging mechanism that opposed obfuscation. * @param normalize enables additional normalization mechanisms. Only supported by some language modules. * @param analyzeComments If true, comments will be extracted from the submissions. - * @param frequencyAnalysisStrategy strategy for determining the frequency - * @param frequencyStrategyMinValue min considered subsequence length in frequencyStrategies - * @param weightingStrategy weighting function used in the frequency Analysis - * @param weightingFactor factor how strong the considered influence of the weighting function (maximal) can be + * @param frequencyAnalysisOptions are the options for the frequency analysis which factors in match rarity. */ @RecordBuilder() public record JPlagOptions(@JsonSerialize(using = LanguageSerializer.class) Language language, Integer minimumTokenMatch, @@ -63,8 +59,7 @@ public record JPlagOptions(@JsonSerialize(using = LanguageSerializer.class) Lang @JsonSerialize(using = FileSerializer.class) File baseCodeSubmissionDirectory, String subdirectoryName, List fileSuffixes, String exclusionFileName, SimilarityMetric similarityMetric, double similarityThreshold, int maximumNumberOfComparisons, ClusteringOptions clusteringOptions, boolean debugParser, MergingOptions mergingOptions, boolean normalize, boolean analyzeComments, - FrequencyAnalysisStrategy frequencyAnalysisStrategy, int frequencyStrategyMinValue, MatchFrequencyWeightingFunction weightingStrategy, - double weightingFactor) implements JPlagOptionsBuilder.With { + FrequencyAnalysisOptions frequencyAnalysisOptions) implements JPlagOptionsBuilder.With { /** Default value for the similarity threshold. **/ public static final double DEFAULT_SIMILARITY_THRESHOLD = 0; @@ -98,7 +93,7 @@ public JPlagOptions withLanguageOption(Language lang) { public JPlagOptions(Language language, Set submissionDirectories, Set oldSubmissionDirectories) { this(language, null, submissionDirectories, oldSubmissionDirectories, null, null, null, null, DEFAULT_SIMILARITY_METRIC, DEFAULT_SIMILARITY_THRESHOLD, DEFAULT_SHOWN_COMPARISONS, new ClusteringOptions(), false, new MergingOptions(), false, false, - FrequencyAnalysisStrategy.COMPLETE_MATCHES, 1, MatchFrequencyWeightingFunction.SIGMOID, 0.25); + new FrequencyAnalysisOptions()); } /** @@ -119,17 +114,13 @@ DEFAULT_SIMILARITY_THRESHOLD, DEFAULT_SHOWN_COMPARISONS, new ClusteringOptions() * @param mergingOptions Options related to subsequence merging to oppose obfuscation * @param normalize Enables additional normalization mechanisms (language-dependent) * @param analyzeComments Whether to extract comments from submissions - * @param frequencyAnalysisStrategy strategy for determining the frequency - * @param frequencyStrategyMinValue min considered subsequence length in frequencyStrategies - * @param weightingStrategy weighting function used in the frequency Analysis - * @param weightingFactor factor how strong the considered influence of the weighting function (maximal) can be + * @param frequencyAnalysisOptions are the options for the frequency analysis which factors in match rarity. */ public JPlagOptions(Language language, Integer minimumTokenMatch, Set submissionDirectories, Set oldSubmissionDirectories, File baseCodeSubmissionDirectory, String subdirectoryName, List fileSuffixes, String exclusionFileName, SimilarityMetric similarityMetric, double similarityThreshold, int maximumNumberOfComparisons, ClusteringOptions clusteringOptions, boolean debugParser, MergingOptions mergingOptions, boolean normalize, boolean analyzeComments, - FrequencyAnalysisStrategy frequencyAnalysisStrategy, int frequencyStrategyMinValue, MatchFrequencyWeightingFunction weightingStrategy, - double weightingFactor) { + FrequencyAnalysisOptions frequencyAnalysisOptions) { this.language = language; this.debugParser = debugParser; this.fileSuffixes = fileSuffixes == null || fileSuffixes.isEmpty() ? null : Collections.unmodifiableList(fileSuffixes); @@ -146,10 +137,7 @@ public JPlagOptions(Language language, Integer minimumTokenMatch, Set subm this.mergingOptions = mergingOptions; this.normalize = normalize; this.analyzeComments = analyzeComments; - this.frequencyAnalysisStrategy = frequencyAnalysisStrategy; - this.frequencyStrategyMinValue = frequencyStrategyMinValue; - this.weightingStrategy = weightingStrategy; - this.weightingFactor = weightingFactor; + this.frequencyAnalysisOptions = frequencyAnalysisOptions; } /** @@ -242,22 +230,17 @@ private Integer normalizeMinimumTokenMatch(Integer minimumTokenMatch) { * set to {@link #SHOW_ALL_COMPARISONS} all comparisons will be shown. * @param clusteringOptions Clustering options * @param debugParser If true, submissions that cannot be parsed will be stored in a separate directory. - * @param frequencyAnalysisStrategy strategy for determining the frequency - * @param frequencyStrategyMinValue min considered subsequence length in frequencyStrategies - * @param weightingStrategy weighting function used in the frequency Analysis - * @param weightingFactor factor how strong the considered influence of the weighting function (maximal) can be * @deprecated Use the default initializer with @{{@link #baseCodeSubmissionDirectory} instead. */ @Deprecated(since = "4.0.0", forRemoval = true) public JPlagOptions(Language language, Integer minimumTokenMatch, File submissionDirectory, Set oldSubmissionDirectories, String baseCodeSubmissionName, String subdirectoryName, List fileSuffixes, String exclusionFileName, SimilarityMetric similarityMetric, double similarityThreshold, int maximumNumberOfComparisons, ClusteringOptions clusteringOptions, - boolean debugParser, MergingOptions mergingOptions, FrequencyAnalysisStrategy frequencyAnalysisStrategy, int frequencyStrategyMinValue, - MatchFrequencyWeightingFunction weightingStrategy, double weightingFactor) throws BasecodeException { + boolean debugParser, MergingOptions mergingOptions) throws BasecodeException { this(language, minimumTokenMatch, Set.of(submissionDirectory), oldSubmissionDirectories, convertLegacyBaseCodeToFile(baseCodeSubmissionName, submissionDirectory), subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons, clusteringOptions, debugParser, mergingOptions, false, false, - frequencyAnalysisStrategy, frequencyStrategyMinValue, weightingStrategy, weightingFactor); + new FrequencyAnalysisOptions()); } /** @@ -280,8 +263,7 @@ public JPlagOptions withBaseCodeSubmissionName(String baseCodeSubmissionName) { try { return new JPlagOptions(language, minimumTokenMatch, submissionDirectory, oldSubmissionDirectories, baseCodeSubmissionName, subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons, - clusteringOptions, debugParser, mergingOptions, frequencyAnalysisStrategy, frequencyStrategyMinValue, weightingStrategy, - weightingFactor); + clusteringOptions, debugParser, mergingOptions); } catch (BasecodeException e) { throw new IllegalArgumentException(e.getMessage(), e.getCause()); } diff --git a/core/src/test/java/de/jplag/highlightextraction/frequencydetermination/MatchFrequencyWeightingTest.java b/core/src/test/java/de/jplag/highlightextraction/frequencydetermination/MatchFrequencyWeightingTest.java index 0618581ed9..74ae503f99 100644 --- a/core/src/test/java/de/jplag/highlightextraction/frequencydetermination/MatchFrequencyWeightingTest.java +++ b/core/src/test/java/de/jplag/highlightextraction/frequencydetermination/MatchFrequencyWeightingTest.java @@ -22,14 +22,13 @@ import de.jplag.TokenType; import de.jplag.comparison.LongestCommonSubsequenceSearch; import de.jplag.exceptions.ExitException; -import de.jplag.highlightextraction.CompleteMatchesStrategy; -import de.jplag.highlightextraction.ContainedMatchesStrategy; -import de.jplag.highlightextraction.FrequencyStrategy; -import de.jplag.highlightextraction.FrequencyUtil; -import de.jplag.highlightextraction.MatchFrequency; -import de.jplag.highlightextraction.MatchWeightCalculator; -import de.jplag.highlightextraction.SubMatchesStrategy; -import de.jplag.highlightextraction.WindowOfMatchesStrategy; +import de.jplag.frequency.CompleteMatchesStrategy; +import de.jplag.frequency.ContainedMatchesStrategy; +import de.jplag.frequency.FrequencyStrategy; +import de.jplag.frequency.FrequencyUtil; +import de.jplag.frequency.MatchFrequencyEvaluator; +import de.jplag.frequency.SubMatchesStrategy; +import de.jplag.frequency.WindowOfMatchesStrategy; import de.jplag.options.JPlagOptions; /** @@ -111,11 +110,10 @@ void testWeightMatch_setsCorrectWeight_completeMatchesStrategy() { assertEquals(1.0, weight, 0.01, "only one Match added"); COMPLETE_MATCHES_STRATEGY.processMatchTokenTypes(matchToken, this::addSequenceKey, this::addSequence, 0); - MatchWeightCalculator weighting = new MatchWeightCalculator(COMPLETE_MATCHES_STRATEGY, frequencyMap); - MatchFrequency matchFrequency = weighting.weightAllMatches(List.of(testMatch), submissionToken); + MatchFrequencyEvaluator weighting = new MatchFrequencyEvaluator(COMPLETE_MATCHES_STRATEGY, frequencyMap); + Map, Double> matchFrequency = weighting.computeMatchFrequencies(List.of(testMatch), submissionToken); List testSubmissionTokenTypes = testSubmission.getTokenList().stream().map(Token::getType).toList(); - double matchFrequencyCalculated = matchFrequency.matchFrequencyMap() - .get(FrequencyUtil.matchesToMatchTokenTypes(testMatch, testSubmissionTokenTypes)); + double matchFrequencyCalculated = matchFrequency.get(FrequencyUtil.matchesToMatchTokenTypes(testMatch, testSubmissionTokenTypes)); assertEquals(0.0, matchFrequencyCalculated, 0.01, "only one Match added twice"); } @@ -140,14 +138,12 @@ void testWeightMatch_setsCorrectWeight_containedMatchesStrategy() { } CONTAINED_MATCHES_STRATEGY.processMatchTokenTypes(matchToken, this::addSequenceKey, this::addSequence, 100); CONTAINED_MATCHES_STRATEGY.processMatchTokenTypes(matchContainedToken, this::addSequenceKey, this::addSequence, 100); - MatchWeightCalculator weighting = new MatchWeightCalculator(CONTAINED_MATCHES_STRATEGY, frequencyMap); - weighting.weightAllMatches(List.of(testMatch, matchContained), submissionToken); - MatchFrequency matchFrequency = weighting.weightAllMatches(List.of(testMatch), submissionToken); + MatchFrequencyEvaluator weighting = new MatchFrequencyEvaluator(CONTAINED_MATCHES_STRATEGY, frequencyMap); + weighting.computeMatchFrequencies(List.of(testMatch, matchContained), submissionToken); + Map, Double> matchFrequency = weighting.computeMatchFrequencies(List.of(testMatch), submissionToken); List testSubmissionTokenTypes = testSubmission.getTokenList().stream().map(Token::getType).toList(); - double matchFrequencyCalculated = matchFrequency.matchFrequencyMap() - .get(FrequencyUtil.matchesToMatchTokenTypes(testMatch, testSubmissionTokenTypes)); - double matchFrequencyCalculated1 = matchFrequency.matchFrequencyMap() - .get(FrequencyUtil.matchesToMatchTokenTypes(matchContained, testSubmissionTokenTypes)); + double matchFrequencyCalculated = matchFrequency.get(FrequencyUtil.matchesToMatchTokenTypes(testMatch, testSubmissionTokenTypes)); + double matchFrequencyCalculated1 = matchFrequency.get(FrequencyUtil.matchesToMatchTokenTypes(matchContained, testSubmissionTokenTypes)); assertEquals(1.0, matchFrequencyCalculated, 0.01, "weight for 2 considered subsequences"); assertEquals(0.0, matchFrequencyCalculated1, 0.01, "once found"); } @@ -173,14 +169,12 @@ void testWeightMatch_setsCorrectWeight_subMatchStrategy() { } SUB_MATCHES_STRATEGY.processMatchTokenTypes(matchToken, this::addSequenceKey, this::addSequence, 100); SUB_MATCHES_STRATEGY.processMatchTokenTypes(matchContainedToken, this::addSequenceKey, this::addSequence, 100); - MatchWeightCalculator weighting = new MatchWeightCalculator(SUB_MATCHES_STRATEGY, frequencyMap); - weighting.weightAllMatches(List.of(testMatch, matchContained), submissionToken); - MatchFrequency matchFrequency = weighting.weightAllMatches(List.of(testMatch), submissionToken); + MatchFrequencyEvaluator weighting = new MatchFrequencyEvaluator(SUB_MATCHES_STRATEGY, frequencyMap); + weighting.computeMatchFrequencies(List.of(testMatch, matchContained), submissionToken); + Map, Double> matchFrequency = weighting.computeMatchFrequencies(List.of(testMatch), submissionToken); List testSubmissionTokenTypes = testSubmission.getTokenList().stream().map(Token::getType).toList(); - double matchFrequencyCalculated = matchFrequency.matchFrequencyMap() - .get(FrequencyUtil.matchesToMatchTokenTypes(testMatch, testSubmissionTokenTypes)); - double matchFrequencyCalculated1 = matchFrequency.matchFrequencyMap() - .get(FrequencyUtil.matchesToMatchTokenTypes(matchContained, testSubmissionTokenTypes)); + double matchFrequencyCalculated = matchFrequency.get(FrequencyUtil.matchesToMatchTokenTypes(testMatch, testSubmissionTokenTypes)); + double matchFrequencyCalculated1 = matchFrequency.get(FrequencyUtil.matchesToMatchTokenTypes(matchContained, testSubmissionTokenTypes)); assertEquals(2.0, matchFrequencyCalculated, 0.01, "considered subsequences"); assertEquals(2.0, matchFrequencyCalculated1, 0.01, "considered subsequences"); } @@ -206,14 +200,12 @@ void testWeightMatch_setsCorrectWeight_windowOfMatchesStrategy() { } WINDOW_OF_MATCHES_STRATEGY.processMatchTokenTypes(matchToken, this::addSequenceKey, this::addSequence, 100); WINDOW_OF_MATCHES_STRATEGY.processMatchTokenTypes(matchContainedToken, this::addSequenceKey, this::addSequence, 100); - MatchWeightCalculator weighting = new MatchWeightCalculator(WINDOW_OF_MATCHES_STRATEGY, frequencyMap); - weighting.weightAllMatches(List.of(testMatch, matchContained), submissionToken); - MatchFrequency matchFrequency = weighting.weightAllMatches(List.of(testMatch), submissionToken); + MatchFrequencyEvaluator weighting = new MatchFrequencyEvaluator(WINDOW_OF_MATCHES_STRATEGY, frequencyMap); + weighting.computeMatchFrequencies(List.of(testMatch, matchContained), submissionToken); + Map, Double> matchFrequency = weighting.computeMatchFrequencies(List.of(testMatch), submissionToken); List testSubmissionTokenTypes = testSubmission.getTokenList().stream().map(Token::getType).toList(); - double matchFrequencyCalculated = matchFrequency.matchFrequencyMap() - .get(FrequencyUtil.matchesToMatchTokenTypes(testMatch, testSubmissionTokenTypes)); - double matchFrequencyCalculated1 = matchFrequency.matchFrequencyMap() - .get(FrequencyUtil.matchesToMatchTokenTypes(matchContained, testSubmissionTokenTypes)); + double matchFrequencyCalculated = matchFrequency.get(FrequencyUtil.matchesToMatchTokenTypes(testMatch, testSubmissionTokenTypes)); + double matchFrequencyCalculated1 = matchFrequency.get(FrequencyUtil.matchesToMatchTokenTypes(matchContained, testSubmissionTokenTypes)); assertEquals(2.0, matchFrequencyCalculated, 0.01, "considered subsequences"); assertEquals(2.0, matchFrequencyCalculated1, 0.01, "considered subsequences"); } diff --git a/core/src/test/java/de/jplag/highlightextraction/frequencydetermination/StrategyIntegrationTest.java b/core/src/test/java/de/jplag/highlightextraction/frequencydetermination/StrategyIntegrationTest.java index 8be509b341..ce7df79a60 100644 --- a/core/src/test/java/de/jplag/highlightextraction/frequencydetermination/StrategyIntegrationTest.java +++ b/core/src/test/java/de/jplag/highlightextraction/frequencydetermination/StrategyIntegrationTest.java @@ -19,12 +19,12 @@ import de.jplag.TokenType; import de.jplag.comparison.LongestCommonSubsequenceSearch; import de.jplag.exceptions.ExitException; -import de.jplag.highlightextraction.CompleteMatchesStrategy; -import de.jplag.highlightextraction.ContainedMatchesStrategy; -import de.jplag.highlightextraction.FrequencyDetermination; -import de.jplag.highlightextraction.FrequencyStrategy; -import de.jplag.highlightextraction.SubMatchesStrategy; -import de.jplag.highlightextraction.WindowOfMatchesStrategy; +import de.jplag.frequency.CompleteMatchesStrategy; +import de.jplag.frequency.ContainedMatchesStrategy; +import de.jplag.frequency.FrequencyDetermination; +import de.jplag.frequency.FrequencyStrategy; +import de.jplag.frequency.SubMatchesStrategy; +import de.jplag.frequency.WindowOfMatchesStrategy; import de.jplag.options.JPlagOptions; /** @@ -63,8 +63,8 @@ void prepareMatchResult() throws ExitException { void testFrequencyAnalysisStrategiesCompleteMatches() { FrequencyStrategy strategy = new CompleteMatchesStrategy(); FrequencyDetermination fd = new FrequencyDetermination(strategy, 1); - fd.buildFrequencyMap(result.getAllComparisons()); - Map, Integer> tokenFrequencyMap = fd.getMatchFrequencyMap(); + + Map, Integer> tokenFrequencyMap = fd.buildFrequencyMap(result.getAllComparisons()); assertFalse(tokenFrequencyMap.isEmpty(), "Map should not be empty"); printTestResult(tokenFrequencyMap); } @@ -77,8 +77,7 @@ void testFrequencyAnalysisStrategiesCompleteMatches() { void testFrequencyAnalysisStrategiesContainedMatches() { FrequencyStrategy strategy = new ContainedMatchesStrategy(); FrequencyDetermination fd = new FrequencyDetermination(strategy, 300); - fd.buildFrequencyMap(result.getAllComparisons()); - Map, Integer> tokenFrequencyMap = fd.getMatchFrequencyMap(); + Map, Integer> tokenFrequencyMap = fd.buildFrequencyMap(result.getAllComparisons()); assertFalse(tokenFrequencyMap.isEmpty(), "Map should not be empty"); printTestResult(tokenFrequencyMap); } @@ -91,8 +90,7 @@ void testFrequencyAnalysisStrategiesContainedMatches() { void testFrequencyAnalysisStrategiesSubMatches() { FrequencyStrategy strategy = new SubMatchesStrategy(); FrequencyDetermination fd = new FrequencyDetermination(strategy, 300); - fd.buildFrequencyMap(result.getAllComparisons()); - Map, Integer> tokenFrequencyMap = fd.getMatchFrequencyMap(); + Map, Integer> tokenFrequencyMap = fd.buildFrequencyMap(result.getAllComparisons()); assertFalse(tokenFrequencyMap.isEmpty(), "Map should not be empty"); printTestResult(tokenFrequencyMap); } @@ -105,8 +103,7 @@ void testFrequencyAnalysisStrategiesSubMatches() { void testFrequencyAnalysisStrategiesWindowOfMatches() { FrequencyStrategy strategy = new WindowOfMatchesStrategy(); FrequencyDetermination fd = new FrequencyDetermination(strategy, 300); - fd.buildFrequencyMap(result.getAllComparisons()); - Map, Integer> tokenFrequencyMap = fd.getMatchFrequencyMap(); + Map, Integer> tokenFrequencyMap = fd.buildFrequencyMap(result.getAllComparisons()); assertFalse(tokenFrequencyMap.isEmpty(), "Map should not be empty"); printTestResult(tokenFrequencyMap); } diff --git a/core/src/test/java/de/jplag/highlightextraction/frequencydetermination/StrategyTest.java b/core/src/test/java/de/jplag/highlightextraction/frequencydetermination/StrategyTest.java index c1691a1976..40e5a89870 100644 --- a/core/src/test/java/de/jplag/highlightextraction/frequencydetermination/StrategyTest.java +++ b/core/src/test/java/de/jplag/highlightextraction/frequencydetermination/StrategyTest.java @@ -28,12 +28,12 @@ import de.jplag.TokenType; import de.jplag.comparison.LongestCommonSubsequenceSearch; import de.jplag.exceptions.ExitException; -import de.jplag.highlightextraction.CompleteMatchesStrategy; -import de.jplag.highlightextraction.ContainedMatchesStrategy; -import de.jplag.highlightextraction.FrequencyDetermination; -import de.jplag.highlightextraction.FrequencyStrategy; -import de.jplag.highlightextraction.SubMatchesStrategy; -import de.jplag.highlightextraction.WindowOfMatchesStrategy; +import de.jplag.frequency.CompleteMatchesStrategy; +import de.jplag.frequency.ContainedMatchesStrategy; +import de.jplag.frequency.FrequencyDetermination; +import de.jplag.frequency.FrequencyStrategy; +import de.jplag.frequency.SubMatchesStrategy; +import de.jplag.frequency.WindowOfMatchesStrategy; import de.jplag.options.JPlagOptions; /** @@ -178,8 +178,7 @@ void testCompleteMatchesStrategy() { int strategyNumber = 9; FrequencyStrategy strategy = new CompleteMatchesStrategy(); FrequencyDetermination frequencyDetermination = new FrequencyDetermination(strategy, strategyNumber); - frequencyDetermination.buildFrequencyMap(TEST_COMPARISONS); - Map, Integer> tokenFrequencyMap = frequencyDetermination.getMatchFrequencyMap(); + Map, Integer> tokenFrequencyMap = frequencyDetermination.buildFrequencyMap(TEST_COMPARISONS); STRATEGY_INTEGRATION_TEST.printTestResult(tokenFrequencyMap); assertTokenFrequencyAndContainsMatch(matchAppearsOnce, 1, tokenFrequencyMap); @@ -322,8 +321,7 @@ void testCompleteMatchesIncludedInContainedStrategyForMatchesLongerMin() { int strategyNumber = 100; FrequencyStrategy strategy = new ContainedMatchesStrategy(); FrequencyDetermination frequencyDetermination = new FrequencyDetermination(strategy, strategyNumber); - frequencyDetermination.buildFrequencyMap(TEST_COMPARISONS); - Map, Integer> matchFrequencyMap = frequencyDetermination.getMatchFrequencyMap(); + Map, Integer> matchFrequencyMap = frequencyDetermination.buildFrequencyMap(TEST_COMPARISONS); Map, Integer> frequencyCount = new HashMap<>(); for (JPlagComparison comparison : TEST_COMPARISONS) { for (Match match : comparison.matches()) { diff --git a/core/src/test/java/de/jplag/highlightextraction/frequencysimilarity/FrequencyWeightingTest.java b/core/src/test/java/de/jplag/highlightextraction/frequencysimilarity/FrequencyWeightingTest.java index f44f57c658..64a264ce92 100644 --- a/core/src/test/java/de/jplag/highlightextraction/frequencysimilarity/FrequencyWeightingTest.java +++ b/core/src/test/java/de/jplag/highlightextraction/frequencysimilarity/FrequencyWeightingTest.java @@ -2,8 +2,10 @@ import static org.junit.jupiter.api.Assertions.assertEquals; +import java.util.HashMap; import java.util.LinkedList; import java.util.List; +import java.util.Map; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.DisplayName; @@ -20,10 +22,9 @@ import de.jplag.TokenType; import de.jplag.comparison.LongestCommonSubsequenceSearch; import de.jplag.exceptions.ExitException; -import de.jplag.highlightextraction.FrequencyUtil; -import de.jplag.highlightextraction.MatchFrequency; -import de.jplag.highlightextraction.MatchFrequencyWeighting; -import de.jplag.highlightextraction.MatchFrequencyWeightingFunction; +import de.jplag.frequency.FrequencyUtil; +import de.jplag.frequency.MatchFrequencyWeighting; +import de.jplag.frequency.MatchFrequencyWeightingFunction; import de.jplag.options.JPlagOptions; /** @@ -120,10 +121,10 @@ private void buildTestComparisons(TestSubmissions testSubmissions) { @Test @DisplayName("Test the weighting functions") void testWeightingFunction() { - MatchFrequency matchFrequency = new MatchFrequency(); + Map, Double> matchFrequency = new HashMap<>(); List testSubmissionTokenTypes = testSubmission.getTokenList().stream().map(Token::getType).toList(); - matchFrequency.matchFrequencyMap().put(FrequencyUtil.matchesToMatchTokenTypes(TEST_MATCHES.getFirst(), testSubmissionTokenTypes), 5.0); - matchFrequency.matchFrequencyMap().put(FrequencyUtil.matchesToMatchTokenTypes(matchShort, testSubmissionTokenTypes), 1.0); + matchFrequency.put(FrequencyUtil.matchesToMatchTokenTypes(TEST_MATCHES.getFirst(), testSubmissionTokenTypes), 5.0); + matchFrequency.put(FrequencyUtil.matchesToMatchTokenTypes(matchShort, testSubmissionTokenTypes), 1.0); MatchFrequencyWeighting matchFrequencyWeightingLinear = new MatchFrequencyWeighting(TEST_COMPARISONS, MatchFrequencyWeightingFunction.LINEAR, matchFrequency); @@ -148,8 +149,8 @@ void testWeightingFunction() { assertEquals(315, quadraticWeightedMatchLength, 0.0001); assertEquals(317, sigmoidWeightedMatchLength, 0.0001); - MatchFrequency matchFrequency1 = new MatchFrequency(); - matchFrequency1.matchFrequencyMap().put(FrequencyUtil.matchesToMatchTokenTypes(TEST_MATCHES.getFirst(), testSubmissionTokenTypes), 5.0); + Map, Double> matchFrequency1 = new HashMap<>(); + matchFrequency1.put(FrequencyUtil.matchesToMatchTokenTypes(TEST_MATCHES.getFirst(), testSubmissionTokenTypes), 5.0); MatchFrequencyWeighting matchFrequencyWeightingLinear1 = new MatchFrequencyWeighting(TEST_COMPARISONS, MatchFrequencyWeightingFunction.LINEAR, matchFrequency); diff --git a/languages/multi-language/src/main/java/de/jplag/multilang/MultiLanguageOptions.java b/languages/multi-language/src/main/java/de/jplag/multilang/MultiLanguageOptions.java index 151b1506ee..c0dff39b83 100644 --- a/languages/multi-language/src/main/java/de/jplag/multilang/MultiLanguageOptions.java +++ b/languages/multi-language/src/main/java/de/jplag/multilang/MultiLanguageOptions.java @@ -30,6 +30,7 @@ public class MultiLanguageOptions extends LanguageOptions { */ public List getLanguages() { if (this.languages == null) { + LanguageLoader.getAllAvailableLanguageIdentifiers().forEach(System.out::println); if (languageNames.getValue() == null) { this.languages = LanguageLoader.getAllAvailableLanguages().values().stream().filter(Language::supportsMultiLanguage).toList(); } else {