2525import static org .variantsync .vevos .extraction .gt .GroundTruth .*;
2626
2727
28+ /**
29+ * A full ground truth extraction that extracts the ground truth for all code files of all commits
30+ * in the repositories. Due to the effort of extracting and saving a ground truth for all files of
31+ * each commit, this extraction may require a very long time and large amounts of free disk space.
32+ *
33+ * <p>
34+ * Essentially, the full ground truth extraction first performs a fast ground truth extraction and
35+ * then incrementally combines the ground truths of all commits.
36+ * </p>
37+ */
2838public class FullGroundTruthExtraction extends GroundTruthExtraction {
2939
/**
 * Creates a full ground truth extraction and logs that the extraction is starting.
 *
 * @param properties the configuration, passed unchanged to the superclass constructor
 */
public FullGroundTruthExtraction(Properties properties) {
    super(properties);
    // Only the re-wrapped (post-change) form of the log call is kept; the message text is unchanged.
    Logger.info(
            "Starting full ground truth extraction that extracts a ground truth for all files of each commit.");
}
3445
3546 protected BiConsumer <Repository , Path > extractionRunner () {
3647 return (repo , repoOutputDir ) -> {
3748 FullVariabilityAnalysis analysis =
3849 new FullVariabilityAnalysis (Path .of (properties .getProperty (DD_OUTPUT_DIR )),
3950 Boolean .parseBoolean (properties .getProperty (IGNORE_PC_CHANGES )));
40- final BiFunction <Repository , Path , Analysis > AnalysisFactory = (r , out ) -> new Analysis (
41- "PCAnalysis" ,
42- List .of (
43- analysis
44- ),
45- r ,
46- out
47- );
48- final int availableProcessors = Runtime .getRuntime ().availableProcessors ();
49- final int commitsToProcessPerThread = 256 ;
50-
51- Analysis .forEachCommit (() -> AnalysisFactory .apply (repo , repoOutputDir ), commitsToProcessPerThread , availableProcessors );
51+ final BiFunction <Repository , Path , Analysis > AnalysisFactory =
52+ (r , out ) -> new Analysis ("PCAnalysis" , List .of (analysis ), r , out );
53+
54+ Analysis .forEachCommit (() -> AnalysisFactory .apply (repo , repoOutputDir ),
55+ diffDetectiveBatchSize (), numProcessors ());
5256
5357 ArrayList <RevCommit > commits = new ArrayList <>();
5458 try (Git gitRepo = repo .getGitRepo ().run ()) {
@@ -61,7 +65,7 @@ protected BiConsumer<Repository, Path> extractionRunner() {
6165
6266 ExecutorService threadPool = null ;
6367 try {
64- threadPool = Executors .newFixedThreadPool (availableProcessors );
68+ threadPool = Executors .newFixedThreadPool (numProcessors () );
6569 postprocess (repo , commits , threadPool );
6670 } finally {
6771 if (threadPool != null ) {
@@ -75,14 +79,16 @@ protected BiConsumer<Repository, Path> extractionRunner() {
7579
7680
7781 /**
78- * Incrementally combines the ground truths from the first to the last commit. The ground truth for unmodified files
79- * are reused. New file ground truths are added for created files, and old ground truths are updated for modified files.
82+ * Incrementally combines the ground truths from the first to the last commit. The ground truths
83+ * for unmodified files are reused. New file ground truths are added for created files, and old
84+ * ground truths are updated for modified files.
8085 *
81- * @param repo The repo that has been analyzed
82- * @param commits A list of commits in the repo
86+ * @param repo The repo that has been analyzed
87+ * @param commits A list of commits in the repo
8388 * @param threadPool A thread pool for multithreading of IO operations
8489 */
85- private void postprocess (Repository repo , ArrayList <RevCommit > commits , ExecutorService threadPool ) {
90+ private void postprocess (Repository repo , ArrayList <RevCommit > commits ,
91+ ExecutorService threadPool ) {
8692 boolean print = Boolean .parseBoolean (this .properties .getProperty (PRINT_ENABLED ));
8793 int processedCount = 0 ;
8894 RevCommit lastCommit = null ;
@@ -92,16 +98,19 @@ private void postprocess(Repository repo, ArrayList<RevCommit> commits, Executor
9298 if (lastCommit != null ) {
9399 // Check whether the last commit is the first parent of this commit.
94100 // If this is the case, we can continue with the existing ground truth.
95- // If this is not the case, we have to load the completed ground truth of the parent.
101+ // If this is not the case, we have to load the completed ground truth of the
102+ // parent.
96103 RevCommit firstParent = Arrays .stream (commit .getParents ()).findFirst ().orElse (null );
97104 if (firstParent == null ) {
98105 completedGroundTruth = new GroundTruth (new HashMap <>(), new HashSet <>());
99106 } else if (!firstParent .equals (lastCommit )) {
100- File parentGT = new File (diffDetectiveCache + "/pc/" + repo .getRepositoryName () + "/" + firstParent .getName () + ".gt" );
107+ File parentGT = new File (diffDetectiveCache + "/pc/" + repo .getRepositoryName ()
108+ + "/" + firstParent .getName () + ".gt" );
101109 completedGroundTruth = Serde .deserialize (parentGT );
102110 }
103111 }
104- File currentGTFile = new File (diffDetectiveCache + "/pc/" + repo .getRepositoryName () + "/" + commit .getName () + ".gt" );
112+ File currentGTFile = new File (diffDetectiveCache + "/pc/" + repo .getRepositoryName ()
113+ + "/" + commit .getName () + ".gt" );
105114 if (Files .exists (currentGTFile .toPath ())) {
106115 GroundTruth loadedGT = Serde .deserialize (currentGTFile );
107116 if (processedCount % 1_000 == 0 ) {
@@ -124,19 +133,24 @@ private void postprocess(Repository repo, ArrayList<RevCommit> commits, Executor
124133 throw new UncheckedIOException (e );
125134 }
126135 String variablesList = completedGroundTruth .variablesListAsString ();
127- threadPool .submit (() -> Serde .writeToFile (commitSaveDir .resolve (VARIABLES_FILE ), variablesList ));
136+ threadPool .submit (
137+ () -> Serde .writeToFile (commitSaveDir .resolve (VARIABLES_FILE ), variablesList ));
128138
129139 String groundTruthAsCSV = completedGroundTruth .asPcCsvString ();
130- threadPool .submit (() -> Serde .writeToFile (commitSaveDir .resolve (CODE_VARIABILITY_CSV ), groundTruthAsCSV ));
140+ threadPool .submit (() -> Serde .writeToFile (commitSaveDir .resolve (CODE_VARIABILITY_CSV ),
141+ groundTruthAsCSV ));
131142
132- threadPool .submit (() -> Serde .writeToFile (commitSaveDir .resolve (COMMIT_MESSAGE_FILE ), commit .getFullMessage ()));
143+ threadPool .submit (() -> Serde .writeToFile (commitSaveDir .resolve (COMMIT_MESSAGE_FILE ),
144+ commit .getFullMessage ()));
133145
134- Optional <String > parentIds = Arrays .stream (commit .getParents ()).map (RevCommit ::getName ).reduce ((s , s2 ) -> s + " " + s2 );
146+ Optional <String > parentIds = Arrays .stream (commit .getParents ()).map (RevCommit ::getName )
147+ .reduce ((s , s2 ) -> s + " " + s2 );
135148 threadPool .submit (() -> parentIds .ifPresentOrElse (
136149 s -> Serde .writeToFile (commitSaveDir .resolve (COMMIT_PARENTS_FILE ), s ),
137150 () -> Serde .writeToFile (commitSaveDir .resolve (COMMIT_PARENTS_FILE ), "" )));
138151
139- threadPool .submit (() -> Serde .appendText (resultsRoot .resolve (SUCCESS_COMMIT_FILE ), commit .getName () + "\n " ));
152+ threadPool .submit (() -> Serde .appendText (resultsRoot .resolve (SUCCESS_COMMIT_FILE ),
153+ commit .getName () + "\n " ));
140154
141155 if (Boolean .parseBoolean (properties .getProperty (EXTRACT_CODE_MATCHING ))) {
142156 String matchingAsCSV = completedGroundTruth .asMatchingCsvString ();
@@ -146,7 +160,8 @@ private void postprocess(Repository repo, ArrayList<RevCommit> commits, Executor
146160 }
147161
148162 if (processedCount % 1_000 == 0 ) {
149- Logger .info ("Saved ground truth for commit {} of {}" , processedCount + 1 , commits .size ());
163+ Logger .info ("Saved ground truth for commit {} of {}" , processedCount + 1 ,
164+ commits .size ());
150165 }
151166 lastCommit = commit ;
152167 processedCount ++;
0 commit comments