Skip to content

Commit f6276f7

Browse files
authored
Merge pull request #50 from Myyyvothrr/main
async reader: filter files in target location
2 parents 01c7fae + d3d2883 commit f6276f7

File tree

1 file changed

+74
-7
lines changed

1 file changed

+74
-7
lines changed

src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/AsyncCollectionReader.java

Lines changed: 74 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,7 @@
1818
import java.nio.file.Files;
1919
import java.nio.file.Path;
2020
import java.nio.file.Paths;
21-
import java.util.ArrayList;
22-
import java.util.Collections;
23-
import java.util.Random;
21+
import java.util.*;
2422
import java.util.concurrent.CompletableFuture;
2523
import java.util.concurrent.ConcurrentLinkedQueue;
2624
import java.util.concurrent.atomic.AtomicInteger;
@@ -64,6 +62,11 @@ public class AsyncCollectionReader {
6462

6563
private int debugCount = 25;
6664

65+
/**
66+
* If a target location is specified, documents in the source directory that already exist in the target are skipped automatically
67+
*/
68+
private String targetLocation = null;
69+
6770
/**
 * Creates a reader for the given folder and file ending using default settings.
 * Delegates to an overloaded constructor with the fixed values 25, -1, false, "", false and null
 * (presumably debugCount, iRandom, bSort, savePath, bAddMetadata and language - confirm against the
 * eight-argument overload).
 *
 * @param folder Folder to read the documents from
 * @param ending File ending of the documents to read
 */
public AsyncCollectionReader(String folder, String ending) {
6871
this(folder, ending, 25, -1, false, "", false, null);
6972
}
@@ -117,11 +120,15 @@ public AsyncCollectionReader(String folder, String ending, int debugCount, int s
117120
}
118121

119122
public AsyncCollectionReader(String folder, String ending, int debugCount, int sampleSize, DUUI_ASYNC_COLLECTION_READER_SAMPLE_MODE sampleMode, String savePath, boolean bAddMetadata, String language, int skipSmallerFiles) {
120-
this(folder, ending, debugCount, getRandomFromMode(sampleMode, sampleSize), getSortFromMode(sampleMode), savePath, bAddMetadata, language, skipSmallerFiles);
123+
this(folder, ending, debugCount, getRandomFromMode(sampleMode, sampleSize), getSortFromMode(sampleMode), savePath, bAddMetadata, language, skipSmallerFiles, null, "");
121124
}
122125

123126
public AsyncCollectionReader(String folder, String ending, int debugCount, int iRandom, boolean bSort, String savePath, boolean bAddMetadata, String language) {
124-
this(folder, ending, debugCount, iRandom, bSort, savePath, bAddMetadata, language, 0);
127+
this(folder, ending, debugCount, iRandom, bSort, savePath, bAddMetadata, language, 0, null, "");
128+
}
129+
130+
/**
 * Creates a reader that additionally filters against a target location.
 * Delegates to the full constructor, passing 0 for skipSmallerFiles.
 *
 * @param targetLocation If a target location is specified, documents in the source directory that already exist in the target are skipped automatically
 * @param targetEnding   File ending passed on together with the target location
 */
public AsyncCollectionReader(String folder, String ending, int debugCount, int iRandom, boolean bSort, String savePath, boolean bAddMetadata, String language, String targetLocation, String targetEnding) {
131+
this(folder, ending, debugCount, iRandom, bSort, savePath, bAddMetadata, language, 0, targetLocation, targetEnding);
125132
}
126133

127134
/***
@@ -135,9 +142,10 @@ public AsyncCollectionReader(String folder, String ending, int debugCount, int i
135142
* @param bAddMetadata Add metadata to the documents
136143
* @param language Add language to the documents
137144
* @param skipSmallerFiles Skip files smaller than this value in bytes
145+
* @param targetLocation If a target location is specified, documents in the source directory that already exist in the target are skipped automatically
138146
*/
139-
public AsyncCollectionReader(String folder, String ending, int debugCount, int iRandom, boolean bSort, String savePath, boolean bAddMetadata, String language, int skipSmallerFiles) {
140-
147+
public AsyncCollectionReader(String folder, String ending, int debugCount, int iRandom, boolean bSort, String savePath, boolean bAddMetadata, String language, int skipSmallerFiles, String targetLocation, String targetEnding) {
148+
this.targetLocation = targetLocation;
141149
_addMetadata = bAddMetadata;
142150
_language = language;
143151
_filePaths = new ConcurrentLinkedQueue<>();
@@ -205,6 +213,12 @@ else if(iRandom>0){
205213
}
206214
}
207215

216+
// remove files that are already in the target location
217+
// NOTE we do this after saving the file list, as we do not want to change anything but only avoid processing files multiple times
218+
if (this.targetLocation != null) {
219+
_filePaths = removeIfInTarget(_filePaths, this.targetLocation, targetEnding, this._path, ending);
220+
}
221+
208222
_filePathsBackup.addAll(_filePaths);
209223

210224
this.debugCount = debugCount;
@@ -452,6 +466,59 @@ else if (n < 0){
452466
return rQueue;
453467
}
454468

469+
/***
470+
* Removes files that are present in the target location
471+
* @param paths List of paths
472+
* @param targetLocation Target location where to check for files
473+
* @return A new queue without files that are present in the target location
474+
*/
475+
public static ConcurrentLinkedQueue<String> removeIfInTarget(ConcurrentLinkedQueue<String> paths, String targetLocation, String targetEnding, String sourceLocation, String sourceEnding){
476+
System.out.println("Chacking target location for files: " + targetLocation);
477+
ConcurrentLinkedQueue<String> targetFilePaths = new ConcurrentLinkedQueue<>();
478+
File targetDir = new File(targetLocation);
479+
if (!targetDir.exists()) {
480+
// This might not be an error, e.g. if it is the first run
481+
System.err.println("The targetLocation " + targetFilePaths + " does not exist! Continuing without removing files from target location.");
482+
}
483+
else if (targetDir.exists() && !targetDir.isDirectory()) {
484+
throw new RuntimeException("The targetLocation " + targetFilePaths + " is not a directory!");
485+
}
486+
else {
487+
addFilesToConcurrentList(targetDir, targetEnding, targetFilePaths);
488+
}
489+
System.out.println("Found " + targetFilePaths.size() + " files in target location");
490+
491+
List<String> cleanList = new ArrayList<>();
492+
if (!targetFilePaths.isEmpty()) {
493+
System.out.println("Checking against " + targetFilePaths.size() + " files in target location");
494+
Set<String> existingFiles = targetFilePaths.stream()
495+
.map(Paths::get)
496+
.filter(Files::isRegularFile)
497+
.map(f -> targetDir.toPath().relativize(f).toString())
498+
.map(f -> f.replaceAll(targetEnding, ""))
499+
.map(f -> f.replaceAll(sourceEnding, ""))
500+
.collect(Collectors.toSet());
501+
502+
Path sourceDir = Paths.get(sourceLocation);
503+
for (String f : paths) {
504+
Path p = Paths.get(f);
505+
String fn = sourceDir.relativize(p).toString();
506+
fn = fn.replaceAll(sourceEnding, "");
507+
boolean found = existingFiles.contains(fn);
508+
if (!found) {
509+
cleanList.add(f);
510+
}
511+
}
512+
}
513+
else {
514+
System.out.println("No files in target location found, keeping all files from source location");
515+
cleanList.addAll(paths);
516+
}
517+
System.out.println("Removed " + (paths.size() - cleanList.size()) + " files from source location that are already present in target location, keeping " + cleanList.size() + " files");
518+
519+
return new ConcurrentLinkedQueue<>(cleanList);
520+
}
521+
455522
/**
 * Returns the size of the file at the given path as a display string.
 * The byte length reported by {@code File.length()} is formatted with
 * {@code FileUtils.byteCountToDisplaySize}.
 *
 * @param sPath Path of the file
 * @return The formatted file size
 */
public static String getSize(String sPath){
456523
return FileUtils.byteCountToDisplaySize(new File(sPath).length());
457524
}

0 commit comments

Comments
 (0)