1818import java .nio .file .Files ;
1919import java .nio .file .Path ;
2020import java .nio .file .Paths ;
21- import java .util .ArrayList ;
22- import java .util .Collections ;
23- import java .util .Random ;
21+ import java .util .*;
2422import java .util .concurrent .CompletableFuture ;
2523import java .util .concurrent .ConcurrentLinkedQueue ;
2624import java .util .concurrent .atomic .AtomicInteger ;
@@ -64,6 +62,11 @@ public class AsyncCollectionReader {
6462
6563 private int debugCount = 25 ;
6664
65+ /**
66+ * If a target location is specified, documents in the source directory that already exist in the target are skipped automatically
67+ */
68+ private String targetLocation = null ;
69+
/**
 * Convenience constructor that only takes the source folder and the file ending.
 * <p>
 * Delegates to the eight-argument overload with fixed defaults: debugCount = 25,
 * iRandom = -1, bSort = false, savePath = "", bAddMetadata = false and language = null.
 *
 * @param folder source folder that is forwarded unchanged to the delegated constructor
 * @param ending file ending that is forwarded unchanged to the delegated constructor
 */
6770 public AsyncCollectionReader (String folder , String ending ) {
6871 this (folder , ending , 25 , -1 , false , "" , false , null );
6972 }
@@ -117,11 +120,15 @@ public AsyncCollectionReader(String folder, String ending, int debugCount, int s
117120 }
118121
/**
 * Constructor variant that describes sampling through a sample mode instead of the raw
 * iRandom / bSort pair.
 * <p>
 * The mode and sample size are translated by {@code getRandomFromMode} and
 * {@code getSortFromMode} (both defined elsewhere in this class) and then forwarded to the
 * full constructor. The target location is passed as {@code null} and the target ending as
 * the empty string, so no source files are skipped because of an existing target.
 *
 * @param folder           forwarded unchanged to the full constructor
 * @param ending           forwarded unchanged to the full constructor
 * @param debugCount       forwarded unchanged to the full constructor
 * @param sampleSize       sample size handed to {@code getRandomFromMode} together with the mode
 * @param sampleMode       sample mode handed to {@code getRandomFromMode} and {@code getSortFromMode}
 * @param savePath         forwarded unchanged to the full constructor
 * @param bAddMetadata     forwarded unchanged to the full constructor
 * @param language         forwarded unchanged to the full constructor
 * @param skipSmallerFiles forwarded unchanged to the full constructor
 */
119122 public AsyncCollectionReader (String folder , String ending , int debugCount , int sampleSize , DUUI_ASYNC_COLLECTION_READER_SAMPLE_MODE sampleMode , String savePath , boolean bAddMetadata , String language , int skipSmallerFiles ) {
120- this (folder , ending , debugCount , getRandomFromMode (sampleMode , sampleSize ), getSortFromMode (sampleMode ), savePath , bAddMetadata , language , skipSmallerFiles );
123+ this (folder , ending , debugCount , getRandomFromMode (sampleMode , sampleSize ), getSortFromMode (sampleMode ), savePath , bAddMetadata , language , skipSmallerFiles , null , "" );
121124 }
122125
/**
 * Constructor variant without a file-size threshold and without a target location.
 * <p>
 * Delegates to the full constructor with skipSmallerFiles = 0, targetLocation = null and
 * targetEnding = "". With a {@code null} target location the reader does not skip any
 * source files because of an existing target.
 *
 * @param folder       forwarded unchanged to the full constructor
 * @param ending       forwarded unchanged to the full constructor
 * @param debugCount   forwarded unchanged to the full constructor
 * @param iRandom      forwarded unchanged to the full constructor
 * @param bSort        forwarded unchanged to the full constructor
 * @param savePath     forwarded unchanged to the full constructor
 * @param bAddMetadata forwarded unchanged to the full constructor
 * @param language     forwarded unchanged to the full constructor
 */
123126 public AsyncCollectionReader (String folder , String ending , int debugCount , int iRandom , boolean bSort , String savePath , boolean bAddMetadata , String language ) {
124- this (folder , ending , debugCount , iRandom , bSort , savePath , bAddMetadata , language , 0 );
127+ this (folder , ending , debugCount , iRandom , bSort , savePath , bAddMetadata , language , 0 , null , "" );
128+ }
129+
/**
 * Constructor variant that accepts a target location but no file-size threshold.
 * <p>
 * Delegates to the full constructor with skipSmallerFiles = 0; all other arguments are
 * forwarded unchanged.
 *
 * @param folder         forwarded unchanged to the full constructor
 * @param ending         forwarded unchanged to the full constructor
 * @param debugCount     forwarded unchanged to the full constructor
 * @param iRandom        forwarded unchanged to the full constructor
 * @param bSort          forwarded unchanged to the full constructor
 * @param savePath       forwarded unchanged to the full constructor
 * @param bAddMetadata   forwarded unchanged to the full constructor
 * @param language       forwarded unchanged to the full constructor
 * @param targetLocation if not {@code null}, source documents that already exist in this location are skipped
 * @param targetEnding   file ending used when looking up files in the target location
 */
130+ public AsyncCollectionReader (String folder , String ending , int debugCount , int iRandom , boolean bSort , String savePath , boolean bAddMetadata , String language , String targetLocation , String targetEnding ) {
131+ this (folder , ending , debugCount , iRandom , bSort , savePath , bAddMetadata , language , 0 , targetLocation , targetEnding );
125132 }
126133
127134 /***
@@ -135,9 +142,10 @@ public AsyncCollectionReader(String folder, String ending, int debugCount, int i
135142 * @param bAddMetadata Add metadata to the documents
136143 * @param language Add language to the documents
137144 * @param skipSmallerFiles Skip files smaller than this value in bytes
145+ * @param targetLocation If a target location is specified, documents in the source directory that already exist in the target are skipped automatically
138146 */
139- public AsyncCollectionReader (String folder , String ending , int debugCount , int iRandom , boolean bSort , String savePath , boolean bAddMetadata , String language , int skipSmallerFiles ) {
140-
147+ public AsyncCollectionReader (String folder , String ending , int debugCount , int iRandom , boolean bSort , String savePath , boolean bAddMetadata , String language , int skipSmallerFiles , String targetLocation , String targetEnding ) {
148+ this . targetLocation = targetLocation ;
141149 _addMetadata = bAddMetadata ;
142150 _language = language ;
143151 _filePaths = new ConcurrentLinkedQueue <>();
@@ -205,6 +213,12 @@ else if(iRandom>0){
205213 }
206214 }
207215
216+ // remove files that are already in the target location
217+ // NOTE we do this after saving the file list, as we do not want to change anything but only avoid processing files multiple times
218+ if (this .targetLocation != null ) {
219+ _filePaths = removeIfInTarget (_filePaths , this .targetLocation , targetEnding , this ._path , ending );
220+ }
221+
208222 _filePathsBackup .addAll (_filePaths );
209223
210224 this .debugCount = debugCount ;
@@ -452,6 +466,59 @@ else if (n < 0){
452466 return rQueue ;
453467 }
454468
469+ /***
470+ * Removes files that are present in the target location
471+ * @param paths List of paths
472+ * @param targetLocation Target location where to check for files
473+ * @return A new queue without files that are present in the target location
474+ */
475+ public static ConcurrentLinkedQueue <String > removeIfInTarget (ConcurrentLinkedQueue <String > paths , String targetLocation , String targetEnding , String sourceLocation , String sourceEnding ){
476+ System .out .println ("Chacking target location for files: " + targetLocation );
477+ ConcurrentLinkedQueue <String > targetFilePaths = new ConcurrentLinkedQueue <>();
478+ File targetDir = new File (targetLocation );
479+ if (!targetDir .exists ()) {
480+ // This might not be an error, e.g. if it is the first run
481+ System .err .println ("The targetLocation " + targetFilePaths + " does not exist! Continuing without removing files from target location." );
482+ }
483+ else if (targetDir .exists () && !targetDir .isDirectory ()) {
484+ throw new RuntimeException ("The targetLocation " + targetFilePaths + " is not a directory!" );
485+ }
486+ else {
487+ addFilesToConcurrentList (targetDir , targetEnding , targetFilePaths );
488+ }
489+ System .out .println ("Found " + targetFilePaths .size () + " files in target location" );
490+
491+ List <String > cleanList = new ArrayList <>();
492+ if (!targetFilePaths .isEmpty ()) {
493+ System .out .println ("Checking against " + targetFilePaths .size () + " files in target location" );
494+ Set <String > existingFiles = targetFilePaths .stream ()
495+ .map (Paths ::get )
496+ .filter (Files ::isRegularFile )
497+ .map (f -> targetDir .toPath ().relativize (f ).toString ())
498+ .map (f -> f .replaceAll (targetEnding , "" ))
499+ .map (f -> f .replaceAll (sourceEnding , "" ))
500+ .collect (Collectors .toSet ());
501+
502+ Path sourceDir = Paths .get (sourceLocation );
503+ for (String f : paths ) {
504+ Path p = Paths .get (f );
505+ String fn = sourceDir .relativize (p ).toString ();
506+ fn = fn .replaceAll (sourceEnding , "" );
507+ boolean found = existingFiles .contains (fn );
508+ if (!found ) {
509+ cleanList .add (f );
510+ }
511+ }
512+ }
513+ else {
514+ System .out .println ("No files in target location found, keeping all files from source location" );
515+ cleanList .addAll (paths );
516+ }
517+ System .out .println ("Removed " + (paths .size () - cleanList .size ()) + " files from source location that are already present in target location, keeping " + cleanList .size () + " files" );
518+
519+ return new ConcurrentLinkedQueue <>(cleanList );
520+ }
521+
455522 public static String getSize (String sPath ){
456523 return FileUtils .byteCountToDisplaySize (new File (sPath ).length ());
457524 }
0 commit comments