 import java.time.Instant;
 import java.util.Comparator;
 import java.util.List;
+import java.util.Map;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.ThreadLocalRandom;
@@ -63,6 +64,7 @@ public class RadarHdfsRestructure {
     private final Configuration conf;
     private final FileStoreFactory fileStoreFactory;
     private final RecordPathFactory pathFactory;
+    private final long maxFilesPerTopic;
 
     private LongAdder processedFileCount;
     private LongAdder processedRecordsCount;
@@ -71,6 +73,11 @@ public RadarHdfsRestructure(FileStoreFactory factory) {
         conf = factory.getHdfsSettings().getConfiguration();
         conf.set("fs.defaultFS", "hdfs://" + factory.getHdfsSettings().getHdfsName());
         this.numThreads = factory.getSettings().getNumThreads();
+        long maxFiles = factory.getSettings().getMaxFilesPerTopic();
+        if (maxFiles < 1) {
+            maxFiles = Long.MAX_VALUE;
+        }
+        this.maxFilesPerTopic = maxFiles;
         this.fileStoreFactory = factory;
         this.pathFactory = factory.getPathFactory();
     }
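For context, the clamp above treats any configured limit below 1 as "no limit". A minimal standalone sketch of that behaviour, not part of the commit (class and method names here are illustrative only):

    // Illustrative sketch: a non-positive maxFilesPerTopic setting is
    // interpreted as "process without limit" by mapping it to Long.MAX_VALUE.
    public class MaxFilesLimitSketch {
        static long effectiveLimit(long configured) {
            return configured < 1 ? Long.MAX_VALUE : configured;
        }

        public static void main(String[] args) {
            System.out.println(effectiveLimit(0));    // 9223372036854775807 -> unlimited
            System.out.println(effectiveLimit(500));  // 500 -> at most 500 files per topic
        }
    }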
@@ -93,7 +100,7 @@ public void start(String directoryName) throws IOException {
 
         Instant timeStart = Instant.now();
         // Get filenames to process
-        TopicFileList topicPaths = getTopicPaths(fs, path, accountant.getOffsets());
+        List<TopicFileList> topicPaths = getTopicPaths(fs, path, accountant.getOffsets());
         logger.info("Time retrieving file list: {}",
                 formatTime(Duration.between(timeStart, Instant.now())));
 
@@ -104,12 +111,16 @@ public void start(String directoryName) throws IOException {
         }
     }
 
-    private TopicFileList getTopicPaths(FileSystem fs, Path path, OffsetRangeSet seenFiles) {
-        return new TopicFileList(walk(fs, path)
+    private List<TopicFileList> getTopicPaths(FileSystem fs, Path path, OffsetRangeSet seenFiles) {
+        Map<String, List<TopicFile>> topics = walk(fs, path)
                 .filter(f -> f.getName().endsWith(".avro"))
                 .map(f -> new TopicFile(f.getParent().getParent().getName(), f))
                 .filter(f -> !seenFiles.contains(f.range))
-                .collect(Collectors.toList()));
+                .collect(Collectors.groupingBy(TopicFile::getTopic));
+
+        return topics.values().stream()
+                .map(v -> new TopicFileList(v.stream().limit(maxFilesPerTopic)))
+                .collect(Collectors.toList());
     }
 
     private Stream<Path> walk(FileSystem fs, Path path) {
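The rewritten getTopicPaths first groups the unseen .avro files by topic and then caps each group with Stream.limit. A self-contained sketch of that group-then-cap pattern, not part of the commit, using plain path strings in place of TopicFile (all names and values are illustrative):

    // Illustrative sketch: group paths by topic and keep at most
    // maxFilesPerTopic entries per group.
    import java.util.List;
    import java.util.Map;
    import java.util.stream.Collectors;
    import java.util.stream.Stream;

    public class GroupAndLimitSketch {
        public static void main(String[] args) {
            long maxFilesPerTopic = 2;  // assumed limit for this example
            Map<String, List<String>> byTopic = Stream.of(
                    "topicA/partition0/a.avro", "topicA/partition0/b.avro",
                    "topicA/partition1/c.avro", "topicB/partition0/d.avro")
                .collect(Collectors.groupingBy(f -> f.substring(0, f.indexOf('/'))));

            List<List<String>> limited = byTopic.values().stream()
                .map(v -> v.stream().limit(maxFilesPerTopic).collect(Collectors.toList()))
                .collect(Collectors.toList());

            limited.forEach(System.out::println);  // topicA keeps 2 of its 3 files
        }
    }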
@@ -133,9 +144,16 @@ private Stream<Path> walk(FileSystem fs, Path path) {
         });
     }
 
-    private void processPaths(TopicFileList topicPaths, Accountant accountant) throws InterruptedException {
+    private void processPaths(List<TopicFileList> topicPaths, Accountant accountant) throws InterruptedException {
+        int numFiles = topicPaths.stream()
+                .mapToInt(TopicFileList::numberOfFiles)
+                .sum();
+        long numOffsets = topicPaths.stream()
+                .mapToLong(TopicFileList::numberOfOffsets)
+                .sum();
+
         logger.info("Converting {} files with {} records",
-                topicPaths.files.size(), NumberFormat.getNumberInstance().format(topicPaths.size));
+                numFiles, NumberFormat.getNumberInstance().format(numOffsets));
 
         processedFileCount = new LongAdder();
         processedRecordsCount = new LongAdder();
@@ -144,14 +162,13 @@ private void processPaths(TopicFileList topicPaths, Accountant accountant) throw
 
         ExecutorService executor = Executors.newWorkStealingPool(pathFactory.isTopicPartitioned() ? this.numThreads : 1);
 
-        ProgressBar progressBar = new ProgressBar(topicPaths.size, 50, 500, TimeUnit.MILLISECONDS);
+        ProgressBar progressBar = new ProgressBar(numOffsets, 50, 500, TimeUnit.MILLISECONDS);
 
         // Actually process the files
-        topicPaths.files.stream()
-                .collect(Collectors.groupingBy(TopicFile::getTopic)).values().stream()
-                .map(TopicFileList::new)
+
+        topicPaths.stream()
                 // ensure that largest values go first on the executor queue
-                .sorted(Comparator.comparingLong(TopicFileList::getSize).reversed())
+                .sorted(Comparator.comparingLong(TopicFileList::numberOfOffsets).reversed())
                 .forEach(paths -> {
                     String size = NumberFormat.getNumberInstance().format(paths.size);
                     String topic = paths.files.get(0).topic;
@@ -186,7 +203,7 @@ private void processPaths(TopicFileList topicPaths, Accountant accountant) throw
 
         executor.shutdown();
         executor.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);
-        progressBar.update(topicPaths.size);
+        progressBar.update(numOffsets);
     }
 
     private void processFile(TopicFile file, FileCacheStore cache,
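processPaths now derives its totals and the progress-bar size from the per-topic lists rather than from one flat file list, and still submits the largest batches first. A standalone sketch of that aggregation and ordering, not part of the commit (the Batch class is a hypothetical stand-in for TopicFileList):

    // Illustrative sketch: sum per-topic counts, then order batches so the
    // largest offset counts are handed to the executor first.
    import java.util.Arrays;
    import java.util.Comparator;
    import java.util.List;

    public class LargestFirstSketch {
        static class Batch {
            final String topic;
            final int numberOfFiles;
            final long numberOfOffsets;

            Batch(String topic, int numberOfFiles, long numberOfOffsets) {
                this.topic = topic;
                this.numberOfFiles = numberOfFiles;
                this.numberOfOffsets = numberOfOffsets;
            }
        }

        public static void main(String[] args) {
            List<Batch> batches = Arrays.asList(
                    new Batch("topicA", 3, 1200L),
                    new Batch("topicB", 1, 4500L));

            int numFiles = batches.stream().mapToInt(b -> b.numberOfFiles).sum();
            long numOffsets = batches.stream().mapToLong(b -> b.numberOfOffsets).sum();
            System.out.println("Converting " + numFiles + " files with " + numOffsets + " records");

            batches.stream()
                    .sorted(Comparator.comparingLong((Batch b) -> b.numberOfOffsets).reversed())
                    .forEach(b -> System.out.println(b.topic));  // topicB is queued first
        }
    }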
@@ -260,14 +277,18 @@ private static class TopicFileList {
         private final List<TopicFile> files;
         private final long size;
 
-        public TopicFileList(List<TopicFile> files) {
-            this.files = files;
-            this.size = files.stream()
+        public TopicFileList(Stream<TopicFile> files) {
+            this.files = files.collect(Collectors.toList());
+            this.size = this.files.stream()
                     .mapToInt(TopicFile::size)
                     .sum();
         }
 
-        public long getSize() {
+        public int numberOfFiles() {
+            return this.files.size();
+        }
+
+        public long numberOfOffsets() {
             return size;
         }
     }
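Finally, the constructor change lets callers hand TopicFileList an already limited Stream and still read both counters. A stripped-down, self-contained version of the new shape, not part of the commit (TopicFile is mocked here with only a record count):

    // Illustrative sketch: a minimal TopicFileList with the Stream-based
    // constructor and the numberOfFiles()/numberOfOffsets() accessors.
    import java.util.List;
    import java.util.stream.Collectors;
    import java.util.stream.Stream;

    public class TopicFileListSketch {
        static class TopicFile {
            final int records;
            TopicFile(int records) { this.records = records; }
            int size() { return records; }
        }

        static class TopicFileList {
            private final List<TopicFile> files;
            private final long size;

            TopicFileList(Stream<TopicFile> files) {
                this.files = files.collect(Collectors.toList());
                this.size = this.files.stream().mapToInt(TopicFile::size).sum();
            }

            int numberOfFiles() { return files.size(); }
            long numberOfOffsets() { return size; }
        }

        public static void main(String[] args) {
            TopicFileList list = new TopicFileList(
                    Stream.of(new TopicFile(100), new TopicFile(250)).limit(2));
            System.out.println(list.numberOfFiles() + " files, "
                    + list.numberOfOffsets() + " offsets");  // 2 files, 350 offsets
        }
    }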