Skip to content

Commit af1d14b

Browse files
authored
Merge pull request #38 from RADAR-base/release-0.5.4
Release 0.5.4
2 parents f8e0a5c + 483277d commit af1d14b

File tree

5 files changed

+54
-17
lines changed

5 files changed

+54
-17
lines changed

build.gradle

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ plugins {
77
}
88

99
group 'org.radarcns'
10-
version '0.5.3'
10+
version '0.5.4'
1111
mainClassName = 'org.radarcns.hdfs.Application'
1212

1313
sourceCompatibility = '1.8'

src/main/java/org/radarcns/hdfs/Application.java

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -114,6 +114,7 @@ public static void main(String [] args) {
114114
.doDeduplicate(commandLineArgs.deduplicate)
115115
.tempDir(commandLineArgs.tmpDir)
116116
.numThreads(commandLineArgs.numThreads)
117+
.maxFilesPerTopic(commandLineArgs.maxFilesPerTopic)
117118
.build();
118119

119120
HdfsSettings hdfsSettings = new HdfsSettings.Builder(commandLineArgs.hdfsName)

src/main/java/org/radarcns/hdfs/RadarHdfsRestructure.java

Lines changed: 37 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@
2626
import java.time.Instant;
2727
import java.util.Comparator;
2828
import java.util.List;
29+
import java.util.Map;
2930
import java.util.concurrent.ExecutorService;
3031
import java.util.concurrent.Executors;
3132
import java.util.concurrent.ThreadLocalRandom;
@@ -63,6 +64,7 @@ public class RadarHdfsRestructure {
6364
private final Configuration conf;
6465
private final FileStoreFactory fileStoreFactory;
6566
private final RecordPathFactory pathFactory;
67+
private final long maxFilesPerTopic;
6668

6769
private LongAdder processedFileCount;
6870
private LongAdder processedRecordsCount;
@@ -71,6 +73,11 @@ public RadarHdfsRestructure(FileStoreFactory factory) {
7173
conf = factory.getHdfsSettings().getConfiguration();
7274
conf.set("fs.defaultFS", "hdfs://" + factory.getHdfsSettings().getHdfsName());
7375
this.numThreads = factory.getSettings().getNumThreads();
76+
long maxFiles = factory.getSettings().getMaxFilesPerTopic();
77+
if (maxFiles < 1) {
78+
maxFiles = Long.MAX_VALUE;
79+
}
80+
this.maxFilesPerTopic = maxFiles;
7481
this.fileStoreFactory = factory;
7582
this.pathFactory = factory.getPathFactory();
7683
}
@@ -93,7 +100,7 @@ public void start(String directoryName) throws IOException {
93100

94101
Instant timeStart = Instant.now();
95102
// Get filenames to process
96-
TopicFileList topicPaths = getTopicPaths(fs, path, accountant.getOffsets());
103+
List<TopicFileList> topicPaths = getTopicPaths(fs, path, accountant.getOffsets());
97104
logger.info("Time retrieving file list: {}",
98105
formatTime(Duration.between(timeStart, Instant.now())));
99106

@@ -104,12 +111,16 @@ public void start(String directoryName) throws IOException {
104111
}
105112
}
106113

107-
private TopicFileList getTopicPaths(FileSystem fs, Path path, OffsetRangeSet seenFiles) {
108-
return new TopicFileList(walk(fs, path)
114+
private List<TopicFileList> getTopicPaths(FileSystem fs, Path path, OffsetRangeSet seenFiles) {
115+
Map<String, List<TopicFile>> topics = walk(fs, path)
109116
.filter(f -> f.getName().endsWith(".avro"))
110117
.map(f -> new TopicFile(f.getParent().getParent().getName(), f))
111118
.filter(f -> !seenFiles.contains(f.range))
112-
.collect(Collectors.toList()));
119+
.collect(Collectors.groupingBy(TopicFile::getTopic));
120+
121+
return topics.values().stream()
122+
.map(v -> new TopicFileList(v.stream().limit(maxFilesPerTopic)))
123+
.collect(Collectors.toList());
113124
}
114125

115126
private Stream<Path> walk(FileSystem fs, Path path) {
@@ -133,9 +144,16 @@ private Stream<Path> walk(FileSystem fs, Path path) {
133144
});
134145
}
135146

136-
private void processPaths(TopicFileList topicPaths, Accountant accountant) throws InterruptedException {
147+
private void processPaths(List<TopicFileList> topicPaths, Accountant accountant) throws InterruptedException {
148+
int numFiles = topicPaths.stream()
149+
.mapToInt(TopicFileList::numberOfFiles)
150+
.sum();
151+
long numOffsets = topicPaths.stream()
152+
.mapToLong(TopicFileList::numberOfOffsets)
153+
.sum();
154+
137155
logger.info("Converting {} files with {} records",
138-
topicPaths.files.size(), NumberFormat.getNumberInstance().format(topicPaths.size));
156+
numFiles, NumberFormat.getNumberInstance().format(numOffsets));
139157

140158
processedFileCount = new LongAdder();
141159
processedRecordsCount = new LongAdder();
@@ -144,14 +162,13 @@ private void processPaths(TopicFileList topicPaths, Accountant accountant) throw
144162

145163
ExecutorService executor = Executors.newWorkStealingPool(pathFactory.isTopicPartitioned() ? this.numThreads : 1);
146164

147-
ProgressBar progressBar = new ProgressBar(topicPaths.size, 50, 500, TimeUnit.MILLISECONDS);
165+
ProgressBar progressBar = new ProgressBar(numOffsets, 50, 500, TimeUnit.MILLISECONDS);
148166

149167
// Actually process the files
150-
topicPaths.files.stream()
151-
.collect(Collectors.groupingBy(TopicFile::getTopic)).values().stream()
152-
.map(TopicFileList::new)
168+
169+
topicPaths.stream()
153170
// ensure that largest values go first on the executor queue
154-
.sorted(Comparator.comparingLong(TopicFileList::getSize).reversed())
171+
.sorted(Comparator.comparingLong(TopicFileList::numberOfOffsets).reversed())
155172
.forEach(paths -> {
156173
String size = NumberFormat.getNumberInstance().format(paths.size);
157174
String topic = paths.files.get(0).topic;
@@ -186,7 +203,7 @@ private void processPaths(TopicFileList topicPaths, Accountant accountant) throw
186203

187204
executor.shutdown();
188205
executor.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);
189-
progressBar.update(topicPaths.size);
206+
progressBar.update(numOffsets);
190207
}
191208

192209
private void processFile(TopicFile file, FileCacheStore cache,
@@ -260,14 +277,18 @@ private static class TopicFileList {
260277
private final List<TopicFile> files;
261278
private final long size;
262279

263-
public TopicFileList(List<TopicFile> files) {
264-
this.files = files;
265-
this.size = files.stream()
280+
public TopicFileList(Stream<TopicFile> files) {
281+
this.files = files.collect(Collectors.toList());
282+
this.size = this.files.stream()
266283
.mapToInt(TopicFile::size)
267284
.sum();
268285
}
269286

270-
public long getSize() {
287+
public int numberOfFiles() {
288+
return this.files.size();
289+
}
290+
291+
public long numberOfOffsets() {
271292
return size;
272293
}
273294
}

src/main/java/org/radarcns/hdfs/config/RestructureSettings.java

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@ public class RestructureSettings {
3333
private final Path tempDir;
3434
private final Path outputPath;
3535
private final int numThreads;
36+
private final int maxFilesPerTopic;
3637

3738
private RestructureSettings(Builder builder) {
3839
this.compression = builder.compression;
@@ -42,6 +43,7 @@ private RestructureSettings(Builder builder) {
4243
this.tempDir = builder.tempDir;
4344
this.outputPath = builder.outputPath;
4445
this.numThreads = builder.numThreads;
46+
this.maxFilesPerTopic = builder.maxFilesPerTopic;
4547
}
4648

4749
public String getCompression() {
@@ -72,6 +74,10 @@ public int getNumThreads() {
7274
return this.numThreads;
7375
}
7476

77+
public int getMaxFilesPerTopic() {
78+
return maxFilesPerTopic;
79+
}
80+
7581
public static class Builder {
7682
private int numThreads = 1;
7783
private String compression;
@@ -80,6 +86,7 @@ public static class Builder {
8086
private int cacheSize = CACHE_SIZE_DEFAULT;
8187
private Path tempDir;
8288
private final Path outputPath;
89+
public int maxFilesPerTopic;
8390

8491
public Builder(String outputPath) {
8592
this.outputPath = Paths.get(outputPath.replaceAll("/+$", ""));
@@ -124,6 +131,11 @@ public Builder numThreads(int num) {
124131
return this;
125132
}
126133

134+
public Builder maxFilesPerTopic(int num) {
135+
this.maxFilesPerTopic = num;
136+
return this;
137+
}
138+
127139
public RestructureSettings build() {
128140
compression = nonNullOrDefault(compression, () -> "identity");
129141
format = nonNullOrDefault(format, () -> "csv");

src/main/java/org/radarcns/hdfs/util/commandline/CommandLineArgs.java

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -90,6 +90,9 @@ public class CommandLineArgs {
9090
@Parameter(names = {"-s", "--cache-size"}, description = "Number of files to keep in cache in a single thread.", validateWith = PositiveInteger.class)
9191
public int cacheSize = CACHE_SIZE_DEFAULT;
9292

93+
@Parameter(names = {"--max-files-per-topic"}, description = "Maximum number of files to process, per topic. Set below 1 to disable this option.")
94+
public int maxFilesPerTopic = 0;
95+
9396
public static <T> T nonNullOrDefault(T value, Supplier<T> defaultValue) {
9497
return value != null ? value : defaultValue.get();
9598
}

0 commit comments

Comments
 (0)