Skip to content

Commit cea4c16

Browse files
author
Maxim Moinat
committed
Skip already processed files
1 parent eeb14e1 commit cea4c16

File tree

1 file changed

+61
-15
lines changed

1 file changed

+61
-15
lines changed

src/main/java/org/radarcns/restructureAvroRecords.java

Lines changed: 61 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -18,31 +18,44 @@
1818

1919
public class restructureAvroRecords {
2020

21-
String outputPath = ".";
22-
String OUTPUT_FILE_EXTENSION = "json";
23-
String OFFSETS_FILE_NAME = "offsets.csv";
24-
Configuration conf = new Configuration();
25-
SimpleDateFormat dateFormatFileName = new SimpleDateFormat("yyyyMMdd_HH");
21+
private final String OUTPUT_FILE_EXTENSION = "json";
22+
private final String OFFSETS_FILE_NAME = "offsets.csv";
23+
24+
private String outputPath = ".";
25+
private String offsetsPath = outputPath + "/" + OFFSETS_FILE_NAME;
26+
private Set<String> seenFiles = new HashSet<>();
27+
28+
private Configuration conf = new Configuration();
29+
private final SimpleDateFormat dateFormatFileName = new SimpleDateFormat("yyyyMMdd_HH");
30+
31+
private int processedFileCount;
2632

2733
public static void main(String [] args) throws Exception {
2834

29-
restructureAvroRecords restr = new restructureAvroRecords();
30-
restr.setInputWebHdfsURL(args[0]); //"webhdfs://radar-test.thehyve.net:50070");
31-
restr.setOutputPath(args[2]); //"output3/");
35+
// restructureAvroRecords restr = new restructureAvroRecords(args[0], args[2]);
36+
// restr.start(args[1]);
37+
38+
restructureAvroRecords restr = new restructureAvroRecords("webhdfs://radar-test.thehyve.net:50070", "output3/");
39+
restr.start("/topicAndroidPhoneNew/");
3240

33-
restr.start(args[1]); //"/topicAndroidPhoneNew/");
3441
// restructureAvroRecords.processTopic("/topicE4/android_empatica_e4_inter_beat_interval/partition=0/");
3542
// restructureAvroRecords.processAvroFile(new Path("/topicE4/android_empatica_e4_inter_beat_interval/partition=0/android_empatica_e4_inter_beat_interval+0+0000031485+0000031488.avro") );
3643
// restructureAvroRecords.processAvroFile(new Path("/testE4Time/android_phone_acceleration/partition=0/android_phone_acceleration+0+0000590000+0000599999.avro"),"wazaa" );
3744
}
3845

46+
/**
 * Creates a restructurer for one input file system and one output directory.
 *
 * @param inputPath  file system URL, handed to {@link #setInputWebHdfsURL(String)}
 * @param outputPath output directory, handed to {@link #setOutputPath(String)}
 */
public restructureAvroRecords(String inputPath, String outputPath) {
    // Delegate to the setters so construction and later reconfiguration share one code path.
    setInputWebHdfsURL(inputPath);
    setOutputPath(outputPath);
}
50+
3951
public void setInputWebHdfsURL(String fileSystemURL) {
40-
conf.set("fs.defaultFS",fileSystemURL);
52+
conf.set("fs.defaultFS", fileSystemURL);
4153
}
4254

4355
public void setOutputPath(String path) {
4456
// Remove trailing backslash
4557
outputPath = path.replaceAll("/$","");
58+
offsetsPath = outputPath + "/" + OFFSETS_FILE_NAME;
4659
}
4760

4861
public void start(String directoryName) throws IOException {
@@ -51,7 +64,11 @@ public void start(String directoryName) throws IOException {
5164
FileSystem fs = FileSystem.get(conf);
5265
RemoteIterator<LocatedFileStatus> files = fs.listLocatedStatus(path);
5366

67+
// Load seen offsets from file
68+
readSeenOffsets();
69+
5470
// Process the directories topics
71+
processedFileCount = 0;
5572
while (files.hasNext()) {
5673
LocatedFileStatus locatedFileStatus = files.next();
5774
Path filePath = locatedFileStatus.getPath();
@@ -64,6 +81,7 @@ public void start(String directoryName) throws IOException {
6481
processTopic(filePath);
6582
}
6683
}
84+
System.out.printf("%d files processed\n", processedFileCount);
6785
}
6886

6987
public void processTopic(Path topicPath) throws IOException {
@@ -80,6 +98,7 @@ public void processTopic(Path topicPath) throws IOException {
8098

8199
if (locatedFileStatus.isFile())
82100
this.processAvroFile( locatedFileStatus.getPath(), topicName );
101+
83102
}
84103
}
85104

@@ -88,7 +107,12 @@ public void processAvroFile(Path filePath, String topicName) throws IOException
88107

89108
// Skip if extension is not .avro
90109
if (! FilenameUtils.getExtension(fileName).equals("avro")) {
91-
System.out.printf("Skipped non avro file: %s\n", filePath.getName());
110+
System.out.printf("Skipped non avro file: %s\n", fileName);
111+
return;
112+
}
113+
114+
// Skip already processed avro files
115+
if (seenFiles.contains(fileName)) {
92116
return;
93117
}
94118

@@ -106,6 +130,7 @@ record = dataFileReader.next(record);
106130
}
107131

108132
this.writeSeenOffsets(fileName);
133+
processedFileCount++;
109134
}
110135

111136
public void writeRecord(GenericRecord record, String topicName) throws IOException {
@@ -139,8 +164,10 @@ public String createFilePathFromTimestamp(Double time) {
139164
public void appendToFile(String directoryName, String fileName, String data) {
140165
File directory = new File(directoryName);
141166
if (! directory.exists()){
142-
directory.mkdirs();
143-
System.out.printf("Created directory: %s\n", directory.getAbsolutePath());
167+
if (directory.mkdirs())
168+
System.out.printf("Created directory: %s\n", directory.getAbsolutePath());
169+
else
170+
System.out.printf("FAILED to create directory: %s\n", directory.getAbsolutePath());
144171
}
145172

146173
String filePath = directoryName + "/" + fileName;
@@ -175,9 +202,9 @@ private void writeSeenOffsets(String fileName) {
175202
return;
176203
}
177204

178-
String data = String.join(",", topicName, partition, fromOffset.toString(), toOffset.toString());
205+
String data = String.join(",", fileName, topicName, partition, fromOffset.toString(), toOffset.toString());
179206

180-
try(FileWriter fw = new FileWriter(outputPath + "/" + OFFSETS_FILE_NAME, true);
207+
try(FileWriter fw = new FileWriter(offsetsPath, true);
181208
BufferedWriter bw = new BufferedWriter(fw);
182209
PrintWriter out = new PrintWriter(bw))
183210
{
@@ -187,4 +214,23 @@ private void writeSeenOffsets(String fileName) {
187214
e.printStackTrace();
188215
}
189216
}
217+
218+
private void readSeenOffsets() {
219+
220+
try (FileReader fr = new FileReader(offsetsPath);
221+
BufferedReader br = new BufferedReader(fr))
222+
{
223+
// Read in all file names from csv
224+
String line;
225+
while ( (line = br.readLine()) != null ) {
226+
String[] columns = line.split(",");
227+
seenFiles.add(columns[0]);
228+
}
229+
230+
} catch (IOException e) {
231+
// TODO
232+
e.printStackTrace();
233+
}
234+
235+
}
190236
}

0 commit comments

Comments
 (0)