19
19
import java .io .File ;
20
20
import java .io .IOException ;
21
21
import java .text .SimpleDateFormat ;
22
+ import java .util .ArrayList ;
22
23
import java .util .Date ;
24
+ import java .util .HashMap ;
25
+ import java .util .List ;
26
+ import java .util .Map ;
23
27
import java .util .TimeZone ;
24
28
import org .apache .avro .Schema .Field ;
25
29
import org .apache .avro .file .DataFileReader ;
26
30
import org .apache .avro .generic .GenericDatumReader ;
27
31
import org .apache .avro .generic .GenericRecord ;
28
32
import org .apache .avro .mapred .FsInput ;
29
- import org .apache .commons .io .FilenameUtils ;
30
33
import org .apache .hadoop .conf .Configuration ;
31
34
import org .apache .hadoop .fs .FileSystem ;
32
35
import org .apache .hadoop .fs .LocatedFileStatus ;
35
38
import org .radarcns .util .CsvAvroConverter ;
36
39
import org .radarcns .util .FileCacheStore ;
37
40
import org .radarcns .util .JsonAvroConverter ;
41
+ import org .radarcns .util .ProgressBar ;
38
42
import org .radarcns .util .RecordConverterFactory ;
39
43
import org .slf4j .Logger ;
40
44
import org .slf4j .LoggerFactory ;
@@ -55,18 +59,17 @@ public class RestructureAvroRecords {
55
59
56
60
private File outputPath ;
57
61
private File offsetsPath ;
58
- private OffsetRangeSet seenFiles ;
59
62
private Frequency bins ;
60
63
61
64
private final Configuration conf = new Configuration ();
62
65
63
- private int processedFileCount ;
64
- private int processedRecordsCount ;
66
+ private long processedFileCount ;
67
+ private long processedRecordsCount ;
65
68
private static final boolean USE_GZIP = "gzip" .equalsIgnoreCase (System .getProperty ("org.radarcns.compression" ));
66
69
67
70
public static void main (String [] args ) throws Exception {
68
71
if (args .length != 3 ) {
69
- System .out .println ("Usage: hadoop jar restructurehdfs-all-0.1.0. jar <webhdfs_url> <hdfs_topic > <output_folder>" );
72
+ System .out .println ("Usage: hadoop jar restructurehdfs-all-0.2. jar <webhdfs_url> <hdfs_root_directory > <output_folder>" );
70
73
System .exit (1 );
71
74
}
72
75
@@ -120,80 +123,89 @@ public void setOutputPath(String path) {
120
123
bins = Frequency .read (new File (outputPath , BINS_FILE_NAME ));
121
124
}
122
125
123
- public int getProcessedFileCount () {
126
+ public long getProcessedFileCount () {
124
127
return processedFileCount ;
125
128
}
126
129
127
- public int getProcessedRecordsCount () {
130
+ public long getProcessedRecordsCount () {
128
131
return processedRecordsCount ;
129
132
}
130
133
131
134
public void start (String directoryName ) throws IOException {
132
135
// Get files and directories
133
136
Path path = new Path (directoryName );
134
137
FileSystem fs = FileSystem .get (conf );
135
- RemoteIterator < LocatedFileStatus > files = fs . listLocatedStatus ( path );
138
+
136
139
137
140
try (OffsetRangeFile offsets = new OffsetRangeFile (offsetsPath )) {
141
+ OffsetRangeSet seenFiles ;
138
142
try {
139
143
seenFiles = offsets .read ();
140
144
} catch (IOException ex ) {
141
145
logger .error ("Error reading offsets file. Processing all offsets." );
142
146
seenFiles = new OffsetRangeSet ();
143
147
}
144
- // Process the directories topics
145
- processedFileCount = 0 ;
148
+ logger .info ("Retrieving file list from {}" , path );
149
+ // Get filenames to process
150
+ Map <String , List <Path >> topicPaths = new HashMap <>();
151
+ long toProcessFileCount = 0L ;
152
+ processedFileCount = 0L ;
153
+ RemoteIterator <LocatedFileStatus > files = fs .listFiles (path , true );
146
154
while (files .hasNext ()) {
147
155
LocatedFileStatus locatedFileStatus = files .next ();
148
- Path filePath = locatedFileStatus .getPath ();
149
-
150
- if (filePath .toString ().contains ("+tmp" )) {
156
+ if (locatedFileStatus .isDirectory ()) {
151
157
continue ;
152
158
}
159
+ Path filePath = locatedFileStatus .getPath ();
153
160
154
- if (locatedFileStatus .isDirectory ()) {
155
- processTopic (filePath , converterFactory , offsets );
161
+ String topic = getTopic (filePath , seenFiles );
162
+ if (topic != null ) {
163
+ topicPaths .computeIfAbsent (topic , k -> new ArrayList <>()).add (filePath );
164
+ toProcessFileCount ++;
156
165
}
157
166
}
158
- }
159
- }
160
167
161
- private void processTopic (Path topicPath , RecordConverterFactory converterFactory ,
162
- OffsetRangeFile offsets ) throws IOException {
163
- // Get files in this topic directory
164
- FileSystem fs = FileSystem .get (conf );
165
- RemoteIterator <LocatedFileStatus > files = fs .listFiles (topicPath , true );
168
+ logger .info ("Converting {} files" , toProcessFileCount );
166
169
167
- String topicName = topicPath .getName ();
170
+ ProgressBar progressBar = new ProgressBar (toProcessFileCount , 10 );
171
+ progressBar .update (0 );
168
172
169
- try (FileCacheStore cache = new FileCacheStore (converterFactory , 100 , USE_GZIP )) {
170
- while (files .hasNext ()) {
171
- LocatedFileStatus locatedFileStatus = files .next ();
172
-
173
- if (locatedFileStatus .isFile ()) {
174
- this .processFile (locatedFileStatus .getPath (), topicName , cache , offsets );
173
+ // Actually process the files
174
+ for (Map .Entry <String , List <Path >> entry : topicPaths .entrySet ()) {
175
+ try (FileCacheStore cache = new FileCacheStore (converterFactory , 100 , USE_GZIP )) {
176
+ for (Path filePath : entry .getValue ()) {
177
+ this .processFile (filePath , entry .getKey (), cache , offsets );
178
+ progressBar .update (++processedFileCount );
179
+ }
175
180
}
176
181
}
177
182
}
178
183
}
179
184
180
- private void processFile (Path filePath , String topicName , FileCacheStore cache ,
181
- OffsetRangeFile offsets ) throws IOException {
182
- String fileName = filePath .getName ();
185
+ private static String getTopic (Path filePath , OffsetRangeSet seenFiles ) {
186
+ if (filePath .toString ().contains ("+tmp" )) {
187
+ return null ;
188
+ }
183
189
190
+ String fileName = filePath .getName ();
184
191
// Skip if extension is not .avro
185
- if (!FilenameUtils . getExtension ( fileName ). equals ( " avro" )) {
186
- logger .info ("Skipped non-avro file: {}" , fileName );
187
- return ;
192
+ if (!fileName . endsWith ( ". avro" )) {
193
+ logger .info ("Skipping non-avro file: {}" , fileName );
194
+ return null ;
188
195
}
189
196
190
197
OffsetRange range = OffsetRange .parse (fileName );
191
198
// Skip already processed avro files
192
199
if (seenFiles .contains (range )) {
193
- return ;
200
+ return null ;
194
201
}
195
202
196
- logger .info ("{}" , filePath );
203
+ return filePath .getParent ().getParent ().getName ();
204
+ }
205
+
206
+ private void processFile (Path filePath , String topicName , FileCacheStore cache ,
207
+ OffsetRangeFile offsets ) throws IOException {
208
+ logger .debug ("Reading {}" , filePath );
197
209
198
210
// Read and parse avro file
199
211
FsInput input = new FsInput (filePath , conf );
@@ -210,12 +222,12 @@ record = dataFileReader.next(record);
210
222
211
223
// Write which file has been processed and update bins
212
224
try {
225
+ OffsetRange range = OffsetRange .parse (filePath .getName ());
213
226
offsets .write (range );
214
227
bins .write ();
215
228
} catch (IOException ex ) {
216
229
logger .warn ("Failed to update status. Continuing processing." , ex );
217
230
}
218
- processedFileCount ++;
219
231
}
220
232
221
233
private void writeRecord (GenericRecord record , String topicName , FileCacheStore cache )
@@ -265,4 +277,5 @@ public static String createHourTimestamp(GenericRecord valueField, Field timeFie
265
277
Date date = new Date ((long ) (time * 1000d ));
266
278
return FILE_DATE_FORMAT .format (date );
267
279
}
280
+
268
281
}
0 commit comments