import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
+ import java.util.ArrayList;
import java.util.Date;
+ import java.util.HashMap;
+ import java.util.List;
+ import java.util.Map;
import java.util.TimeZone;
import org.apache.avro.Schema.Field;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.FsInput;
- import org.apache.commons.io.FilenameUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.radarcns.util.CsvAvroConverter;
- import org.radarcns.util.FileCache;
+ import org.radarcns.util.FileCacheStore;
import org.radarcns.util.JsonAvroConverter;
+ import org.radarcns.util.ProgressBar;
import org.radarcns.util.RecordConverterFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -55,17 +59,17 @@ public class RestructureAvroRecords {

    private File outputPath;
    private File offsetsPath;
-     private OffsetRangeSet seenFiles;
    private Frequency bins;

    private final Configuration conf = new Configuration();

-     private int processedFileCount;
-     private int processedRecordsCount;
+     private long processedFileCount;
+     private long processedRecordsCount;
+     private static final boolean USE_GZIP = "gzip".equalsIgnoreCase(System.getProperty("org.radarcns.compression"));

    public static void main(String[] args) throws Exception {
        if (args.length != 3) {
-             System.out.println("Usage: hadoop jar restructurehdfs-all-0.1.0.jar <webhdfs_url> <hdfs_topic> <output_folder>");
+             System.out.println("Usage: hadoop jar restructurehdfs-all-0.2.jar <webhdfs_url> <hdfs_root_directory> <output_folder>");
            System.exit(1);
        }

@@ -91,13 +95,21 @@ public RestructureAvroRecords(String inputPath, String outputPath) {
        this.setInputWebHdfsURL(inputPath);
        this.setOutputPath(outputPath);

+         String extension;
        if (System.getProperty("org.radarcns.format", "csv").equalsIgnoreCase("json")) {
+             logger.info("Writing output files in JSON format");
            converterFactory = JsonAvroConverter.getFactory();
-             outputFileExtension = "json";
+             extension = "json";
        } else {
+             logger.info("Writing output files in CSV format");
            converterFactory = CsvAvroConverter.getFactory();
-             outputFileExtension = "csv";
+             extension = "csv";
        }
+         if (USE_GZIP) {
+             logger.info("Compressing output files in GZIP format");
+             extension += ".gz";
+         }
+         outputFileExtension = extension;
    }

    public void setInputWebHdfsURL(String fileSystemURL) {
@@ -111,83 +123,100 @@ public void setOutputPath(String path) {
        bins = Frequency.read(new File(outputPath, BINS_FILE_NAME));
    }

-     public int getProcessedFileCount() {
+     public long getProcessedFileCount() {
        return processedFileCount;
    }

-     public int getProcessedRecordsCount() {
+     public long getProcessedRecordsCount() {
        return processedRecordsCount;
    }

    public void start(String directoryName) throws IOException {
        // Get files and directories
        Path path = new Path(directoryName);
        FileSystem fs = FileSystem.get(conf);
-         RemoteIterator<LocatedFileStatus> files = fs.listLocatedStatus(path);
+

        try (OffsetRangeFile offsets = new OffsetRangeFile(offsetsPath)) {
+             OffsetRangeSet seenFiles;
            try {
                seenFiles = offsets.read();
            } catch (IOException ex) {
                logger.error("Error reading offsets file. Processing all offsets.");
                seenFiles = new OffsetRangeSet();
            }
-             // Process the directories topics
-             processedFileCount = 0;
+             logger.info("Retrieving file list from {}", path);
+             // Get filenames to process
+             Map<String, List<Path>> topicPaths = new HashMap<>();
+             long toProcessFileCount = 0L;
+             processedFileCount = 0L;
+             RemoteIterator<LocatedFileStatus> files = fs.listFiles(path, true);
            while (files.hasNext()) {
                LocatedFileStatus locatedFileStatus = files.next();
-                 Path filePath = locatedFileStatus.getPath();
-
-                 if (filePath.toString().contains("+tmp")) {
+                 if (locatedFileStatus.isDirectory()) {
                    continue;
                }
+                 Path filePath = locatedFileStatus.getPath();

-                 if (locatedFileStatus.isDirectory()) {
-                     processTopic(filePath, converterFactory, offsets);
+                 String topic = getTopic(filePath, seenFiles);
+                 if (topic != null) {
+                     topicPaths.computeIfAbsent(topic, k -> new ArrayList<>()).add(filePath);
+                     toProcessFileCount++;
                }
            }
-         }
-     }

-     private void processTopic(Path topicPath, RecordConverterFactory converterFactory,
-             OffsetRangeFile offsets) throws IOException {
-         // Get files in this topic directory
-         FileSystem fs = FileSystem.get(conf);
-         RemoteIterator<LocatedFileStatus> files = fs.listFiles(topicPath, true);
+             logger.info("Converting {} files", toProcessFileCount);

-         String topicName = topicPath.getName();
-
-         try (FileCache cache = new FileCache(converterFactory, 100)) {
-             while (files.hasNext()) {
-                 LocatedFileStatus locatedFileStatus = files.next();
+             ProgressBar progressBar = new ProgressBar(toProcessFileCount, 10);
+             progressBar.update(0);

-                 if (locatedFileStatus.isFile()) {
-                     this.processFile(locatedFileStatus.getPath(), topicName, cache, offsets);
+             // Actually process the files
+             for (Map.Entry<String, List<Path>> entry : topicPaths.entrySet()) {
+                 try (FileCacheStore cache = new FileCacheStore(converterFactory, 100, USE_GZIP)) {
+                     for (Path filePath : entry.getValue()) {
+                         this.processFile(filePath, entry.getKey(), cache, offsets);
+                         progressBar.update(++processedFileCount);
+                     }
                }
            }
        }
    }

-     private void processFile(Path filePath, String topicName, FileCache cache,
-             OffsetRangeFile offsets) throws IOException {
-         String fileName = filePath.getName();
+     private static String getTopic(Path filePath, OffsetRangeSet seenFiles) {
+         if (filePath.toString().contains("+tmp")) {
+             return null;
+         }

+         String fileName = filePath.getName();
        // Skip if extension is not .avro
-         if (!FilenameUtils.getExtension(fileName).equals("avro")) {
-             logger.info("Skipped non-avro file: {}", fileName);
-             return;
+         if (!fileName.endsWith(".avro")) {
+             logger.info("Skipping non-avro file: {}", fileName);
+             return null;
        }

        OffsetRange range = OffsetRange.parse(fileName);
        // Skip already processed avro files
        if (seenFiles.contains(range)) {
-             return;
+             return null;
        }

-         logger.info("{}", filePath);
+         return filePath.getParent().getParent().getName();
+     }
+
+     private void processFile(Path filePath, String topicName, FileCacheStore cache,
+             OffsetRangeFile offsets) throws IOException {
+         logger.debug("Reading {}", filePath);

        // Read and parse avro file
        FsInput input = new FsInput(filePath, conf);
+
+         // processing zero-length files may trigger a stall. See:
+         // https://github.com/RADAR-CNS/Restructure-HDFS-topic/issues/3
+         if (input.length() == 0) {
+             logger.warn("File {} has zero length, skipping.", filePath);
+             return;
+         }
+
        DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(input,
                new GenericDatumReader<>());

@@ -201,15 +230,15 @@ record = dataFileReader.next(record);

        // Write which file has been processed and update bins
        try {
+             OffsetRange range = OffsetRange.parse(filePath.getName());
            offsets.write(range);
            bins.write();
        } catch (IOException ex) {
            logger.warn("Failed to update status. Continuing processing.", ex);
        }
-         processedFileCount++;
    }

-     private void writeRecord(GenericRecord record, String topicName, FileCache cache)
+     private void writeRecord(GenericRecord record, String topicName, FileCacheStore cache)
            throws IOException {
        GenericRecord keyField = (GenericRecord) record.get("key");
        GenericRecord valueField = (GenericRecord) record.get("value");
@@ -256,4 +285,5 @@ public static String createHourTimestamp(GenericRecord valueField, Field timeField
        Date date = new Date((long) (time * 1000d));
        return FILE_DATE_FORMAT.format(date);
    }
+
}
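
Side note (not part of the diff): the two system properties used above behave differently. org.radarcns.compression is read in a static initializer (the USE_GZIP constant), so it must be set before RestructureAvroRecords is first loaded; org.radarcns.format is read in the constructor and defaults to "csv". A minimal, hypothetical launcher sketch illustrating that ordering follows; the RestructureLauncher class is an assumption for illustration and not code from this commit. In practice the properties would normally be supplied as JVM -D flags (for example through HADOOP_OPTS) when running `hadoop jar`.

// Hypothetical launcher, assumed to sit on the same classpath as RestructureAvroRecords.
public class RestructureLauncher {
    public static void main(String[] args) throws Exception {
        // Must be set before RestructureAvroRecords initializes, because USE_GZIP is a
        // static final field read from this property during class initialization.
        System.setProperty("org.radarcns.compression", "gzip");
        // Read in the constructor; defaults to "csv" when absent.
        System.setProperty("org.radarcns.format", "json");
        // args: <webhdfs_url> <hdfs_root_directory> <output_folder>
        RestructureAvroRecords.main(args);
    }
}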