16
16
17
17
package org .radarcns ;
18
18
19
+ import com .fasterxml .jackson .databind .JsonMappingException ;
19
20
import org .apache .avro .Schema .Field ;
20
21
import org .apache .avro .file .DataFileReader ;
21
22
import org .apache .avro .generic .GenericDatumReader ;
@@ -70,7 +71,9 @@ public class RestructureAvroRecords {
70
71
private long processedFileCount ;
71
72
private long processedRecordsCount ;
72
73
private static final boolean USE_GZIP = "gzip" .equalsIgnoreCase (System .getProperty ("org.radarcns.compression" ));
73
- private static final boolean DO_DEDUPLICATE = "true" .equalsIgnoreCase (System .getProperty ("org.radarcns.deduplicate" , "true" ));
74
+
75
+ // Defaults to false because deduplication causes loss of records from Biovotion data. See https://github.com/RADAR-base/Restructure-HDFS-topic/issues/16
76
+ private static final boolean DO_DEDUPLICATE = "true" .equalsIgnoreCase (System .getProperty ("org.radarcns.deduplicate" , "false" ));
74
77
75
78
public static void main (String [] args ) throws Exception {
76
79
if (args .length != 3 ) {
@@ -178,7 +181,12 @@ public void start(String directoryName) throws IOException {
178
181
for (Map .Entry <String , List <Path >> entry : topicPaths .entrySet ()) {
179
182
try (FileCacheStore cache = new FileCacheStore (converterFactory , 100 , USE_GZIP , DO_DEDUPLICATE )) {
180
183
for (Path filePath : entry .getValue ()) {
181
- this .processFile (filePath , entry .getKey (), cache , offsets );
184
+ // If JsonMappingException occurs, log the error and continue with other files
185
+ try {
186
+ this .processFile (filePath , entry .getKey (), cache , offsets );
187
+ } catch (JsonMappingException exc ) {
188
+ logger .error ("Cannot map values" , exc );
189
+ }
182
190
progressBar .update (++processedFileCount );
183
191
}
184
192
}
@@ -232,7 +240,7 @@ private void processFile(Path filePath, String topicName, FileCacheStore cache,
232
240
record = dataFileReader .next (record );
233
241
234
242
// Get the fields
235
- this .writeRecord (record , topicName , cache );
243
+ this .writeRecord (record , topicName , cache , 0 );
236
244
}
237
245
238
246
// Write which file has been processed and update bins
@@ -245,7 +253,7 @@ record = dataFileReader.next(record);
245
253
}
246
254
}
247
255
248
- private void writeRecord (GenericRecord record , String topicName , FileCacheStore cache )
256
+ private void writeRecord (GenericRecord record , String topicName , FileCacheStore cache , int suffix )
249
257
throws IOException {
250
258
GenericRecord keyField = (GenericRecord ) record .get ("key" );
251
259
GenericRecord valueField = (GenericRecord ) record .get ("value" );
@@ -256,37 +264,63 @@ private void writeRecord(GenericRecord record, String topicName, FileCacheStore
256
264
}
257
265
258
266
Date time = getDate (keyField , valueField );
259
- java .nio .file .Path outputFileName = createFilename (time );
267
+ java .nio .file .Path outputFileName = createFilename (time , suffix );
268
+
269
+ String projectId ;
270
+
271
+ if (keyField .get ("projectId" ) == null ) {
272
+ projectId = "unknown-project" ;
273
+ } else {
274
+ // Clean Project id for use in final pathname
275
+ projectId = keyField .get ("projectId" ).toString ().replaceAll ("[^a-zA-Z0-9_-]+" , "" );
276
+ }
260
277
261
278
// Clean user id and create final output pathname
262
279
String userId = keyField .get ("userId" ).toString ().replaceAll ("[^a-zA-Z0-9_-]+" , "" );
263
- java .nio .file .Path userDir = this .outputPath .resolve (userId );
280
+
281
+ java .nio .file .Path projectDir = this .outputPath .resolve (projectId );
282
+ java .nio .file .Path userDir = projectDir .resolve (userId );
264
283
java .nio .file .Path userTopicDir = userDir .resolve (topicName );
265
284
java .nio .file .Path outputPath = userTopicDir .resolve (outputFileName );
266
285
267
286
// Write data
268
- cache .writeRecord (outputPath , record );
287
+ int response = cache .writeRecord (outputPath , record );
269
288
270
- java .nio .file .Path schemaPath = userTopicDir .resolve (SCHEMA_OUTPUT_FILE_NAME );
271
- if (!Files .exists (schemaPath )) {
272
- try (Writer writer = Files .newBufferedWriter (schemaPath )) {
273
- writer .write (record .getSchema ().toString (true ));
289
+ if (response == FileCacheStore .CACHE_AND_NO_WRITE || response == FileCacheStore .NO_CACHE_AND_NO_WRITE ) {
290
+ // Write was unsuccessful due to different number of columns,
291
+ // try again with new file name
292
+ writeRecord (record , topicName , cache , ++suffix );
293
+ } else {
294
+ // Write was successful, finalize the write
295
+ java .nio .file .Path schemaPath = userTopicDir .resolve (SCHEMA_OUTPUT_FILE_NAME );
296
+ if (!Files .exists (schemaPath )) {
297
+ try (Writer writer = Files .newBufferedWriter (schemaPath )) {
298
+ writer .write (record .getSchema ().toString (true ));
299
+ }
274
300
}
275
- }
276
301
277
- // Count data (binned and total)
278
- bins .add (topicName , keyField .get ("sourceId" ).toString (), time );
279
- processedRecordsCount ++;
302
+ // Count data (binned and total)
303
+ bins .add (topicName , keyField .get ("sourceId" ).toString (), time );
304
+ processedRecordsCount ++;
305
+ }
280
306
}
281
307
282
- private java .nio .file .Path createFilename (Date date ) {
308
+ private java .nio .file .Path createFilename (Date date , int suffix ) {
283
309
if (date == null ) {
284
310
logger .warn ("Time field of record valueField is not set" );
285
311
return Paths .get ("unknown_date." + outputFileExtension );
286
312
}
313
+
314
+ String finalSuffix ;
315
+ if (suffix == 0 ) {
316
+ finalSuffix = "" ;
317
+ } else {
318
+ finalSuffix = "_" + suffix ;
319
+ }
320
+
287
321
// Make a timestamped filename YYYYMMDD_HH00[_suffix].<extension>, e.g. YYYYMMDD_HH00.json; the suffix is omitted when it is 0
288
322
String hourlyTimestamp = createHourTimestamp (date );
289
- return Paths .get (hourlyTimestamp + "00." + outputFileExtension );
323
+ return Paths .get (hourlyTimestamp + "00" + finalSuffix + " ." + outputFileExtension );
290
324
}
291
325
292
326
public static String createHourTimestamp (Date date ) {
0 commit comments