 
 package org.radarcns;
 
+import com.fasterxml.jackson.databind.JsonMappingException;
 import org.apache.avro.Schema.Field;
 import org.apache.avro.file.DataFileReader;
 import org.apache.avro.generic.GenericDatumReader;
@@ -70,7 +71,9 @@ public class RestructureAvroRecords {
     private long processedFileCount;
     private long processedRecordsCount;
     private static final boolean USE_GZIP = "gzip".equalsIgnoreCase(System.getProperty("org.radarcns.compression"));
-    private static final boolean DO_DEDUPLICATE = "true".equalsIgnoreCase(System.getProperty("org.radarcns.deduplicate", "true"));
+
+    // Default set to false because deduplication causes loss of records from Biovotion data. https://github.com/RADAR-base/Restructure-HDFS-topic/issues/16
+    private static final boolean DO_DEDUPLICATE = "true".equalsIgnoreCase(System.getProperty("org.radarcns.deduplicate", "false"));
 
     public static void main(String[] args) throws Exception {
         if (args.length != 3) {
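For reference, a minimal sketch of how the property-driven flag behaves after this change. The property name "org.radarcns.deduplicate" is the one used above; the demo class itself is hypothetical:

// Minimal sketch of the new default: with no system property set,
// deduplication is now off. Demo class is hypothetical.
public class DeduplicateFlagDemo {
    public static void main(String[] args) {
        boolean dedup = "true".equalsIgnoreCase(
                System.getProperty("org.radarcns.deduplicate", "false"));
        System.out.println(dedup); // false: property unset, default "false"

        // Opting back in, e.g. via -Dorg.radarcns.deduplicate=true on the command line
        System.setProperty("org.radarcns.deduplicate", "true");
        dedup = "true".equalsIgnoreCase(
                System.getProperty("org.radarcns.deduplicate", "false"));
        System.out.println(dedup); // true
    }
}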
@@ -178,7 +181,12 @@ public void start(String directoryName) throws IOException {
         for (Map.Entry<String, List<Path>> entry : topicPaths.entrySet()) {
             try (FileCacheStore cache = new FileCacheStore(converterFactory, 100, USE_GZIP, DO_DEDUPLICATE)) {
                 for (Path filePath : entry.getValue()) {
-                    this.processFile(filePath, entry.getKey(), cache, offsets);
+                    // If JsonMappingException occurs, log the error and continue with other files
+                    try {
+                        this.processFile(filePath, entry.getKey(), cache, offsets);
+                    } catch (JsonMappingException exc) {
+                        logger.error("Cannot map values", exc);
+                    }
                     progressBar.update(++processedFileCount);
                 }
             }
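The per-file try/catch above keeps one unmappable file from aborting the whole batch. A self-contained illustration of the kind of failure being caught, using a plain Jackson ObjectMapper; the project's actual converter is not shown in this diff, so this is only an assumption about how the exception arises:

// Hedged sketch: JsonMappingException is thrown when Jackson cannot bind
// input to the target type. The Target class and input are hypothetical.
import com.fasterxml.jackson.databind.JsonMappingException;
import com.fasterxml.jackson.databind.ObjectMapper;

public class MappingFailureDemo {
    static class Target { public int value; }

    public static void main(String[] args) throws Exception {
        ObjectMapper mapper = new ObjectMapper();
        try {
            // "not-a-number" cannot be bound to int -> a JsonMappingException subtype
            mapper.readValue("{\"value\": \"not-a-number\"}", Target.class);
        } catch (JsonMappingException exc) {
            // Mirrors the handling above: log and move on instead of failing the batch
            System.err.println("Cannot map values: " + exc.getOriginalMessage());
        }
    }
}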
@@ -232,7 +240,7 @@ private void processFile(Path filePath, String topicName, FileCacheStore cache,
             record = dataFileReader.next(record);
 
             // Get the fields
-            this.writeRecord(record, topicName, cache);
+            this.writeRecord(record, topicName, cache, 0);
         }
 
         // Write which file has been processed and update bins
@@ -245,7 +253,7 @@ record = dataFileReader.next(record);
         }
     }
 
-    private void writeRecord(GenericRecord record, String topicName, FileCacheStore cache)
+    private void writeRecord(GenericRecord record, String topicName, FileCacheStore cache, int suffix)
             throws IOException {
         GenericRecord keyField = (GenericRecord) record.get("key");
         GenericRecord valueField = (GenericRecord) record.get("value");
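The new suffix parameter drives a retry: in the hunk below, when FileCacheStore reports that the record could not be written (its schema does not match the file already at that path), writeRecord recurses with suffix + 1 so the record lands in a sibling file. A minimal sketch of that contract; the constant names mirror the fields referenced below, but their values and the stub logic are assumptions:

// Hedged sketch of the write/retry contract. Constant names mirror the
// FileCacheStore fields used in this diff; values and stub logic are invented.
public class RetryContractDemo {
    static final int CACHE_AND_WRITE = 0;       // hypothetical success code
    static final int CACHE_AND_NO_WRITE = 1;    // schema mismatch, file cached
    static final int NO_CACHE_AND_NO_WRITE = 2; // schema mismatch, file not cached

    // Stub: pretend files without the "_2" suffix hold an incompatible schema.
    static int writeRecord(String path) {
        return path.endsWith("_2.json") ? CACHE_AND_WRITE : CACHE_AND_NO_WRITE;
    }

    static String write(String base, int suffix) {
        String path = suffix == 0 ? base + ".json" : base + "_" + suffix + ".json";
        int response = writeRecord(path);
        if (response == CACHE_AND_NO_WRITE || response == NO_CACHE_AND_NO_WRITE) {
            return write(base, suffix + 1); // same retry shape as the hunk below
        }
        return path;
    }

    public static void main(String[] args) {
        System.out.println(write("20180101_1400", 0)); // -> 20180101_1400_2.json
    }
}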
@@ -256,37 +264,63 @@ private void writeRecord(GenericRecord record, String topicName, FileCacheStore
         }
 
         Date time = getDate(keyField, valueField);
-        java.nio.file.Path outputFileName = createFilename(time);
+        java.nio.file.Path outputFileName = createFilename(time, suffix);
+
+        String projectId;
+
+        if (keyField.get("projectId") == null) {
+            projectId = "unknown-project";
+        } else {
+            // Clean project id for use in final pathname
+            projectId = keyField.get("projectId").toString().replaceAll("[^a-zA-Z0-9_-]+", "");
+        }
 
         // Clean user id and create final output pathname
         String userId = keyField.get("userId").toString().replaceAll("[^a-zA-Z0-9_-]+", "");
-        java.nio.file.Path userDir = this.outputPath.resolve(userId);
+
+        java.nio.file.Path projectDir = this.outputPath.resolve(projectId);
+        java.nio.file.Path userDir = projectDir.resolve(userId);
         java.nio.file.Path userTopicDir = userDir.resolve(topicName);
         java.nio.file.Path outputPath = userTopicDir.resolve(outputFileName);
 
         // Write data
-        cache.writeRecord(outputPath, record);
+        int response = cache.writeRecord(outputPath, record);
 
-        java.nio.file.Path schemaPath = userTopicDir.resolve(SCHEMA_OUTPUT_FILE_NAME);
-        if (!Files.exists(schemaPath)) {
-            try (Writer writer = Files.newBufferedWriter(schemaPath)) {
-                writer.write(record.getSchema().toString(true));
+        if (response == FileCacheStore.CACHE_AND_NO_WRITE || response == FileCacheStore.NO_CACHE_AND_NO_WRITE) {
+            // Write was unsuccessful due to different number of columns,
+            // try again with new file name
+            writeRecord(record, topicName, cache, ++suffix);
+        } else {
+            // Write was successful, finalize the write
+            java.nio.file.Path schemaPath = userTopicDir.resolve(SCHEMA_OUTPUT_FILE_NAME);
+            if (!Files.exists(schemaPath)) {
+                try (Writer writer = Files.newBufferedWriter(schemaPath)) {
+                    writer.write(record.getSchema().toString(true));
+                }
             }
-        }
 
-        // Count data (binned and total)
-        bins.add(topicName, keyField.get("sourceId").toString(), time);
-        processedRecordsCount++;
+            // Count data (binned and total)
+            bins.add(topicName, keyField.get("sourceId").toString(), time);
+            processedRecordsCount++;
+        }
     }
 
-    private java.nio.file.Path createFilename(Date date) {
+    private java.nio.file.Path createFilename(Date date, int suffix) {
         if (date == null) {
             logger.warn("Time field of record valueField is not set");
             return Paths.get("unknown_date." + outputFileExtension);
         }
+
+        String finalSuffix;
+        if (suffix == 0) {
+            finalSuffix = "";
+        } else {
+            finalSuffix = "_" + suffix;
+        }
+
         // Make a timestamped filename YYYYMMDD_HH00.json
         String hourlyTimestamp = createHourTimestamp(date);
-        return Paths.get(hourlyTimestamp + "00." + outputFileExtension);
+        return Paths.get(hourlyTimestamp + "00" + finalSuffix + "." + outputFileExtension);
     }
 
     public static String createHourTimestamp(Date date) {
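Taken together, the projectId handling and the suffixed filenames give an output tree of the form <output>/<projectId>/<userId>/<topic>/<YYYYMMDD_HH00[_n]>.<ext>. A sketch with hypothetical IDs; the cleaning regex and the unknown-project fallback are the ones in the diff:

// Hedged sketch: how cleaned IDs nest under the output root after this change.
// The IDs, output root, and topic are hypothetical; the regex is from the diff.
import java.nio.file.Path;
import java.nio.file.Paths;

public class OutputLayoutDemo {
    public static void main(String[] args) {
        String rawProjectId = "radar/pilot#1"; // hypothetical raw value
        String projectId = rawProjectId == null
                ? "unknown-project"
                : rawProjectId.replaceAll("[^a-zA-Z0-9_-]+", "");
        String userId = "user@01".replaceAll("[^a-zA-Z0-9_-]+", "");

        Path out = Paths.get("output")                 // hypothetical output root
                .resolve(projectId)                    // output/radarpilot1
                .resolve(userId)                       // .../user01
                .resolve("android_phone_acceleration") // hypothetical topic
                .resolve("20180101_1400_1.json");      // hourly file, suffix 1
        System.out.println(out);
        // -> output/radarpilot1/user01/android_phone_acceleration/20180101_1400_1.json
    }
}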