Commit e410175

Added documentation, removed some unnecessary checks, added a new file for GCSOutputCommitter
1 parent 2a5b522 commit e410175

File tree

6 files changed: +165 −281 lines

docs/GCS-batchsink.md — 11 additions, 1 deletion

@@ -46,7 +46,17 @@ The delimiter will be ignored if the format is anything other than 'delimited'.
 **Location:** The location where the gcs bucket will get created. This value is ignored if the bucket already exists.

 **Content Type:** The Content Type entity is used to indicate the media type of the resource.
-Defaults to 'application/octet-stream'.
+Defaults to 'application/octet-stream'. The following table shows valid content types for each format.
+
+| Format type | Content type                                                               |
+|-------------|----------------------------------------------------------------------------|
+| avro        | application/avro, application/octet-stream                                 |
+| csv         | text/csv, application/csv, text/plain, application/octet-stream            |
+| delimited   | text/csv, application/csv, text/tsv, text/plain, application/octet-stream  |
+| json        | application/json, text/plain, application/octet-stream                     |
+| orc         | application/octet-stream                                                   |
+| parquet     | application/octet-stream                                                   |
+| tsv         | text/tab-separated-values, application/octet-stream                        |

 **Custom Content Type:** The Custom Content Type is used when the value of Content-Type is set to other.
 User can provide specific Content-Type, different from the options in the dropdown.
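The table above is effectively a per-format allow-list. As a minimal sketch of how such a lookup could be expressed in code (the class and method names here are illustrative, not the plugin's actual API):

import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/** Illustrative allow-list mirroring the documentation table above. */
public class ContentTypeAllowList {
  private static final Map<String, List<String>> VALID = new HashMap<>();
  static {
    VALID.put("avro", Arrays.asList("application/avro", "application/octet-stream"));
    VALID.put("csv", Arrays.asList("text/csv", "application/csv", "text/plain", "application/octet-stream"));
    VALID.put("delimited", Arrays.asList("text/csv", "application/csv", "text/tsv", "text/plain",
                                         "application/octet-stream"));
    VALID.put("json", Arrays.asList("application/json", "text/plain", "application/octet-stream"));
    VALID.put("orc", Arrays.asList("application/octet-stream"));
    VALID.put("parquet", Arrays.asList("application/octet-stream"));
    VALID.put("tsv", Arrays.asList("text/tab-separated-values", "application/octet-stream"));
  }

  /** Case-insensitive check of a content type against the allow-list for a format. */
  public static boolean isValid(String format, String contentType) {
    List<String> allowed = VALID.getOrDefault(format.toLowerCase(), Collections.emptyList());
    return allowed.stream().anyMatch(c -> c.equalsIgnoreCase(contentType));
  }
}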

src/main/java/io/cdap/plugin/gcp/gcs/sink/GCSBatchSink.java — 3 additions, 14 deletions

@@ -357,14 +357,6 @@ public void validate(FailureCollector collector) {
       }
     }

-    if (containsMacro(NAME_CONTENT_TYPE)) {
-      contentType = null;
-    }
-
-    if (containsMacro(NAME_CUSTOM_CONTENT_TYPE)) {
-      customContentType = null;
-    }
-
     if (!containsMacro(NAME_CONTENT_TYPE) && !containsMacro(NAME_CUSTOM_CONTENT_TYPE)
         && !Strings.isNullOrEmpty(contentType) && !contentType.equalsIgnoreCase(CONTENT_TYPE_OTHER)) {
       if (!contentType.equalsIgnoreCase(DEFAULT_CONTENT_TYPE)) {
@@ -401,8 +393,7 @@ public void validateContentType(FailureCollector failureCollector) {
             && !contentType.equalsIgnoreCase(CONTENT_TYPE_TEXT_PLAIN)) {
           failureCollector.addFailure(String.format(
             "Valid content types for json are %s, %s", CONTENT_TYPE_APPLICATION_JSON,
-            CONTENT_TYPE_TEXT_PLAIN),
-            null
+            CONTENT_TYPE_TEXT_PLAIN), null
           ).withConfigProperty(NAME_CONTENT_TYPE);
         }
         break;
@@ -413,8 +404,7 @@ public void validateContentType(FailureCollector failureCollector) {
           failureCollector.addFailure(String.format(
             "Valid content types for csv are %s, %s, %s", CONTENT_TYPE_APPLICATION_CSV,
             CONTENT_TYPE_TEXT_PLAIN,
-            CONTENT_TYPE_TEXT_CSV),
-            null
+            CONTENT_TYPE_TEXT_CSV), null
           ).withConfigProperty(NAME_CONTENT_TYPE);
         }
         break;
@@ -446,8 +436,7 @@ public void validateContentType(FailureCollector failureCollector) {
         if (!contentType.equalsIgnoreCase(CONTENT_TYPE_TEXT_PLAIN)
             && !contentType.equalsIgnoreCase(CONTENT_TYPE_TEXT_TSV)) {
           failureCollector.addFailure(String.format(
-            "Valid content types for tsv are %s, %s", CONTENT_TYPE_TEXT_TSV, CONTENT_TYPE_TEXT_PLAIN),
-            null
+            "Valid content types for tsv are %s, %s", CONTENT_TYPE_TEXT_TSV, CONTENT_TYPE_TEXT_PLAIN), null
           ).withConfigProperty(NAME_CONTENT_TYPE);
         }
         break;
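The deleted null-assignments appear to be the "unnecessary checks" from the commit message: the validation block right below them already guards with !containsMacro(...), so nulling the fields first had no effect. A minimal standalone sketch of that guard pattern, with hypothetical names and values:

/** Sketch of macro-aware validation: properties still holding unresolved macros are skipped, not nulled. */
public class MacroAwareValidationSketch {
  static void validateContentType(String contentType, boolean containsMacro) {
    // If the value is an unresolved macro, defer validation to pipeline runtime
    // instead of mutating the field to null (the redundant step the commit removed).
    if (containsMacro || contentType == null || contentType.isEmpty()) {
      return;
    }
    if (!contentType.equalsIgnoreCase("application/octet-stream")) {
      System.out.println("Validating non-default content type: " + contentType);
    }
  }

  public static void main(String[] args) {
    validateContentType("${content-type-macro}", true);  // skipped: macro not yet resolved
    validateContentType("text/csv", false);              // validated at configure time
  }
}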
src/main/java/io/cdap/plugin/gcp/gcs/sink/GCSOutputCommitter.java (new file) — 149 additions, 0 deletions

@@ -0,0 +1,149 @@
+package io.cdap.plugin.gcp.gcs.sink;
+
+import com.google.cloud.storage.Blob;
+import com.google.common.annotations.VisibleForTesting;
+import io.cdap.plugin.gcp.common.GCPUtils;
+import io.cdap.plugin.gcp.gcs.StorageClient;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.JobStatus;
+import org.apache.hadoop.mapreduce.OutputCommitter;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * OutputCommitter for GCS.
+ */
+public class GCSOutputCommitter extends OutputCommitter {
+
+  private static final Logger LOG = LoggerFactory.getLogger(GCSOutputCommitter.class);
+  public static final String RECORD_COUNT_FORMAT = "recordcount.%s";
+
+  private final OutputCommitter delegate;
+
+  public GCSOutputCommitter(OutputCommitter delegate) {
+    this.delegate = delegate;
+  }
+
+  @Override
+  public void setupJob(JobContext jobContext) throws IOException {
+    delegate.setupJob(jobContext);
+  }
+
+  @Override
+  public void cleanupJob(JobContext jobContext) throws IOException {
+    delegate.cleanupJob(jobContext);
+  }
+
+  @Override
+  public void commitJob(JobContext jobContext) throws IOException {
+    delegate.commitJob(jobContext);
+  }
+
+  @Override
+  public void abortJob(JobContext jobContext, JobStatus.State state) throws IOException {
+    delegate.abortJob(jobContext, state);
+  }
+
+  @Override
+  public void setupTask(TaskAttemptContext taskAttemptContext) throws IOException {
+    delegate.setupTask(taskAttemptContext);
+  }
+
+  @Override
+  public boolean needsTaskCommit(TaskAttemptContext taskAttemptContext) throws IOException {
+    return delegate.needsTaskCommit(taskAttemptContext);
+  }
+
+  @Override
+  public void commitTask(TaskAttemptContext taskAttemptContext) throws IOException {
+    /* On commit task, there seems to be some inconsistency across different hadoop implementations regarding the
+       path where the output file is stored. For some implementations it appears in the path returned by
+       FileOutputCommitter getCommittedTaskPath and for some it does not. Before commit, the files appear to be
+       consistently present in the path returned by FileOutputCommitter getTaskAttemptPath. Hence, find the output
+       file from taskAttemptPath and add metadata before the commit happens. After commit, the file would have been
+       moved out of the taskAttemptPath. */
+    try {
+      updateMetricMetaData(taskAttemptContext);
+    } catch (Exception exception) {
+      LOG.warn("Unable to record metric for task. Metric emitted for the number of affected rows may be incorrect.",
+               exception);
+    }
+
+    delegate.commitTask(taskAttemptContext);
+  }
+
+  private void updateMetricMetaData(TaskAttemptContext taskAttemptContext) throws IOException {
+    if (!(delegate instanceof FileOutputCommitter)) {
+      return;
+    }
+
+    FileOutputCommitter fileOutputCommitter = (FileOutputCommitter) delegate;
+    Configuration configuration = taskAttemptContext.getConfiguration();
+    // Task is not yet committed, so the output should still be available in the attempt path.
+    Path taskAttemptPath = fileOutputCommitter.getTaskAttemptPath(taskAttemptContext);
+    if (configuration == null || taskAttemptPath == null) {
+      return;
+    }
+
+    // Read the record count from the configuration.
+    String keyInConfig = String.format(RECORD_COUNT_FORMAT, taskAttemptContext.getTaskAttemptID());
+    Map<String, String> metaData = new HashMap<>();
+    metaData.put(GCSBatchSink.RECORD_COUNT, String.valueOf(configuration.getLong(keyInConfig, 0L)));
+    StorageClient storageClient = getStorageClient(configuration);
+    // Update metadata on the output file present in the directory for this task.
+    Blob blob = storageClient.pickABlob(taskAttemptPath.toString());
+    if (blob == null) {
+      LOG.info("Could not find a file in path {} to apply count metadata.", taskAttemptPath);
+      return;
+    }
+    blob.toBuilder().setContentType(configuration.get(GCSBatchSink.CONTENT_TYPE)).setMetadata(metaData).build()
+      .update();
+  }
+
+  @VisibleForTesting
+  StorageClient getStorageClient(Configuration configuration) throws IOException {
+    String project = configuration.get(GCPUtils.FS_GS_PROJECT_ID);
+    String serviceAccount;
+    boolean isServiceAccountFile = GCPUtils.SERVICE_ACCOUNT_TYPE_FILE_PATH
+      .equals(configuration.get(GCPUtils.SERVICE_ACCOUNT_TYPE));
+    if (isServiceAccountFile) {
+      serviceAccount = configuration.get(GCPUtils.CLOUD_JSON_KEYFILE, null);
+    } else {
+      serviceAccount = configuration.get(String.format("%s.%s", GCPUtils.CLOUD_JSON_KEYFILE_PREFIX,
+                                                       GCPUtils.CLOUD_ACCOUNT_JSON_SUFFIX));
+    }
+    return StorageClient.create(project, serviceAccount, isServiceAccountFile);
+  }
+
+  @Override
+  public void abortTask(TaskAttemptContext taskAttemptContext) throws IOException {
+    delegate.abortTask(taskAttemptContext);
+  }
+
+  @Override
+  public boolean isCommitJobRepeatable(JobContext jobContext) throws IOException {
+    return delegate.isCommitJobRepeatable(jobContext);
+  }
+
+  @Override
+  public boolean isRecoverySupported(JobContext jobContext) throws IOException {
+    return delegate.isRecoverySupported(jobContext);
+  }
+
+  @Override
+  public boolean isRecoverySupported() {
+    return delegate.isRecoverySupported();
+  }
+
+  @Override
+  public void recoverTask(TaskAttemptContext taskContext) throws IOException {
+    delegate.recoverTask(taskContext);
+  }
+}
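The new class is a pure decorator: every method forwards to the wrapped committer, and only commitTask adds behavior. As a usage sketch of where it plugs in (the delegating format below is hypothetical and assumed to live in the same package, mirroring how the provider's getOutputCommitter wraps its delegate):

import java.io.IOException;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

/** Hypothetical delegating output format showing where GCSOutputCommitter plugs in. */
public abstract class DelegatingOutputFormatSketch<K, V> extends OutputFormat<K, V> {
  /** The wrapped format whose committer is decorated, e.g. an Avro or text output format. */
  protected abstract OutputFormat<K, V> getDelegateFormat();

  @Override
  public OutputCommitter getOutputCommitter(TaskAttemptContext context)
      throws IOException, InterruptedException {
    // Wrap the delegate's committer so commitTask() can stamp record-count
    // metadata on the GCS object before the task output is moved into place.
    return new GCSOutputCommitter(getDelegateFormat().getOutputCommitter(context));
  }
}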

src/main/java/io/cdap/plugin/gcp/gcs/sink/GCSOutputFormatProvider.java — 0 additions, 138 deletions

@@ -1,25 +1,16 @@
 package io.cdap.plugin.gcp.gcs.sink;

-import com.google.cloud.storage.Blob;
-import com.google.common.annotations.VisibleForTesting;
 import io.cdap.cdap.api.data.format.StructuredRecord;
 import io.cdap.cdap.etl.api.validation.FormatContext;
 import io.cdap.cdap.etl.api.validation.ValidatingOutputFormat;
-import io.cdap.plugin.gcp.common.GCPUtils;
-import io.cdap.plugin.gcp.gcs.StorageClient;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.NullWritable;
 import org.apache.hadoop.mapreduce.JobContext;
-import org.apache.hadoop.mapreduce.JobStatus;
 import org.apache.hadoop.mapreduce.OutputCommitter;
 import org.apache.hadoop.mapreduce.OutputFormat;
 import org.apache.hadoop.mapreduce.RecordWriter;
 import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
 import org.apache.hadoop.util.ReflectionUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;

 import java.io.IOException;
 import java.util.HashMap;
@@ -29,8 +20,6 @@
  * OutputFormatProvider for GCSSink
  */
 public class GCSOutputFormatProvider implements ValidatingOutputFormat {
-
-  private static final Logger LOG = LoggerFactory.getLogger(GCSOutputFormatProvider.class);
   private static final String DELEGATE_OUTPUTFORMAT_CLASSNAME = "gcssink.delegate.outputformat.classname";
   private static final String OUTPUT_FOLDER = "gcssink.metric.output.folder";
   public static final String RECORD_COUNT_FORMAT = "recordcount.%s";
@@ -102,133 +91,6 @@ public OutputCommitter getOutputCommitter(TaskAttemptContext taskAttemptContext)
     }
   }

-  /**
-   * OutputCommitter for GCS
-   */
-  public static class GCSOutputCommitter extends OutputCommitter {
-
-    private final OutputCommitter delegate;
-
-    public GCSOutputCommitter(OutputCommitter delegate) {
-      this.delegate = delegate;
-    }
-
-    @Override
-    public void setupJob(JobContext jobContext) throws IOException {
-      delegate.setupJob(jobContext);
-    }
-
-    @Override
-    public void cleanupJob(JobContext jobContext) throws IOException {
-      delegate.cleanupJob(jobContext);
-    }
-
-    @Override
-    public void commitJob(JobContext jobContext) throws IOException {
-      delegate.commitJob(jobContext);
-    }
-
-    @Override
-    public void abortJob(JobContext jobContext, JobStatus.State state) throws IOException {
-      delegate.abortJob(jobContext, state);
-    }
-
-    @Override
-    public void setupTask(TaskAttemptContext taskAttemptContext) throws IOException {
-      delegate.setupTask(taskAttemptContext);
-    }
-
-    @Override
-    public boolean needsTaskCommit(TaskAttemptContext taskAttemptContext) throws IOException {
-      return delegate.needsTaskCommit(taskAttemptContext);
-    }
-
-    @Override
-    public void commitTask(TaskAttemptContext taskAttemptContext) throws IOException {
-      /*On commit task, there seems to be some inconsistency across different hadoop implementations regarding the path
-      where output file is stored. For some implementations it appears in the path returned by FileOutputCommitter
-      getCommittedTaskPath and for some it does not.Before commit, the files appear to be consistently present in path
-      returned by FileOutputCommitter getTaskAttemptPath. Hence, find the output file from taskAttemptPath and add
-      metadata before commit happens. After commit, file would have been moved out of the taskAttemptPath. */
-      try {
-        updateMetricMetaData(taskAttemptContext);
-      } catch (Exception exception) {
-        LOG.warn("Unable to record metric for task. Metric emitted for the number of affected rows may be incorrect.",
-                 exception);
-      }
-
-      delegate.commitTask(taskAttemptContext);
-    }
-
-    private void updateMetricMetaData(TaskAttemptContext taskAttemptContext) throws IOException {
-      if (!(delegate instanceof FileOutputCommitter)) {
-        return;
-      }
-
-      FileOutputCommitter fileOutputCommitter = (FileOutputCommitter) delegate;
-      Configuration configuration = taskAttemptContext.getConfiguration();
-      //Task is not yet committed, so should be available in attempt path
-      Path taskAttemptPath = fileOutputCommitter.getTaskAttemptPath(taskAttemptContext);
-      if (configuration == null || taskAttemptPath == null) {
-        return;
-      }
-
-      //read the count from configuration
-      String keyInConfig = String.format(RECORD_COUNT_FORMAT, taskAttemptContext.getTaskAttemptID());
-      Map<String, String> metaData = new HashMap<>();
-      metaData.put(GCSBatchSink.RECORD_COUNT, String.valueOf(configuration.getLong(keyInConfig, 0L)));
-      StorageClient storageClient = getStorageClient(configuration);
-      //update metadata on the output file present in the directory for this task
-      Blob blob = storageClient.pickABlob(taskAttemptPath.toString());
-      if (blob == null) {
-        LOG.info("Could not find a file in path %s to apply count metadata.", taskAttemptPath.toString());
-        return;
-      }
-      blob.toBuilder().setContentType(configuration.get(GCSBatchSink.CONTENT_TYPE)).setMetadata(metaData).build()
-        .update();
-    }
-
-    @VisibleForTesting
-    StorageClient getStorageClient(Configuration configuration) throws IOException {
-      String project = configuration.get(GCPUtils.FS_GS_PROJECT_ID);
-      String serviceAccount = null;
-      boolean isServiceAccountFile = GCPUtils.SERVICE_ACCOUNT_TYPE_FILE_PATH
-        .equals(configuration.get(GCPUtils.SERVICE_ACCOUNT_TYPE));
-      if (isServiceAccountFile) {
-        serviceAccount = configuration.get(GCPUtils.CLOUD_JSON_KEYFILE, null);
-      } else {
-        serviceAccount = configuration.get(String.format("%s.%s", GCPUtils.CLOUD_JSON_KEYFILE_PREFIX,
-                                                         GCPUtils.CLOUD_ACCOUNT_JSON_SUFFIX));
-      }
-      return StorageClient.create(project, serviceAccount, isServiceAccountFile);
-    }
-
-    @Override
-    public void abortTask(TaskAttemptContext taskAttemptContext) throws IOException {
-      delegate.abortTask(taskAttemptContext);
-    }
-
-    @Override
-    public boolean isCommitJobRepeatable(JobContext jobContext) throws IOException {
-      return delegate.isCommitJobRepeatable(jobContext);
-    }
-
-    @Override
-    public boolean isRecoverySupported(JobContext jobContext) throws IOException {
-      return delegate.isRecoverySupported(jobContext);
-    }
-
-    @Override
-    public boolean isRecoverySupported() {
-      return delegate.isRecoverySupported();
-    }
-
-    @Override
-    public void recoverTask(TaskAttemptContext taskContext) throws IOException {
-      delegate.recoverTask(taskContext);
-    }
-  }
-
   /**
    * RecordWriter for GCSSink
    */

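Since the committer moved from a nested class to its own file, any call site that referenced the old nested name would now use the top-level class instead. A hedged before/after sketch, assuming such call sites exist (the helper class below is hypothetical):

import java.io.IOException;
import io.cdap.plugin.gcp.gcs.sink.GCSOutputCommitter;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;

/** Illustrates the rename: GCSOutputFormatProvider.GCSOutputCommitter becomes top-level GCSOutputCommitter. */
public class CommitterReferenceExample {
  public static OutputCommitter wrap(Path output, TaskAttemptContext context) throws IOException {
    // Before this commit: new GCSOutputFormatProvider.GCSOutputCommitter(delegate)
    // After this commit: the top-level class, with the same delegating behavior.
    return new GCSOutputCommitter(new FileOutputCommitter(output, context));
  }
}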