@@ -67,6 +67,7 @@ public class GCSBatchSink extends AbstractFileSink<GCSBatchSink.GCSBatchSinkConf
6767 private static final String RECORDS_UPDATED_METRIC = "records.updated" ;
6868 public static final String AVRO_NAMED_OUTPUT = "avro.mo.config.namedOutput" ;
6969 public static final String COMMON_NAMED_OUTPUT = "mapreduce.output.basename" ;
70+ public static final String CONTENT_TYPE = "io.cdap.gcs.batch.sink.content.type" ;
7071
7172 private final GCSBatchSinkConfig config ;
7273 private String outputPath ;
@@ -125,6 +126,7 @@ public void prepareRun(BatchSinkContext context) throws Exception {
125126 @ Override
126127 protected Map <String , String > getFileSystemProperties (BatchSinkContext context ) {
127128 Map <String , String > properties = GCPUtils .getFileSystemProperties (config , config .getPath (), new HashMap <>());
129+ properties .put (GCSBatchSink .CONTENT_TYPE , config .getContentType ());
128130 properties .putAll (config .getFileSystemProperties ());
129131 String outputFileBaseName = config .getOutputFileNameBase ();
130132 if (outputFileBaseName == null || outputFileBaseName .isEmpty ()) {
@@ -242,6 +244,23 @@ public static class GCSBatchSinkConfig extends GCPReferenceSinkConfig implements
242244 private static final String NAME_LOCATION = "location" ;
243245 private static final String NAME_FS_PROPERTIES = "fileSystemProperties" ;
244246 private static final String NAME_FILE_NAME_BASE = "outputFileNameBase" ;
247+ private static final String NAME_CONTENT_TYPE = "contentType" ;
248+ private static final String NAME_CUSTOM_CONTENT_TYPE = "customContentType" ;
249+ private static final String DEFAULT_CONTENT_TYPE = "application/octet-stream" ;
250+ private static final String CONTENT_TYPE_OTHER = "other" ;
251+ private static final String CONTENT_TYPE_APPLICATION_JSON = "application/json" ;
252+ private static final String CONTENT_TYPE_APPLICATION_AVRO = "application/avro" ;
253+ private static final String CONTENT_TYPE_APPLICATION_CSV = "application/csv" ;
254+ private static final String CONTENT_TYPE_TEXT_PLAIN = "text/plain" ;
255+ private static final String CONTENT_TYPE_TEXT_CSV = "text/csv" ;
256+ private static final String CONTENT_TYPE_TEXT_TSV = "text/tab-separated-values" ;
257+ private static final String FORMAT_AVRO = "avro" ;
258+ private static final String FORMAT_CSV = "csv" ;
259+ private static final String FORMAT_JSON = "json" ;
260+ private static final String FORMAT_TSV = "tsv" ;
261+ private static final String FORMAT_DELIMITED = "delimited" ;
262+ private static final String FORMAT_ORC = "orc" ;
263+ private static final String FORMAT_PARQUET = "parquet" ;
245264
246265 private static final String SCHEME = "gs://" ;
247266 @ Name (NAME_PATH )
@@ -280,6 +299,18 @@ public static class GCSBatchSinkConfig extends GCPReferenceSinkConfig implements
280299 "This value is ignored if the bucket already exists" )
281300 protected String location ;
282301
302+ @ Macro
303+ @ Description ("The Content Type property is used to indicate the media type of the resource." +
304+ "Defaults to 'application/octet-stream'." )
305+ @ Nullable
306+ protected String contentType ;
307+
308+ @ Macro
309+ @ Description ("The Custom Content Type is used when the value of Content-Type is set to other." +
310+ "User can provide specific Content-Type, different from the options in the dropdown." )
311+ @ Nullable
312+ protected String customContentType ;
313+
283314 @ Name (NAME_FS_PROPERTIES )
284315 @ Macro
285316 @ Nullable
@@ -326,10 +357,19 @@ public void validate(FailureCollector collector) {
326357 }
327358 }
328359
360+ if (!containsMacro (NAME_CONTENT_TYPE ) && !containsMacro (NAME_CUSTOM_CONTENT_TYPE )
361+ && !Strings .isNullOrEmpty (contentType ) && !contentType .equalsIgnoreCase (CONTENT_TYPE_OTHER )
362+ && !containsMacro (NAME_FORMAT )) {
363+ if (!contentType .equalsIgnoreCase (DEFAULT_CONTENT_TYPE )) {
364+ validateContentType (collector );
365+ }
366+ }
367+
329368 try {
330369 getSchema ();
331370 } catch (IllegalArgumentException e ) {
332- collector .addFailure (e .getMessage (), null ).withConfigProperty (NAME_SCHEMA ).withStacktrace (e .getStackTrace ());
371+ collector .addFailure (e .getMessage (), null ).withConfigProperty (NAME_SCHEMA )
372+ .withStacktrace (e .getStackTrace ());
333373 }
334374
335375 try {
@@ -340,6 +380,69 @@ public void validate(FailureCollector collector) {
340380 }
341381 }
342382
383+ //This method validates the specified content type for the used format.
384+ public void validateContentType (FailureCollector failureCollector ) {
385+ switch (format ) {
386+ case FORMAT_AVRO :
387+ if (!contentType .equalsIgnoreCase (CONTENT_TYPE_APPLICATION_AVRO )) {
388+ failureCollector .addFailure (String .format ("Valid content types for avro are %s, %s." ,
389+ CONTENT_TYPE_APPLICATION_AVRO , DEFAULT_CONTENT_TYPE ), null )
390+ .withConfigProperty (NAME_CONTENT_TYPE );
391+ }
392+ break ;
393+ case FORMAT_JSON :
394+ if (!contentType .equalsIgnoreCase (CONTENT_TYPE_APPLICATION_JSON )
395+ && !contentType .equalsIgnoreCase (CONTENT_TYPE_TEXT_PLAIN )) {
396+ failureCollector .addFailure (String .format (
397+ "Valid content types for json are %s, %s, %s." , CONTENT_TYPE_APPLICATION_JSON ,
398+ CONTENT_TYPE_TEXT_PLAIN , DEFAULT_CONTENT_TYPE ), null
399+ ).withConfigProperty (NAME_CONTENT_TYPE );
400+ }
401+ break ;
402+ case FORMAT_CSV :
403+ if (!contentType .equalsIgnoreCase (CONTENT_TYPE_APPLICATION_CSV )
404+ && !contentType .equalsIgnoreCase (CONTENT_TYPE_TEXT_CSV )
405+ && !contentType .equalsIgnoreCase (CONTENT_TYPE_TEXT_PLAIN )) {
406+ failureCollector .addFailure (String .format (
407+ "Valid content types for csv are %s, %s, %s, %s." , CONTENT_TYPE_APPLICATION_CSV ,
408+ CONTENT_TYPE_TEXT_PLAIN , CONTENT_TYPE_TEXT_CSV , DEFAULT_CONTENT_TYPE ), null
409+ ).withConfigProperty (NAME_CONTENT_TYPE );
410+ }
411+ break ;
412+ case FORMAT_DELIMITED :
413+ if (!contentType .equalsIgnoreCase (CONTENT_TYPE_TEXT_PLAIN )
414+ && !contentType .equalsIgnoreCase (CONTENT_TYPE_TEXT_CSV )
415+ && !contentType .equalsIgnoreCase (CONTENT_TYPE_APPLICATION_CSV )
416+ && !contentType .equalsIgnoreCase (CONTENT_TYPE_TEXT_TSV )) {
417+ failureCollector .addFailure (String .format (
418+ "Valid content types for delimited are %s, %s, %s, %s, %s." , CONTENT_TYPE_TEXT_PLAIN ,
419+ CONTENT_TYPE_TEXT_CSV , CONTENT_TYPE_APPLICATION_CSV , CONTENT_TYPE_TEXT_TSV , DEFAULT_CONTENT_TYPE ), null
420+ ).withConfigProperty (NAME_CONTENT_TYPE );
421+ }
422+ break ;
423+ case FORMAT_PARQUET :
424+ if (!contentType .equalsIgnoreCase (DEFAULT_CONTENT_TYPE )) {
425+ failureCollector .addFailure (String .format ("Valid content type for parquet is %s." , DEFAULT_CONTENT_TYPE ),
426+ null ).withConfigProperty (NAME_CONTENT_TYPE );
427+ }
428+ break ;
429+ case FORMAT_ORC :
430+ if (!contentType .equalsIgnoreCase (DEFAULT_CONTENT_TYPE )) {
431+ failureCollector .addFailure (String .format ("Valid content type for orc is %s." , DEFAULT_CONTENT_TYPE ),
432+ null ).withConfigProperty (NAME_CONTENT_TYPE );
433+ }
434+ break ;
435+ case FORMAT_TSV :
436+ if (!contentType .equalsIgnoreCase (CONTENT_TYPE_TEXT_PLAIN )
437+ && !contentType .equalsIgnoreCase (CONTENT_TYPE_TEXT_TSV )) {
438+ failureCollector .addFailure (String .format (
439+ "Valid content types for tsv are %s, %s, %s." , CONTENT_TYPE_TEXT_TSV , CONTENT_TYPE_TEXT_PLAIN ,
440+ DEFAULT_CONTENT_TYPE ), null ).withConfigProperty (NAME_CONTENT_TYPE );
441+ }
442+ break ;
443+ }
444+ }
445+
343446 public String getBucket () {
344447 return GCSPath .from (path ).getBucket ();
345448 }
@@ -383,6 +486,30 @@ public String getLocation() {
383486 return location ;
384487 }
385488
489+ /* This method gets the value of content type. Valid content types for each format are:
490+ *
491+ * avro -> application/avro, application/octet-stream
492+ * json -> application/json, text/plain, application/octet-stream
493+ * csv -> application/csv, text/csv, text/plain, application/octet-stream
494+ * delimited -> application/csv, text/csv, text/plain, text/tsv, application/octet-stream
495+ * orc -> application/octet-stream
496+ * parquet -> application/octet-stream
497+ * tsv -> text/tab-separated-values, application/octet-stream
498+ */
499+ @ Nullable
500+ public String getContentType () {
501+ if (!Strings .isNullOrEmpty (contentType )) {
502+ if (contentType .equals (CONTENT_TYPE_OTHER )) {
503+ if (Strings .isNullOrEmpty (customContentType )) {
504+ return DEFAULT_CONTENT_TYPE ;
505+ }
506+ return customContentType ;
507+ }
508+ return contentType ;
509+ }
510+ return DEFAULT_CONTENT_TYPE ;
511+ }
512+
386513 public Map <String , String > getFileSystemProperties () {
387514 if (fileSystemProperties == null || fileSystemProperties .isEmpty ()) {
388515 return Collections .emptyMap ();
0 commit comments