Description
-----------
This source is used whenever you need to read from Amazon S3.
For example, you may want to read in log files from S3 every hour and then store
the logs in a TimePartitionedFileSet.

Properties
----------
**Reference Name:** Name used to uniquely identify this source for lineage, annotating metadata, etc.

**Path:** Path to read from. For example, s3a://<bucket>/path/to/input

**Format:** Format of the data to read.
The format must be one of 'avro', 'blob', 'csv', 'delimited', 'json', 'parquet', 'text', or 'tsv'.
If the format is 'blob', every input file will be read into a separate record.
The 'blob' format also requires a schema that contains a field named 'body' of type 'bytes'.
If the format is 'text', the schema must contain a field named 'body' of type 'string'.

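As a sketch, a minimal output schema for the 'text' format could be expressed as the following Avro-style JSON (the record name is arbitrary); for the 'blob' format, the 'body' field would instead be declared with type 'bytes':

    {
        "type": "record",
        "name": "textRecord",
        "fields": [
            { "name": "body", "type": "string" }
        ]
    }
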
**Delimiter:** Delimiter to use when the format is 'delimited'. This will be ignored for other formats.

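For instance, with the delimiter set to '|', a hypothetical input line such as 2018-01-01|paris|3 would be parsed into the three fields '2018-01-01', 'paris', and '3'.
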
**Authentication Method:** Authentication method to access S3. The default value is Access Credentials.
IAM can only be used if the plugin is run in an AWS environment, such as on EMR.

**Access ID:** Amazon access ID required for authentication.

**Access Key:** Amazon access key required for authentication.

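As a minimal sketch, a pipeline configuration using Access Credentials might set these properties as shown below; the JSON property names (authenticationMethod, accessID, accessKey, path, format) are assumptions based on the property names above, and all values are placeholders:

    {
        "name": "S3",
        "type": "batchsource",
        "properties": {
            "authenticationMethod": "Access Credentials",
            "accessID": "my-access-id",
            "accessKey": "my-access-key",
            "path": "s3a://my-bucket/path/to/input",
            "format": "text"
        }
    }
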
**Maximum Split Size:** Maximum size in bytes for each input partition.
Smaller partitions will increase the level of parallelism, but will require more resources and overhead.
The default value is 128MB.

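For example, with the default 128MB split size, a splittable 1GB input file would be divided into roughly eight partitions that can be processed in parallel.
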
**Path Field:** Output field in which to place the path of the file that the record was read from.
If not specified, the file path will not be included in output records.
If specified, the field must exist in the output schema as a string.

**Path Filename Only:** Whether to use only the filename instead of the full URI of the file path when a path field is given.
The default value is false.

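For example, for a record read from a hypothetical object s3a://my-bucket/logs/2018-01-01.txt, the path field would contain that full URI; with Path Filename Only set to true, it would contain just 2018-01-01.txt.
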
**Read Files Recursively:** Whether files are to be read recursively from the path. The default value is false.

**Allow Empty Input:** Whether to allow an input path that contains no data. When set to false, the plugin
will fail when there is no data to read. When set to true, no error will be thrown and zero records will be read.

**File System Properties:** Additional properties to use with the InputFormat when reading the data.

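For example, these could be used to point the S3A filesystem at a specific region endpoint. A minimal sketch, assuming the value is given as a JSON map of Hadoop property names to values and using the S3A connector's fs.s3a.endpoint property, might look like:

    {
        "fs.s3a.endpoint": "s3.eu-west-1.amazonaws.com"
    }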