
Commit 9e6d1de

Merge pull request #49 from data-integrations/bugfix_release/CDAP-14490-spanner-sink
CDAP-14490 auto create spanner table and database if does not exist
2 parents 01c7424 + bc0135d commit 9e6d1de

Showing 14 changed files with 450 additions and 50 deletions.


docs/Spanner-batchsink.md

Lines changed: 5 additions & 1 deletion
@@ -30,11 +30,15 @@ It can be found on the Dashboard in the Google Cloud Platform Console.
 Instance is an allocation of resources that is used by Cloud Spanner databases created in that instance.
 
 **Database Name**: Database the Spanner table belongs to.
-Spanner database is contained within a specific Spanner instance.
+Spanner database is contained within a specific Spanner instance. If the database does not exist, it will get created.
 
 **Table Name**: Table to write to. A table contains individual records organized in rows.
 Each record is composed of columns (also called fields).
 Every table is defined by a schema that describes the column names, data types, and other information.
+If the table does not exist, it will get created.
+
+**Primary Key**: If the table does not exist, a primary key must be provided in order to auto-create the table.
+The key can be a composite key of multiple fields in the schema. This is not required if the table already exists.
 
 **Service Account File Path**: Path on the local file system of the service account key used for
 authorization. Can be set to 'auto-detect' when running on a Dataproc cluster.
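The Spanner sink implementation itself is among this commit's 14 changed files but is not shown in this excerpt. As a rough sketch of the auto-create behavior the doc now describes, using the google-cloud-spanner admin client (the identifiers, DDL handling, and existence checks here are illustrative assumptions, not the plugin's actual code):

import com.google.cloud.spanner.DatabaseAdminClient;
import com.google.cloud.spanner.ErrorCode;
import com.google.cloud.spanner.Spanner;
import com.google.cloud.spanner.SpannerException;
import com.google.cloud.spanner.SpannerOptions;
import java.util.Collections;

public class SpannerAutoCreateSketch {
  // Creates the database (and table) if missing. A real implementation would
  // derive the CREATE TABLE DDL from the output schema and the configured
  // Primary Key, and would skip the DDL update if the table already exists.
  public static void ensureDatabaseAndTable(String instanceId, String databaseId,
                                            String createTableDdl) throws Exception {
    Spanner spanner = SpannerOptions.newBuilder().build().getService();
    try {
      DatabaseAdminClient admin = spanner.getDatabaseAdminClient();
      try {
        admin.getDatabase(instanceId, databaseId);  // throws NOT_FOUND if absent
        // Database exists: add the table with a DDL update.
        admin.updateDatabaseDdl(instanceId, databaseId,
                                Collections.singletonList(createTableDdl), null).get();
      } catch (SpannerException e) {
        if (e.getErrorCode() != ErrorCode.NOT_FOUND) {
          throw e;
        }
        // Database missing: create database and table in a single operation.
        admin.createDatabase(instanceId, databaseId,
                             Collections.singletonList(createTableDdl)).get();
      }
    } finally {
      spanner.close();
    }
  }
}

For a schema with fields id and name and Primary Key id, the generated DDL would look something like CREATE TABLE users (id INT64, name STRING(MAX)) PRIMARY KEY (id).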

src/main/java/co/cask/gcp/bigquery/BigQuerySink.java

Lines changed: 16 additions & 14 deletions
@@ -96,12 +96,12 @@ public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
   @Override
   public void prepareRun(BatchSinkContext context) throws Exception {
     BigQuery bigquery = BigQueryUtils.getBigQuery(config.getServiceAccountFilePath(), config.getProject());
-    // create dataset if dataset does not exist
-    if (bigquery.getDataset(config.dataset) == null) {
+    // create dataset if it does not exist
+    if (bigquery.getDataset(config.getDataset()) == null) {
       try {
-        bigquery.create(DatasetInfo.newBuilder(config.dataset).build());
+        bigquery.create(DatasetInfo.newBuilder(config.getDataset()).build());
       } catch (BigQueryException e) {
-        throw new RuntimeException("Exception occured while creating dataset " + config.dataset + ".", e);
+        throw new RuntimeException("Exception occurred while creating dataset " + config.getDataset() + ".", e);
       }
     }
 
@@ -120,8 +120,8 @@ public void prepareRun(BatchSinkContext context) throws Exception {
       fields.add(tableFieldSchema);
     }
 
-    String bucket = config.bucket;
-    if (config.bucket == null) {
+    String bucket = config.getBucket();
+    if (config.getBucket() == null) {
       bucket = uuid.toString();
       // By default, this option is false, meaning the job can not delete the bucket. So enable it only when bucket name
       // is not provided.
@@ -135,7 +135,7 @@ public void prepareRun(BatchSinkContext context) throws Exception {
 
     BigQueryOutputConfiguration.configure(
       configuration,
-      String.format("%s.%s", config.dataset, config.table),
+      String.format("%s.%s", config.getDataset(), config.getTable()),
       new BigQueryTableSchema().setFields(fields),
       temporaryGcsPath,
       BigQueryFileFormat.NEWLINE_DELIMITED_JSON,
@@ -165,7 +165,7 @@ public void transform(StructuredRecord input, Emitter<KeyValue<JsonObject, NullW
 
   @Override
   public void onRunFinish(boolean succeeded, BatchSinkContext context) {
-    if (config.bucket == null) {
+    if (config.getBucket() == null) {
       Path gcsPath = new Path(String.format("gs://%s", uuid.toString()));
       try {
         FileSystem fs = gcsPath.getFileSystem(configuration);
@@ -216,7 +216,7 @@ private LegacySQLTypeName getTableDataType(Schema schema) {
   }
 
   private void setOutputFormat(BatchSinkContext context) {
-    context.addOutput(Output.of(config.referenceName, new OutputFormatProvider() {
+    context.addOutput(Output.of(config.getReferenceName(), new OutputFormatProvider() {
       @Override
       public String getOutputFormatClassName() {
         return IndirectBigQueryOutputFormat.class.getName();
@@ -234,7 +234,7 @@ public Map<String, String> getOutputFormatConfiguration() {
   }
 
   private void emitLineage(BatchSinkContext context, List<BigQueryTableFieldSchema> fields) {
-    LineageRecorder lineageRecorder = new LineageRecorder(context, config.referenceName);
+    LineageRecorder lineageRecorder = new LineageRecorder(context, config.getReferenceName());
     lineageRecorder.createExternalDataset(config.getSchema());
 
     if (!fields.isEmpty()) {
@@ -302,7 +302,7 @@ private static void decodeSimpleTypes(JsonObject json, String name, StructuredRe
    */
   private void validateSchema() throws IOException {
     Table table = BigQueryUtils.getBigQueryTable(config.getServiceAccountFilePath(), config.getProject(),
-                                                 config.dataset, config.table);
+                                                 config.getDataset(), config.getTable());
     if (table == null) {
       // Table does not exist, so no further validation is required.
       return;
@@ -322,7 +322,8 @@ private void validateSchema() throws IOException {
     if (!diff.isEmpty()) {
       throw new IllegalArgumentException(
         String.format("The output schema does not match the BigQuery table schema for '%s.%s' table. " +
-                        "The table does not contain the '%s' column(s).", config.dataset, config.table, diff));
+                        "The table does not contain the '%s' column(s).",
+                      config.getDataset(), config.getTable(), diff));
     }
 
     // validate the missing columns in output schema are nullable fields in bigquery
@@ -332,14 +333,15 @@ private void validateSchema() throws IOException {
         throw new IllegalArgumentException(
           String.format("The output schema does not match the BigQuery table schema for '%s.%s'. " +
                           "The table requires column '%s', which is not in the output schema.",
-                        config.dataset, config.table, field));
+                        config.getDataset(), config.getTable(), field));
       }
     }
 
     // Match output schema field type with bigquery column type
     for (Schema.Field field : config.getSchema().getFields()) {
       validateSimpleTypes(field);
-      BigQueryUtils.validateFieldSchemaMatches(bqFields.get(field.getName()), field, config.dataset, config.table);
+      BigQueryUtils.validateFieldSchemaMatches(bqFields.get(field.getName()),
+                                               field, config.getDataset(), config.getTable());
     }
   }
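For context, the dataset auto-creation above relies on the google-cloud-bigquery client returning null for a missing dataset rather than throwing. A self-contained sketch of the same check-then-create pattern, with an illustrative dataset name:

import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.BigQueryException;
import com.google.cloud.bigquery.BigQueryOptions;
import com.google.cloud.bigquery.DatasetInfo;

public class EnsureDatasetSketch {
  public static void ensureDataset(String datasetName) {
    BigQuery bigquery = BigQueryOptions.getDefaultInstance().getService();
    // getDataset returns null when the dataset does not exist
    if (bigquery.getDataset(datasetName) == null) {
      try {
        bigquery.create(DatasetInfo.newBuilder(datasetName).build());
      } catch (BigQueryException e) {
        throw new RuntimeException("Exception occurred while creating dataset " + datasetName + ".", e);
      }
    }
  }

  public static void main(String[] args) {
    ensureDataset("my_dataset");  // illustrative name
  }
}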

src/main/java/co/cask/gcp/bigquery/BigQuerySinkConfig.java

Lines changed: 26 additions & 5 deletions
@@ -18,7 +18,6 @@
 
 import co.cask.cdap.api.annotation.Description;
 import co.cask.cdap.api.annotation.Macro;
-import co.cask.cdap.api.annotation.Name;
 import co.cask.cdap.api.data.schema.Schema;
 import co.cask.gcp.common.GCPReferenceSinkConfig;
 
@@ -33,25 +32,47 @@ public final class BigQuerySinkConfig extends GCPReferenceSinkConfig {
   @Macro
   @Description("The dataset to write to. A dataset is contained within a specific project. "
     + "Datasets are top-level containers that are used to organize and control access to tables and views.")
-  public String dataset;
+  private String dataset;
 
   @Macro
   @Description("The table to write to. A table contains individual records organized in rows. "
     + "Each record is composed of columns (also called fields). "
     + "Every table is defined by a schema that describes the column names, data types, and other information.")
-  public String table;
+  private String table;
 
   @Macro
   @Nullable
   @Description("The Google Cloud Storage bucket to store temporary data in. "
     + "It will be automatically created if it does not exist, but will not be automatically deleted. "
     + "Cloud Storage data will be deleted after it is loaded into BigQuery. " +
     "If it is not provided, a unique bucket will be created and then deleted after the run finishes.")
-  public String bucket;
+  private String bucket;
 
   @Macro
   @Description("The schema of the data to write. Must be compatible with the table schema.")
-  public String schema;
+  private String schema;
+
+  public BigQuerySinkConfig(String referenceName, String dataset, String table,
+                            @Nullable String bucket, String schema) {
+    this.referenceName = referenceName;
+    this.dataset = dataset;
+    this.table = table;
+    this.bucket = bucket;
+    this.schema = schema;
+  }
+
+  public String getDataset() {
+    return dataset;
+  }
+
+  public String getTable() {
+    return table;
+  }
+
+  @Nullable
+  public String getBucket() {
+    return bucket;
+  }
 
   /**
    * @return the schema of the dataset
src/main/java/co/cask/gcp/common/GCPReferenceSinkConfig.java

Lines changed: 5 additions & 1 deletion
@@ -28,12 +28,16 @@ public class GCPReferenceSinkConfig extends GCPConfig {
   @Name("referenceName")
   @Description("This will be used to uniquely identify this sink for lineage, annotating metadata, etc.")
   @Macro
-  public String referenceName;
+  protected String referenceName;
 
   /**
    * Validates the given referenceName to consists of characters allowed to represent a dataset.
    */
   public void validate() {
     IdUtils.validateId(referenceName);
   }
+
+  public String getReferenceName() {
+    return referenceName;
+  }
 }

src/main/java/co/cask/gcp/gcs/sink/GCSBatchSink.java

Lines changed: 10 additions & 5 deletions
@@ -113,6 +113,16 @@ public static class GCSBatchSinkConfig extends GCPReferenceSinkConfig implements
     @Nullable
     private String schema;
 
+    public GCSBatchSinkConfig(String referenceName, String path, @Nullable String suffix, String format,
+                              @Nullable String delimiter, @Nullable String schema) {
+      this.referenceName = referenceName;
+      this.path = path;
+      this.suffix = suffix;
+      this.format = format;
+      this.delimiter = delimiter;
+      this.schema = schema;
+    }
+
     @Override
     public void validate() {
       super.validate();
@@ -129,11 +139,6 @@ public void validate() {
       getSchema();
     }
 
-    @Override
-    public String getReferenceName() {
-      return referenceName;
-    }
-
     @Override
     public String getPath() {
       return GCSConfigHelper.getPath(path).toString();

src/main/java/co/cask/gcp/publisher/GooglePublisher.java

Lines changed: 31 additions & 9 deletions
@@ -101,12 +101,12 @@ public void prepareRun(BatchSinkContext context) throws IOException {
     }
 
     Schema inputSchema = context.getInputSchema();
-    LineageRecorder lineageRecorder = new LineageRecorder(context, config.referenceName);
+    LineageRecorder lineageRecorder = new LineageRecorder(context, config.getReferenceName());
     lineageRecorder.createExternalDataset(inputSchema);
 
     Configuration configuration = new Configuration();
     PubSubOutputFormat.configure(configuration, config);
-    context.addOutput(Output.of(config.referenceName,
+    context.addOutput(Output.of(config.getReferenceName(),
                                 new SinkOutputFormatProvider(PubSubOutputFormat.class, configuration)));
 
     // record field level lineage information
@@ -129,38 +129,51 @@ public void transform(StructuredRecord input, Emitter<KeyValue<NullWritable, Tex
   public static class Config extends GCPReferenceSinkConfig {
     @Description("Cloud Pub/Sub topic to publish records to")
     @Macro
-    public String topic;
+    private String topic;
 
     // batching options
     @Description("Maximum count of messages in a batch. The default value is 100.")
     @Macro
     @Nullable
-    public Long messageCountBatchSize;
+    private Long messageCountBatchSize;
 
     @Description("Maximum size of a batch in kilo bytes. The default value is 1KB.")
     @Macro
     @Nullable
-    public Long requestThresholdKB;
+    private Long requestThresholdKB;
 
     @Description("Maximum delay in milli-seconds for publishing the batched messages. The default value is 1 ms.")
     @Macro
     @Nullable
-    public Long publishDelayThresholdMillis;
+    private Long publishDelayThresholdMillis;
 
     @Description("Maximum number of message publishing failures to tolerate per partition " +
       "before the pipeline will be failed. The default value is 0.")
     @Macro
     @Nullable
-    public Long errorThreshold;
+    private Long errorThreshold;
 
     @Description("Maximum amount of time in seconds to spend retrying publishing failures. " +
       "The default value is 30 seconds.")
     @Macro
     @Nullable
-    public Integer retryTimeoutSeconds;
-
+    private Integer retryTimeoutSeconds;
+
+
+    public Config(String referenceName, String topic, @Nullable Long messageCountBatchSize,
+                  @Nullable Long requestThresholdKB, @Nullable Long publishDelayThresholdMillis,
+                  @Nullable Long errorThreshold, @Nullable Integer retryTimeoutSeconds) {
+      this.referenceName = referenceName;
+      this.topic = topic;
+      this.messageCountBatchSize = messageCountBatchSize;
+      this.requestThresholdKB = requestThresholdKB;
+      this.publishDelayThresholdMillis = publishDelayThresholdMillis;
+      this.errorThreshold = errorThreshold;
+      this.retryTimeoutSeconds = retryTimeoutSeconds;
+    }
 
     public void validate() {
+      super.validate();
       if (!containsMacro("messageCountBatchSize") && messageCountBatchSize != null && messageCountBatchSize < 1) {
         throw new IllegalArgumentException("Maximum count of messages in a batch should be positive for Pub/Sub");
       }
@@ -199,5 +212,14 @@ public long getErrorThreshold() {
     public int getRetryTimeoutSeconds() {
       return retryTimeoutSeconds == null ? 30 : retryTimeoutSeconds;
     }
+
+    public String getTopic() {
+      return topic;
+    }
+
+    @Nullable
+    public Long getRequestThresholdKB() {
+      return requestThresholdKB;
+    }
   }
 }
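Note the added super.validate() call: previously Config.validate() shadowed GCPReferenceSinkConfig.validate() without invoking it, so the referenceName check was skipped for the Pub/Sub sink. A hypothetical sketch of what the fix enables; whether IdUtils.validateId rejects this particular string is an assumption:

import co.cask.gcp.publisher.GooglePublisher;

public class PublisherConfigValidateExample {
  public static void main(String[] args) {
    // referenceName with characters assumed to be invalid for an identifier
    GooglePublisher.Config config = new GooglePublisher.Config(
      "bad name!", "my-topic", null, null, null, null, null);
    config.validate();  // now also runs IdUtils.validateId(referenceName) via super.validate()
  }
}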

src/main/java/co/cask/gcp/publisher/PubSubOutputFormat.java

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@ public static void configure(Configuration configuration, GooglePublisher.Config
     }
     String projectId = config.getProject();
     configuration.set(PROJECT, projectId);
-    configuration.set(TOPIC, config.topic);
+    configuration.set(TOPIC, config.getTopic());
     configuration.set(COUNT_BATCH_SIZE, String.valueOf(config.getMessageCountBatchSize()));
     configuration.set(REQUEST_BYTES_THRESHOLD, String.valueOf(config.getRequestBytesThreshold()));
     configuration.set(DELAY_THRESHOLD, String.valueOf(config.getPublishDelayThresholdMillis()));
