Skip to content

Commit 05b09b4

Browse files
authored
Merge pull request #58 from data-integrations/bugfix_release/CDAP-14542-bq-job-project-property
CDAP-14542 add a property for the dataset project
2 parents 8754263 + 14cecf7 commit 05b09b4

File tree

6 files changed

+80
-27
lines changed

6 files changed

+80
-27
lines changed

docs/BigQueryTable-batchsink.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@ Properties
2424
**Reference Name:** Name used to uniquely identify this sink for lineage, annotating metadata, etc.
2525

2626
**Project ID**: Google Cloud Project ID, which uniquely identifies a project.
27-
It can be found on the Dashboard in the Google Cloud Platform Console.
27+
It can be found on the Dashboard in the Google Cloud Platform Console. This is the project
28+
that the BigQuery job will run in. If a temporary bucket needs to be created, the service account
29+
must have permission in this project to create buckets.
2830

2931
**Dataset**: Dataset the table belongs to. A dataset is contained within a specific project.
3032
Datasets are top-level containers that are used to organize and control access to tables and views.

docs/BigQueryTable-batchsource.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,13 @@ Properties
2424
**Reference Name:** Name used to uniquely identify this source for lineage, annotating metadata, etc.
2525

2626
**Project ID**: Google Cloud Project ID, which uniquely identifies a project.
27-
It can be found on the Dashboard in the Google Cloud Platform Console.
27+
It can be found on the Dashboard in the Google Cloud Platform Console. This is the project
28+
that the BigQuery job will run in. If a temporary bucket needs to be created, the service account
29+
must have permission in this project to create buckets.
30+
31+
**Dataset Project**: Project the dataset belongs to. This is only required if the dataset is not
32+
in the same project that the BigQuery job will run in. If no value is given,
33+
it will default to the configured Project ID.
2834

2935
**Dataset**: Dataset the table belongs to. A dataset is contained within a specific project.
3036
Datasets are top-level containers that are used to organize and control access to tables and views.

src/main/java/co/cask/gcp/bigquery/BigQuerySource.java

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,8 @@ public void prepareRun(BatchSourceContext context) throws Exception {
9191
uuid = UUID.randomUUID();
9292
configuration = BigQueryUtils.getBigQueryConfig(config.getServiceAccountFilePath(), config.getProject());
9393

94-
String bucket = config.bucket;
95-
if (config.bucket == null) {
94+
String bucket = config.getBucket();
95+
if (bucket == null) {
9696
bucket = uuid.toString();
9797
// By default, this option is false, meaning the job can not delete the bucket. So enable it only when bucket name
9898
// is not provided.
@@ -106,7 +106,8 @@ public void prepareRun(BatchSourceContext context) throws Exception {
106106
String temporaryGcsPath = String.format("gs://%s/hadoop/input/%s", bucket, uuid);
107107
AvroBigQueryInputFormat.setTemporaryCloudStorageDirectory(configuration, temporaryGcsPath);
108108
AvroBigQueryInputFormat.setEnableShardedExport(configuration, false);
109-
BigQueryConfiguration.configureBigQueryInput(configuration, config.getProject(), config.dataset, config.table);
109+
BigQueryConfiguration.configureBigQueryInput(configuration, config.getDatasetProject(),
110+
config.getDataset(), config.getTable());
110111

111112
Job job = Job.getInstance(configuration);
112113
job.setOutputKeyClass(LongWritable.class);
@@ -141,7 +142,7 @@ public void transform(KeyValue<LongWritable, GenericData.Record> input, Emitter<
141142
public void onRunFinish(boolean succeeded, BatchSourceContext context) {
142143
org.apache.hadoop.fs.Path gcsPath = new org.apache.hadoop.fs.Path(String.format("gs://%s", uuid.toString()));
143144
try {
144-
if (config.bucket == null) {
145+
if (config.getBucket() == null) {
145146
FileSystem fs = gcsPath.getFileSystem(configuration);
146147
if (fs.exists(gcsPath)) {
147148
fs.delete(gcsPath, true);
@@ -163,18 +164,20 @@ public void onRunFinish(boolean succeeded, BatchSourceContext context) {
163164
*/
164165
@Path("getSchema")
165166
public Schema getSchema(BigQuerySourceConfig request) throws Exception {
166-
Table table = BigQueryUtils.getBigQueryTable(request.getServiceAccountFilePath(), request.getProject(),
167-
request.dataset, request.table);
167+
String dataset = request.getDataset();
168+
String table = request.getTable();
169+
String project = request.getDatasetProject();
170+
Table bqTable = BigQueryUtils.getBigQueryTable(request.getServiceAccountFilePath(), project, dataset, table);
168171
if (table == null) {
169172
// Table does not exist
170-
throw new IllegalArgumentException(String.format("BigQuery table '%s.%s' does not exist",
171-
request.dataset, request.table));
173+
throw new IllegalArgumentException(String.format("BigQuery table '%s:%s.%s' does not exist",
174+
project, dataset, table));
172175
}
173176

174-
com.google.cloud.bigquery.Schema bgSchema = table.getDefinition().getSchema();
177+
com.google.cloud.bigquery.Schema bgSchema = bqTable.getDefinition().getSchema();
175178
if (bgSchema == null) {
176-
throw new IllegalArgumentException(String.format("Cannot read from table '%s.%s' because it has no schema.",
177-
request.dataset, request.table));
179+
throw new IllegalArgumentException(String.format("Cannot read from table '%s:%s.%s' because it has no schema.",
180+
project, dataset, table));
178181
}
179182
List<Schema.Field> fields = getSchemaFields(bgSchema);
180183
return Schema.recordOf("output", fields);
@@ -185,32 +188,34 @@ public Schema getSchema(BigQuerySourceConfig request) throws Exception {
185188
* {@link #getSchema(BigQuerySourceConfig)} method.
186189
*/
187190
private void validateOutputSchema() throws IOException {
188-
Table table = BigQueryUtils.getBigQueryTable(config.getServiceAccountFilePath(), config.getProject(),
189-
config.dataset, config.table);
191+
String dataset = config.getDataset();
192+
String tableName = config.getTable();
193+
String project = config.getDatasetProject();
194+
Table table = BigQueryUtils.getBigQueryTable(config.getServiceAccountFilePath(), project, dataset, tableName);
190195
if (table == null) {
191196
// Table does not exist
192-
throw new IllegalArgumentException(String.format("BigQuery table '%s.%s' does not exist.", config.dataset,
193-
config.table));
197+
throw new IllegalArgumentException(String.format("BigQuery table '%s:%s.%s' does not exist.",
198+
project, dataset, table));
194199
}
195200

196201
com.google.cloud.bigquery.Schema bgSchema = table.getDefinition().getSchema();
197202
if (bgSchema == null) {
198-
throw new IllegalArgumentException(String.format("Cannot read from table '%s.%s' because it has no schema.",
199-
config.dataset, config.table));
203+
throw new IllegalArgumentException(String.format("Cannot read from table '%s:%s.%s' because it has no schema.",
204+
project, dataset, table));
200205
}
201206

202207
// Output schema should not have more fields than BigQuery table
203208
List<String> diff = BigQueryUtils.getSchemaMinusBqFields(config.getSchema().getFields(), bgSchema.getFields());
204209
if (!diff.isEmpty()) {
205210
throw new IllegalArgumentException(String.format("Output schema has field(s) '%s' which are not present in table"
206-
+ " '%s.%s' schema.", diff, config.dataset, config.table));
211+
+ " '%s:%s.%s' schema.", diff, project, dataset, table));
207212
}
208213

209214
FieldList fields = bgSchema.getFields();
210215
// Match output schema field type with bigquery column type
211216
for (Schema.Field field : config.getSchema().getFields()) {
212217
validateSimpleTypes(field);
213-
BigQueryUtils.validateFieldSchemaMatches(fields.get(field.getName()), field, config.dataset, config.table);
218+
BigQueryUtils.validateFieldSchemaMatches(fields.get(field.getName()), field, dataset, tableName);
214219
}
215220
}
216221

src/main/java/co/cask/gcp/bigquery/BigQuerySourceConfig.java

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@
2020
import co.cask.cdap.api.annotation.Macro;
2121
import co.cask.cdap.api.annotation.Name;
2222
import co.cask.cdap.api.data.schema.Schema;
23+
import co.cask.gcp.common.GCPConfig;
2324
import co.cask.gcp.common.GCPReferenceSourceConfig;
25+
import com.google.cloud.ServiceOptions;
2426

2527
import java.io.IOException;
2628
import javax.annotation.Nullable;
@@ -32,25 +34,53 @@ public final class BigQuerySourceConfig extends GCPReferenceSourceConfig {
3234
@Macro
3335
@Description("The dataset the table belongs to. A dataset is contained within a specific project. "
3436
+ "Datasets are top-level containers that are used to organize and control access to tables and views.")
35-
public String dataset;
37+
private String dataset;
3638

3739
@Macro
3840
@Description("The table to read from. A table contains individual records organized in rows. "
3941
+ "Each record is composed of columns (also called fields). "
4042
+ "Every table is defined by a schema that describes the column names, data types, and other information.")
41-
public String table;
43+
private String table;
4244

4345
@Macro
4446
@Nullable
4547
@Description("The Google Cloud Storage bucket to store temporary data in. "
4648
+ "It will be automatically created if it does not exist, but will not be automatically deleted. "
47-
+ "Temporary data will be deleted after it has been read. " +
48-
"If it is not provided, a unique bucket will be created and then deleted after the run finishes.")
49-
public String bucket;
49+
+ "Temporary data will be deleted after it has been read. "
50+
+ "If it is not provided, a unique bucket will be created and then deleted after the run finishes. "
51+
+ "The service account must have permission to create buckets in the configured project.")
52+
private String bucket;
5053

5154
@Macro
5255
@Description("The schema of the table to read.")
53-
public String schema;
56+
private String schema;
57+
58+
@Macro
59+
@Nullable
60+
@Description("The project the dataset belongs to. This is only required if the dataset is not "
61+
+ "in the same project that the BigQuery job will run in. If no value is given, it will default to the configured "
62+
+ "project ID.")
63+
private String datasetProject;
64+
65+
public String getDataset() {
66+
return dataset;
67+
}
68+
69+
public String getTable() {
70+
return table;
71+
}
72+
73+
@Nullable
74+
public String getBucket() {
75+
return bucket;
76+
}
77+
78+
public String getDatasetProject() {
79+
if (GCPConfig.AUTO_DETECT.equalsIgnoreCase(datasetProject)) {
80+
return ServiceOptions.getDefaultProjectId();
81+
}
82+
return datasetProject == null ? getProject() : datasetProject;
83+
}
5484

5585
/**
5686
* @return the schema of the dataset

src/main/java/co/cask/gcp/bigquery/BigQueryUtils.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
package co.cask.gcp.bigquery;
1818

1919
import co.cask.cdap.api.data.schema.Schema;
20+
import co.cask.gcp.gcs.GCSConfigHelper;
2021
import com.google.cloud.bigquery.BigQuery;
2122
import com.google.cloud.bigquery.BigQueryOptions;
2223
import com.google.cloud.bigquery.Field;
@@ -99,6 +100,7 @@ static Configuration getBigQueryConfig(@Nullable String serviceAccountFilePath,
99100
configuration.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem");
100101
configuration.set("fs.AbstractFileSystm.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS");
101102
configuration.set("fs.gs.project.id", projectId);
103+
configuration.set("fs.gs.working.dir", GCSConfigHelper.ROOT_DIR);
102104
configuration.set(BigQueryConfiguration.PROJECT_ID_KEY, projectId);
103105
return configuration;
104106
}

widgets/BigQueryTable-batchsource.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,14 @@
2323
"default": "auto-detect"
2424
}
2525
},
26+
{
27+
"widget-type": "textbox",
28+
"label": "Dataset Project ID",
29+
"name": "datasetProject",
30+
"widget-attributes" : {
31+
"placeholder": "Project the dataset belongs to, if different from the Project ID."
32+
}
33+
},
2634
{
2735
"widget-type": "textbox",
2836
"label": "Dataset",

0 commit comments

Comments (0)