CDAP-14542 add a property for the dataset project

albertshau · albertshau · commit 14cecf7a4311 · 2018-10-31T17:30:57.000-07:00
Added an optional property for the dataset project, which can be
set when the dataset does not reside in the project in which the
job will be run.

Also fixing warning messages
diff --git a/docs/BigQueryTable-batchsink.md b/docs/BigQueryTable-batchsink.md
@@ -24,7 +24,9 @@ Properties
 **Reference Name:** Name used to uniquely identify this sink for lineage, annotating metadata, etc.
 
 **Project ID**: Google Cloud Project ID, which uniquely identifies a project.
-It can be found on the Dashboard in the Google Cloud Platform Console.
+It can be found on the Dashboard in the Google Cloud Platform Console. This is the project
+that the BigQuery job will run in. If a temporary bucket needs to be created, the service account
+must have permission in this project to create buckets.
 
 **Dataset**: Dataset the table belongs to. A dataset is contained within a specific project.
 Datasets are top-level containers that are used to organize and control access to tables and views.
diff --git a/docs/BigQueryTable-batchsource.md b/docs/BigQueryTable-batchsource.md
@@ -24,7 +24,13 @@ Properties
 **Reference Name:** Name used to uniquely identify this source for lineage, annotating metadata, etc.
 
 **Project ID**: Google Cloud Project ID, which uniquely identifies a project.
-It can be found on the Dashboard in the Google Cloud Platform Console.
+It can be found on the Dashboard in the Google Cloud Platform Console. This is the project
+that the BigQuery job will run in. If a temporary bucket needs to be created, the service account
+must have permission in this project to create buckets.
+
+**Dataset Project ID**: Project the dataset belongs to. This is only required if the dataset is not
+in the same project that the BigQuery job will run in. If no value is given,
+it will default to the configured Project ID.
 
 **Dataset**: Dataset the table belongs to. A dataset is contained within a specific project.
 Datasets are top-level containers that are used to organize and control access to tables and views.
diff --git a/src/main/java/co/cask/gcp/bigquery/BigQuerySource.java b/src/main/java/co/cask/gcp/bigquery/BigQuerySource.java
@@ -91,8 +91,8 @@ public void prepareRun(BatchSourceContext context) throws Exception {
     uuid = UUID.randomUUID();
     configuration = BigQueryUtils.getBigQueryConfig(config.getServiceAccountFilePath(), config.getProject());
 
-    String bucket = config.bucket;
-    if (config.bucket == null) {
+    String bucket = config.getBucket();
+    if (bucket == null) {
       bucket = uuid.toString();
       // By default, this option is false, meaning the job can not delete the bucket. So enable it only when bucket name
       // is not provided.
@@ -106,7 +106,8 @@ public void prepareRun(BatchSourceContext context) throws Exception {
     String temporaryGcsPath = String.format("gs://%s/hadoop/input/%s", bucket, uuid);
     AvroBigQueryInputFormat.setTemporaryCloudStorageDirectory(configuration, temporaryGcsPath);
     AvroBigQueryInputFormat.setEnableShardedExport(configuration, false);
-    BigQueryConfiguration.configureBigQueryInput(configuration, config.getProject(), config.dataset, config.table);
+    BigQueryConfiguration.configureBigQueryInput(configuration, config.getDatasetProject(),
+                                                 config.getDataset(), config.getTable());
 
     Job job = Job.getInstance(configuration);
     job.setOutputKeyClass(LongWritable.class);
@@ -141,7 +142,7 @@ public void transform(KeyValue<LongWritable, GenericData.Record> input, Emitter<
   public void onRunFinish(boolean succeeded, BatchSourceContext context) {
     org.apache.hadoop.fs.Path gcsPath = new org.apache.hadoop.fs.Path(String.format("gs://%s", uuid.toString()));
     try {
-      if (config.bucket == null) {
+      if (config.getBucket() == null) {
           FileSystem fs = gcsPath.getFileSystem(configuration);
           if (fs.exists(gcsPath)) {
             fs.delete(gcsPath, true);
@@ -163,18 +164,20 @@ public void onRunFinish(boolean succeeded, BatchSourceContext context) {
    */
   @Path("getSchema")
   public Schema getSchema(BigQuerySourceConfig request) throws Exception {
-    Table table = BigQueryUtils.getBigQueryTable(request.getServiceAccountFilePath(), request.getProject(),
-                                                 request.dataset, request.table);
-    if (table == null) {
-      // Table does not exist
-      throw new IllegalArgumentException(String.format("BigQuery table '%s.%s' does not exist",
-                                                       request.dataset, request.table));
+    String dataset = request.getDataset();
+    String table = request.getTable();
+    String project = request.getDatasetProject();
+    Table bqTable = BigQueryUtils.getBigQueryTable(request.getServiceAccountFilePath(), project, dataset, table);
+    if (bqTable == null) {
+      // Table does not exist. Check the lookup result here; 'table' is only the table name.
+      throw new IllegalArgumentException(String.format("BigQuery table '%s:%s.%s' does not exist",
+                                                       project, dataset, table));
     }
 
-    com.google.cloud.bigquery.Schema bgSchema = table.getDefinition().getSchema();
+    com.google.cloud.bigquery.Schema bgSchema = bqTable.getDefinition().getSchema();
     if (bgSchema == null) {
-      throw new IllegalArgumentException(String.format("Cannot read from table '%s.%s' because it has no schema.",
-                                                       request.dataset, request.table));
+      throw new IllegalArgumentException(String.format("Cannot read from table '%s:%s.%s' because it has no schema.",
+                                                       project, dataset, table));
     }
     List<Schema.Field> fields = getSchemaFields(bgSchema);
     return Schema.recordOf("output", fields);
@@ -185,32 +188,35 @@ public Schema getSchema(BigQuerySourceConfig request) throws Exception {
    * {@link #getSchema(BigQuerySourceConfig)} method.
    */
   private void validateOutputSchema() throws IOException {
-    Table table = BigQueryUtils.getBigQueryTable(config.getServiceAccountFilePath(), config.getProject(),
-                                                 config.dataset, config.table);
+    String dataset = config.getDataset();
+    String tableName = config.getTable();
+    String project = config.getDatasetProject();
+    // 'table' is the Table looked up in BigQuery; 'tableName' is its name and is what messages must print.
+    Table table = BigQueryUtils.getBigQueryTable(config.getServiceAccountFilePath(), project, dataset, tableName);
     if (table == null) {
       // Table does not exist
-      throw new IllegalArgumentException(String.format("BigQuery table '%s.%s' does not exist.", config.dataset,
-                                                       config.table));
+      throw new IllegalArgumentException(String.format("BigQuery table '%s:%s.%s' does not exist.",
+                                                       project, dataset, tableName));
     }
 
     com.google.cloud.bigquery.Schema bgSchema = table.getDefinition().getSchema();
     if (bgSchema == null) {
-      throw new IllegalArgumentException(String.format("Cannot read from table '%s.%s' because it has no schema.",
-                                                       config.dataset, config.table));
+      throw new IllegalArgumentException(String.format("Cannot read from table '%s:%s.%s' because it has no schema.",
+                                                       project, dataset, tableName));
     }
 
     // Output schema should not have more fields than BigQuery table
     List<String> diff = BigQueryUtils.getSchemaMinusBqFields(config.getSchema().getFields(), bgSchema.getFields());
     if (!diff.isEmpty()) {
       throw new IllegalArgumentException(String.format("Output schema has field(s) '%s' which are not present in table"
-                                                         + " '%s.%s' schema.", diff, config.dataset, config.table));
+                                                         + " '%s:%s.%s' schema.", diff, project, dataset, tableName));
     }
 
     FieldList fields = bgSchema.getFields();
     // Match output schema field type with bigquery column type
     for (Schema.Field field : config.getSchema().getFields()) {
       validateSimpleTypes(field);
-      BigQueryUtils.validateFieldSchemaMatches(fields.get(field.getName()), field, config.dataset, config.table);
+      BigQueryUtils.validateFieldSchemaMatches(fields.get(field.getName()), field, dataset, tableName);
     }
   }
 
diff --git a/src/main/java/co/cask/gcp/bigquery/BigQuerySourceConfig.java b/src/main/java/co/cask/gcp/bigquery/BigQuerySourceConfig.java
@@ -20,7 +20,9 @@
 import co.cask.cdap.api.annotation.Macro;
 import co.cask.cdap.api.annotation.Name;
 import co.cask.cdap.api.data.schema.Schema;
+import co.cask.gcp.common.GCPConfig;
 import co.cask.gcp.common.GCPReferenceSourceConfig;
+import com.google.cloud.ServiceOptions;
 
 import java.io.IOException;
 import javax.annotation.Nullable;
@@ -32,25 +34,62 @@ public final class BigQuerySourceConfig extends GCPReferenceSourceConfig {
   @Macro
   @Description("The dataset the table belongs to. A dataset is contained within a specific project. "
     + "Datasets are top-level containers that are used to organize and control access to tables and views.")
-  public String dataset;
+  private String dataset;
 
   @Macro
   @Description("The table to read from. A table contains individual records organized in rows. "
     + "Each record is composed of columns (also called fields). "
     + "Every table is defined by a schema that describes the column names, data types, and other information.")
-  public String table;
+  private String table;
 
   @Macro
   @Nullable
   @Description("The Google Cloud Storage bucket to store temporary data in. "
     + "It will be automatically created if it does not exist, but will not be automatically deleted. "
-    + "Temporary data will be deleted after it has been read. " +
-    "If it is not provided, a unique bucket will be created and then deleted after the run finishes.")
-  public String bucket;
+    + "Temporary data will be deleted after it has been read. "
+    + "If it is not provided, a unique bucket will be created and then deleted after the run finishes. "
+    + "The service account must have permission to create buckets in the configured project.")
+  private String bucket;
 
   @Macro
   @Description("The schema of the table to read.")
-  public String schema;
+  private String schema;
+
+  @Macro
+  @Nullable
+  @Description("The project the dataset belongs to. This is only required if the dataset is not "
+    + "in the same project that the BigQuery job will run in. If no value is given, it will default to the configured "
+    + "project ID.")
+  private String datasetProject;
+
+  public String getDataset() {
+    return dataset;
+  }
+
+  public String getTable() {
+    return table;
+  }
+
+  @Nullable
+  public String getBucket() {
+    return bucket;
+  }
+
+  /**
+   * Resolves the project that the dataset belongs to.
+   *
+   * @return the default project id from {@link ServiceOptions} if the property is set to auto-detect,
+   *   the configured project id if no dataset project was given (null or empty), or the configured dataset project
+   */
+  public String getDatasetProject() {
+    if (GCPConfig.AUTO_DETECT.equalsIgnoreCase(datasetProject)) {
+      return ServiceOptions.getDefaultProjectId();
+    }
+    // An empty value means "not given", same as null: an empty project id is never a valid target.
+    if (datasetProject == null || datasetProject.isEmpty()) {
+      return getProject();
+    }
+    return datasetProject;
+  }
 
   /**
    * @return the schema of the dataset
diff --git a/src/main/java/co/cask/gcp/bigquery/BigQueryUtils.java b/src/main/java/co/cask/gcp/bigquery/BigQueryUtils.java
@@ -17,6 +17,7 @@
 package co.cask.gcp.bigquery;
 
 import co.cask.cdap.api.data.schema.Schema;
+import co.cask.gcp.gcs.GCSConfigHelper;
 import com.google.cloud.bigquery.BigQuery;
 import com.google.cloud.bigquery.BigQueryOptions;
 import com.google.cloud.bigquery.Field;
@@ -99,6 +100,8 @@ static Configuration getBigQueryConfig(@Nullable String serviceAccountFilePath,
     configuration.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem");
-    configuration.set("fs.AbstractFileSystm.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS");
+    // The key must be spelled "fs.AbstractFileSystem.<scheme>.impl" for Hadoop to pick up the binding.
+    configuration.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS");
     configuration.set("fs.gs.project.id", projectId);
+    configuration.set("fs.gs.working.dir", GCSConfigHelper.ROOT_DIR);
     configuration.set(BigQueryConfiguration.PROJECT_ID_KEY, projectId);
     return configuration;
   }
diff --git a/widgets/BigQueryTable-batchsource.json b/widgets/BigQueryTable-batchsource.json
@@ -23,6 +23,14 @@
             "default": "auto-detect"
           }
         },
+        {
+          "widget-type": "textbox",
+          "label": "Dataset Project ID",
+          "name": "datasetProject",
+          "widget-attributes" : {
+            "placeholder": "Project the dataset belongs to, if different from the Project ID."
+          }
+        },
         {
           "widget-type": "textbox",
           "label": "Dataset",