1818
1919import com .google .api .gax .paging .Page ;
2020import com .google .auth .oauth2 .GoogleCredentials ;
21+ import com .google .cloud .RetryOption ;
2122import com .google .cloud .bigquery .BigQuery ;
2223import com .google .cloud .bigquery .BigQueryException ;
2324import com .google .cloud .bigquery .Dataset ;
4950import io .cdap .cdap .etl .api .connector .DirectConnector ;
5051import io .cdap .cdap .etl .api .connector .PluginSpec ;
5152import io .cdap .cdap .etl .api .connector .SampleRequest ;
53+ import io .cdap .cdap .etl .api .connector .SampleType ;
5254import io .cdap .cdap .etl .api .engine .sql .BatchSQLEngine ;
5355import io .cdap .cdap .etl .api .validation .ValidationException ;
5456import io .cdap .plugin .common .ConfigUtil ;
6264import io .cdap .plugin .gcp .bigquery .util .BigQueryDataParser ;
6365import io .cdap .plugin .gcp .bigquery .util .BigQueryUtil ;
6466import io .cdap .plugin .gcp .common .GCPUtils ;
67+ import org .threeten .bp .Duration ;
6568
6669import java .io .IOException ;
6770import java .util .HashMap ;
@@ -95,8 +98,13 @@ public List<StructuredRecord> sample(ConnectorContext context, SampleRequest sam
9598 throw new IllegalArgumentException ("Path should contain both dataset and table name." );
9699 }
97100 String dataset = path .getDataset ();
98- return getTableData (getBigQuery (config .getProject ()), config .getDatasetProject (), dataset , table ,
99- sampleRequest .getLimit ());
101+ String query = getTableQuery (String .format ("`%s.%s.%s`" , config .getDatasetProject (), dataset , table ),
102+ sampleRequest .getLimit (),
103+ SampleType .fromString (sampleRequest .getProperties ().get ("sampleType" )),
104+ sampleRequest .getProperties ().get ("strata" ),
105+ UUID .randomUUID ().toString ().replace ("-" , "_" ));
106+ String id = UUID .randomUUID ().toString ();
107+ return getQueryResult (waitForJob (getBigQuery (config .getProject ()), query , sampleRequest .getTimeoutMs (), id ), id );
100108 }
101109
102110 @ Override
@@ -117,7 +125,7 @@ public void test(ConnectorContext context) throws ValidationException {
117125 GCPUtils .loadServiceAccountCredentials (config .getServiceAccount (), config .isServiceAccountFilePath ());
118126 } catch (Exception e ) {
119127 failureCollector .addFailure (String .format ("Service account key provided is not valid: %s" , e .getMessage ()),
120- "Please provide a valid service account key." );
128+ "Please provide a valid service account key." );
121129 }
122130 }
123131 // if either project or credentials cannot be loaded , no need to continue
@@ -130,7 +138,7 @@ public void test(ConnectorContext context) throws ValidationException {
130138 bigQuery .listDatasets (BigQuery .DatasetListOption .pageSize (1 ));
131139 } catch (Exception e ) {
132140 failureCollector .addFailure (String .format ("Could not connect to BigQuery: %s" , e .getMessage ()),
133- "Please specify correct connection properties." );
141+ "Please specify correct connection properties." );
134142 }
135143 }
136144
@@ -143,15 +151,15 @@ public BrowseDetail browse(ConnectorContext context, BrowseRequest browseRequest
143151 if (dataset == null ) {
144152 // browse project to list all datasets
145153 return config .rootDataset == null ?
146- listDatasets (getBigQuery (config .getDatasetProject ()), browseRequest .getLimit ()) :
147- BrowseDetail .builder ().setTotalCount (1 ).addEntity (
148- BrowseEntity .builder (config .rootDataset , "/" + config .rootDataset , ENTITY_TYPE_DATASET )
149- .canBrowse (true ).build ()).build ();
154+ listDatasets (getBigQuery (config .getDatasetProject ()), browseRequest .getLimit ()) :
155+ BrowseDetail .builder ().setTotalCount (1 ).addEntity (
156+ BrowseEntity .builder (config .rootDataset , "/" + config .rootDataset , ENTITY_TYPE_DATASET )
157+ .canBrowse (true ).build ()).build ();
150158 }
151159 String table = path .getTable ();
152160 if (table == null ) {
153161 return listTables (getBigQuery (config .getProject ()), config .getDatasetProject (), dataset ,
154- browseRequest .getLimit ());
162+ browseRequest .getLimit ());
155163 }
156164 return getTableDetail (getBigQuery (config .getProject ()), config .getDatasetProject (), dataset , table );
157165 }
@@ -202,7 +210,7 @@ private BrowseDetail listTables(BigQuery bigQuery, String datasetProject, String
202210
203211 private BrowseDetail listDatasets (BigQuery bigQuery , Integer limit ) {
204212 Page <Dataset > datasetPage = config .showHiddenDatasets () ?
205- bigQuery .listDatasets (BigQuery .DatasetListOption .all ()) : bigQuery .listDatasets ();
213+ bigQuery .listDatasets (BigQuery .DatasetListOption .all ()) : bigQuery .listDatasets ();
206214 int countLimit = limit == null || limit <= 0 ? Integer .MAX_VALUE : limit ;
207215 int count = 0 ;
208216 BrowseDetail .Builder browseDetailBuilder = BrowseDetail .builder ();
@@ -233,31 +241,99 @@ private BigQuery getBigQuery(String project) throws IOException {
233241 return GCPUtils .getBigQuery (project , credentials );
234242 }
235243
236- private List <StructuredRecord > getTableData (BigQuery bigQuery , String datasetProject , String dataset , String table ,
237- int limit )
238- throws IOException {
239- String query =
240- String .format ("SELECT * FROM `%s.%s.%s` LIMIT %d" , datasetProject , dataset , table , limit );
244+ /**
245+ * Get the SQL query used to sample the table
246+ * @param tableName name of the table
247+ * @param limit limit on rows returned
248+ * @param sampleType sampling method
249+ * @param strata strata column (if given)
250+ * @param sessionID UUID
251+ * @return String
252+ * @throws IllegalArgumentException if no strata column is given for a stratified query
253+ */
254+ protected String getTableQuery (String tableName , int limit , SampleType sampleType , @ Nullable String strata ,
255+ String sessionID ) {
256+ switch (sampleType ) {
257+ case RANDOM :
258+ return String .format ("WITH table AS (\n " +
259+ " SELECT *, RAND() AS r_%s\n " +
260+ " FROM %s\n " +
261+ " WHERE RAND() < 2*%d/(SELECT COUNT(*) FROM %s)\n " +
262+ ")\n " +
263+ "SELECT * EXCEPT (r_%s)\n " +
264+ "FROM table\n " +
265+ "ORDER BY r_%s\n " +
266+ "LIMIT %d" ,
267+ sessionID , tableName , limit , tableName , sessionID , sessionID , limit );
268+ case STRATIFIED :
269+ if (strata == null ) {
270+ throw new IllegalArgumentException ("No strata column given." );
271+ }
272+ return String .format ("SELECT * EXCEPT (`sqn_%s`, `c_%s`)\n " +
273+ "FROM (\n " +
274+ "SELECT *, row_number() OVER (ORDER BY %s, RAND()) AS sqn_%s,\n " +
275+ "COUNT(*) OVER () as c_%s,\n " +
276+ "FROM %s\n " +
277+ ") %s\n " +
278+ "WHERE MOD(sqn_%s, CAST(c_%s / %d AS INT64)) = 1\n " +
279+ "ORDER BY %s\n " +
280+ "LIMIT %d" ,
281+ sessionID , sessionID , strata , sessionID , sessionID , tableName , tableName , sessionID ,
282+ sessionID , limit , strata , limit );
283+ default :
284+ return String .format ("SELECT * FROM %s LIMIT %d" , tableName , limit );
285+ }
286+ }
287+
288+ /**
289+ * Wait for job to complete or time out (if timeout is given)
290+ * @param bigQuery BigQuery client
291+ * @param query SQL query
292+ * @param timeoutMs timeout (if given)
293+ * @param id job ID
294+ * @return Job
295+ * @throws IOException if the job is interrupted
296+ */
297+ private Job waitForJob (BigQuery bigQuery , String query , @ Nullable Long timeoutMs , String id ) throws IOException {
298+
299+ // set up job
241300 QueryJobConfiguration queryConfig = QueryJobConfiguration .newBuilder (query ).build ();
242- String id = UUID .randomUUID ().toString ();
243301 JobId jobId = JobId .of (id );
244302 Job queryJob = bigQuery .create (JobInfo .newBuilder (queryConfig ).setJobId (jobId ).build ());
245- // Wait for the job to finish
303+
304+ // wait for job
246305 try {
247- queryJob = queryJob .waitFor ();
306+ if (timeoutMs == null ) {
307+ return queryJob .waitFor ();
308+ } else {
309+ return queryJob .waitFor (RetryOption .totalTimeout (Duration .ofMillis (timeoutMs )));
310+ }
248311 } catch (InterruptedException e ) {
249312 throw new IOException (String .format ("Query job %s interrupted." , id ), e );
250313 }
314+ }
251315
252- // check for errors
316+ /**
317+ * Retrieve the results of a SQL query
318+ * @param queryJob query job after completion or timeout
319+ * @param id job ID
320+ * @return List of structured records
321+ * @throws IOException if query encounters an error or times out
322+ */
323+ protected List <StructuredRecord > getQueryResult (@ Nullable Job queryJob , String id ) throws IOException {
324+
325+ // Check for errors
253326 if (queryJob == null ) {
254327 throw new IOException (String .format ("Job %s no longer exists." , id ));
328+ } else if (!queryJob .isDone ()) {
329+ queryJob .cancel ();
330+ throw new IOException (String .format ("Job %s timed out." , id ));
255331 } else if (queryJob .getStatus ().getError () != null ) {
256332 throw new IOException (String .format ("Failed to query table : %s" , queryJob .getStatus ().getError ().toString ()));
257333 }
258334
259335 // Get the results
260- TableResult result = null ;
336+ TableResult result ;
261337 try {
262338 result = queryJob .getQueryResults ();
263339 } catch (InterruptedException e ) {
@@ -266,7 +342,6 @@ private List<StructuredRecord> getTableData(BigQuery bigQuery, String datasetPro
266342 return BigQueryDataParser .parse (result );
267343 }
268344
269-
270345 @ Override
271346 public ConnectorSpec generateSpec (ConnectorContext context ,
272347 ConnectorSpecRequest connectorSpecRequest ) throws IOException {
@@ -297,6 +372,9 @@ public ConnectorSpec generateSpec(ConnectorContext context,
297372 .addRelatedPlugin (new PluginSpec (BigQuerySink .NAME , BatchSink .PLUGIN_TYPE , properties ))
298373 .addRelatedPlugin (new PluginSpec (BigQueryMultiSink .NAME , BatchSink .PLUGIN_TYPE , properties ))
299374 .addRelatedPlugin (new PluginSpec (BigQuerySQLEngine .NAME , BatchSQLEngine .PLUGIN_TYPE , properties ))
375+ .addSupportedSampleType (SampleType .RANDOM )
376+ .addSupportedSampleType (SampleType .STRATIFIED )
300377 .build ();
301378 }
302379}
380+
0 commit comments