Add fromDataset() operator for Seqera Platform dataset downloads

claude · claude · commit f93f3ab9d1f0 · 2025-11-12T02:56:07.000Z
Implements a new Channel.fromDataset() operator that downloads datasets from Seqera Platform. This feature enables workflows to fetch datasets directly from the platform API. Changes: - Add DatasetExplorer class for handling dataset download logic - Add Channel.fromDataset() factory method with support for dataset ID, version, and fileName parameters - Add comprehensive unit tests for DatasetExplorer - Use Tower access token authentication (TOWER_ACCESS_TOKEN env var or tower.accessToken config) Usage example: ch_input = Channel.fromList( samplesheetToList( fromDataset([fileName: 'samplesheet.csv'], params.input), "assets/schema_input.json" ) ) TODOs for future enhancements: - Support querying multiple datasets using list-datasets API - Support automatic version detection/latest version - Auto-detect fileName from dataset metadata Related to PR nextflow-io#6515 which added dataset upload functionality.
diff --git a/modules/nextflow/src/main/groovy/nextflow/Channel.groovy b/modules/nextflow/src/main/groovy/nextflow/Channel.groovy
@@ -36,6 +36,7 @@ import groovyx.gpars.dataflow.DataflowWriteChannel
 import groovyx.gpars.dataflow.operator.ControlMessage
 import groovyx.gpars.dataflow.operator.PoisonPill
 import nextflow.dag.NodeMarker
+import nextflow.datasource.DatasetExplorer
 import nextflow.datasource.SraExplorer
 import nextflow.exception.AbortOperationException
 import nextflow.extension.CH
@@ -608,4 +609,52 @@ class Channel  {
         fromPath0Future = future.exceptionally(Channel.&handlerException)
     }
 
+    /**
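+     * Download a dataset from Seqera Platform using default options and return its content as a String.
+     * NOTE: no fileName can be passed here and DatasetExplorer currently requires one, so this overload aborts until fileName auto-detection is implemented.
+     * @param datasetId The dataset identifier
+     * @return A String containing the dataset content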
+     */
+    static String fromDataset(String datasetId) {
+        return Channel.fromDataset(Collections.emptyMap(), datasetId)
+    }
+
+    /**
+     * Download a dataset from Seqera Platform and return its content as a String
+     *
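+     * @param opts Named options: fileName (currently required), version (defaults to '1'), endpoint (defaults to the tower.endpoint config value, else https://api.tower.nf)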
+     * @param datasetId The dataset identifier
+     * @return A String containing the dataset content
+     *
+     * @example
+     * <pre>
+     * // Basic usage - requires fileName parameter
+     * def csv = Channel.fromDataset([fileName: 'data.csv'], 'ds.123abc')
+     *
+     * // Specify version and endpoint
+     * def csv = Channel.fromDataset([
+     *     fileName: 'data.csv',
+     *     version: '2',
+     *     endpoint: 'https://api.tower.nf'
+     * ], 'ds.123abc')
+     *
+     * // Use with nf-schema for samplesheet parsing
+     * ch_input = Channel.fromList(
+     *     samplesheetToList(
+     *         fromDataset([fileName: 'samplesheet.csv'], params.input),
+     *         "assets/schema_input.json"
+     *     )
+     * )
+     * </pre>
+     *
+     * TODO: Support querying multiple datasets using list-datasets API
+     * TODO: Support automatic version detection/latest version
+     */
+    static String fromDataset(Map opts, String datasetId) {
+        CheckHelper.checkParams('fromDataset', opts, DatasetExplorer.PARAMS)
+        // the explorer resolves defaults and the access token, then performs the HTTP download
+        final DatasetExplorer downloader = new DatasetExplorer(datasetId, opts)
+        return downloader.apply()
+    }
+
 }
diff --git a/modules/nextflow/src/main/groovy/nextflow/datasource/DatasetExplorer.groovy b/modules/nextflow/src/main/groovy/nextflow/datasource/DatasetExplorer.groovy
@@ -0,0 +1,153 @@
+/*
+ * Copyright 2013-2024, Seqera Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package nextflow.datasource
+
+import groovy.json.JsonSlurper
+import groovy.transform.CompileStatic
+import groovy.util.logging.Slf4j
+import nextflow.Global
+import nextflow.Session
+import nextflow.exception.AbortOperationException
+import nextflow.util.SimpleHttpClient
+
+/**
+ * Download datasets from Seqera Platform
+ *
+ * @author Edmund Miller
+ */
+@Slf4j
+@CompileStatic
+class DatasetExplorer {
+
+    /** Names of the options accepted by Channel.fromDataset(), each mapped to its expected type */
+    static public Map PARAMS = [
+            endpoint: String,
+            version: String,
+            fileName: String
+    ]
+
+    private String datasetId
+    private String endpoint
+    private String version
+    private String fileName
+    private String accessToken
+
+    DatasetExplorer() {
+    }
+
+    DatasetExplorer(String datasetId, Map opts) {
+        this.datasetId = datasetId
+        init(opts)
+    }
+
+    DatasetExplorer setDatasetId(String datasetId) {
+        this.datasetId = datasetId
+        return this
+    }
+
+    /** Read endpoint, version and fileName from the options, applying defaults; a null map means no options */
+    protected void init(Map opts) {
+        this.endpoint = (opts?.endpoint as String) ?: getConfigEndpoint()
+        this.version = (opts?.version as String) ?: '1'
+        this.fileName = opts?.fileName as String
+    }
+
+    protected Map getEnv() {
+        System.getenv()
+    }
+
+    /** Endpoint from the tower.endpoint setting of the current session config, or https://api.tower.nf when unset */
+    protected String getConfigEndpoint() {
+        def session = Global.session as Session
+        def result = session?.config?.navigate('tower.endpoint')
+        if (!result)
+            result = 'https://api.tower.nf'
+        return result as String
+    }
+
+    /** Token from the tower.accessToken config setting, else the TOWER_ACCESS_TOKEN variable; aborts when both are missing */
+    protected String getAccessToken() {
+        def session = Global.session as Session
+        def token = session?.config?.navigate('tower.accessToken')
+        if (!token)
+            token = getEnv().get('TOWER_ACCESS_TOKEN')
+        if (!token)
+            throw new AbortOperationException("Missing Seqera Platform access token -- Make sure there's a variable TOWER_ACCESS_TOKEN in your environment or tower.accessToken in your config")
+        return token as String
+    }
+
+    /** Return the fileName option, aborting when absent. TODO: look it up from the dataset metadata instead */
+    protected String getDatasetFileName() {
+        if (fileName)
+            return fileName
+        throw new AbortOperationException("fileName parameter is required for fromDataset(). Future versions will support automatic detection.")
+    }
+
+    protected String getDownloadUrl() {
+        final name = getDatasetFileName()
+        // drop trailing slashes so an endpoint like 'https://host/api/' does not produce '//datasets'
+        final base = endpoint?.replaceAll('/+$', '')
+        // NOTE(review): URLEncoder form-encodes a space as '+' rather than '%20' -- confirm the API accepts this in a path
+        return "${base}/datasets/${datasetId}/v/${version}/n/${URLEncoder.encode(name, 'UTF-8')}"
+    }
+
+    /**
+     * Download the dataset and return its content decoded as UTF-8
+     */
+    String apply() {
+        if (!accessToken)
+            accessToken = getAccessToken()
+
+        final url = getDownloadUrl()
+        log.debug "Fetching dataset from: $url"
+
+        HttpURLConnection connection = null
+        try {
+            connection = new URL(url).openConnection() as HttpURLConnection
+            connection.setRequestMethod('GET')
+            connection.setRequestProperty('Authorization', "Bearer ${accessToken}")
+            connection.setRequestProperty('Accept', 'text/csv, text/plain, */*')
+
+            final responseCode = connection.responseCode
+            if (responseCode == 200) {
+                // decode explicitly as UTF-8 rather than with the JVM default charset
+                final content = connection.inputStream.getText('UTF-8')
+                log.trace "Dataset content received:\n${content?.take(500)}"
+                return content
+            }
+            else if (responseCode == 403) {
+                throw new AbortOperationException("Access forbidden to dataset ${datasetId} -- Check your permissions and access token")
+            }
+            else if (responseCode == 404) {
+                throw new AbortOperationException("Dataset ${datasetId} not found -- Check the dataset ID, version, and fileName")
+            }
+            else {
+                final errorMsg = connection.errorStream?.getText('UTF-8') ?: "HTTP ${responseCode}"
+                throw new AbortOperationException("Failed to download dataset ${datasetId}: ${errorMsg}")
+            }
+        }
+        catch (AbortOperationException e) {
+            throw e
+        }
+        catch (Exception e) {
+            throw new AbortOperationException("Error downloading dataset ${datasetId}: ${e.message}", e)
+        }
+        finally {
+            connection?.disconnect()
+        }
+    }
+}
diff --git a/modules/nextflow/src/test/groovy/nextflow/datasource/DatasetExplorerTest.groovy b/modules/nextflow/src/test/groovy/nextflow/datasource/DatasetExplorerTest.groovy
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2013-2024, Seqera Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package nextflow.datasource
+
+import nextflow.exception.AbortOperationException
+import spock.lang.Specification
+
+/**
+ * Test DatasetExplorer functionality
+ *
+ * @author Edmund Miller
+ */
+class DatasetExplorerTest extends Specification {
+
+    def 'should return download url' () {
+        given:
+        def explorer = Spy(DatasetExplorer)
+        explorer.datasetId = DATASET_ID
+        explorer.endpoint = ENDPOINT
+        explorer.version = VERSION
+
+        when:
+        explorer.fileName = FILENAME
+        def result = explorer.getDownloadUrl()
+
+        then: 'the URL has the form ENDPOINT/datasets/DATASET_ID/v/VERSION/n/FILENAME'
+        result == EXPECTED
+
+        where:
+        DATASET_ID  | ENDPOINT                  | VERSION | FILENAME      | EXPECTED
+        'ds.123abc' | 'https://api.tower.nf'    | '1'     | 'data.csv'    | 'https://api.tower.nf/datasets/ds.123abc/v/1/n/data.csv'
+        'ds.456def' | 'https://api.tower.nf'    | '2'     | 'sample.tsv'  | 'https://api.tower.nf/datasets/ds.456def/v/2/n/sample.tsv'
+        'ds.789ghi' | 'https://custom.api.com'  | '1'     | 'test.csv'    | 'https://custom.api.com/datasets/ds.789ghi/v/1/n/test.csv'
+    }
+
+    def 'should return download url with encoded filename' () {
+        given:
+        def explorer = Spy(DatasetExplorer)
+        explorer.datasetId = 'ds.123'
+        explorer.endpoint = 'https://api.tower.nf'
+        explorer.version = '1'
+        explorer.fileName = 'my file.csv'
+
+        when:
+        def result = explorer.getDownloadUrl()
+
+        then: 'URLEncoder form-encodes the space in the file name as a plus sign'
+        result == 'https://api.tower.nf/datasets/ds.123/v/1/n/my+file.csv'
+    }
+
+    def 'should use default endpoint' () {
+        given:
+        def explorer = Spy(DatasetExplorer)
+
+        when:
+        def result = explorer.getConfigEndpoint()
+
+        then: 'the built-in endpoint is returned; getConfigEndpoint() never reads the environment'
+        0 * explorer.getEnv()
+        result == 'https://api.tower.nf'
+    }
+
+    def 'should retrieve access token from environment' () {
+        given:
+        def explorer = Spy(DatasetExplorer)
+
+        when:
+        def result = explorer.getAccessToken()
+
+        then: 'the environment is consulted exactly once and supplies the token'
+        1 * explorer.getEnv() >> [TOWER_ACCESS_TOKEN: 'test_token_123']
+        result == 'test_token_123'
+    }
+
+    def 'should throw error when access token is missing' () {
+        given:
+        def explorer = Spy(DatasetExplorer)
+
+        when:
+        explorer.getAccessToken()
+
+        then: 'an environment without TOWER_ACCESS_TOKEN aborts the operation'
+        1 * explorer.getEnv() >> [:]
+        thrown(AbortOperationException)
+    }
+
+    def 'should throw error when fileName is missing' () {
+        given:
+        def explorer = new DatasetExplorer('ds.123', [:])
+
+        when:
+        explorer.getDatasetFileName()
+
+        then:
+        thrown(AbortOperationException)
+    }
+
+    def 'should use provided fileName' () {
+        given:
+        def explorer = new DatasetExplorer('ds.123', [fileName: 'test.csv'])
+
+        when:
+        def result = explorer.getDatasetFileName()
+
+        then:
+        result == 'test.csv'
+    }
+
+    def 'should initialize with options' () {
+        given:
+        def opts = [
+            endpoint: 'https://custom.api.com',
+            version: '2',
+            fileName: 'data.csv'
+        ]
+
+        when:
+        def explorer = new DatasetExplorer('ds.123', opts)
+
+        then: 'every explicitly supplied option overrides its default'
+        explorer.datasetId == 'ds.123'
+        explorer.endpoint == 'https://custom.api.com'
+        explorer.version == '2'
+        explorer.fileName == 'data.csv'
+    }
+
+    def 'should use default version' () {
+        given:
+        def explorer = new DatasetExplorer('ds.123', [:])
+
+        expect:
+        explorer.version == '1'
+    }
+}